diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index cb3705b06e..cd4a6786a7 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -173,6 +173,15 @@ set_target_properties(leveldb PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib
 add_library(brpc STATIC IMPORTED)
 set_target_properties(brpc PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib64/libbrpc.a)
 
+add_library(rocksdb STATIC IMPORTED)
+set_target_properties(rocksdb PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/librocksdb.a)
+
+add_library(librdkafka STATIC IMPORTED)
+set_target_properties(librdkafka PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/librdkafka.a)
+
+add_library(librdkafka_cpp STATIC IMPORTED)
+set_target_properties(librdkafka_cpp PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/librdkafka++.a)
+
 find_program(THRIFT_COMPILER thrift ${CMAKE_SOURCE_DIR}/bin)
 
 # llvm-config
@@ -430,7 +439,7 @@ include_directories(
 set(WL_START_GROUP "-Wl,--start-group")
 set(WL_END_GROUP "-Wl,--end-group")
 
-# Set doris libraries
+# Set Palo libraries
 set(DORIS_LINK_LIBS
     ${WL_START_GROUP}
     Agent
@@ -447,11 +456,15 @@ set(DORIS_LINK_LIBS
     PaloGen
     Webserver
     TestUtil
+    AES
     ${WL_END_GROUP}
 )
 
 # Set thirdparty libraries
 set(DORIS_DEPENDENCIES
+    rocksdb
+    librdkafka
+    librdkafka_cpp
     lzo
     snappy
     ${Boost_LIBRARIES}
@@ -477,7 +490,7 @@ set(DORIS_DEPENDENCIES
     leveldb
 )
 
-# Add all external dependencies. They should come after the doris libs.
+# Add all external dependencies. They should come after the palo libs.
 # static link gcc's lib
 set(DORIS_LINK_LIBS ${DORIS_LINK_LIBS}
     ${DORIS_DEPENDENCIES}
@@ -542,6 +555,8 @@ add_subdirectory(${SRC_DIR}/exprs)
 add_subdirectory(${SRC_DIR}/udf)
 add_subdirectory(${SRC_DIR}/runtime)
 add_subdirectory(${SRC_DIR}/testutil)
+add_subdirectory(${SRC_DIR}/aes)
+add_subdirectory(${SRC_DIR}/tools)
 
 # Utility CMake function to make specifying tests and benchmarks less verbose
 FUNCTION(ADD_BE_TEST TEST_NAME)
diff --git a/be/src/agent/agent_server.cpp b/be/src/agent/agent_server.cpp
index a3adba9144..88e12f00ef 100644
--- a/be/src/agent/agent_server.cpp
+++ b/be/src/agent/agent_server.cpp
@@ -33,7 +33,6 @@
 #include "gen_cpp/MasterService_types.h"
 #include "gen_cpp/Status_types.h"
 #include "olap/utils.h"
-#include "olap/command_executor.h"
 #include "runtime/exec_env.h"
 #include "runtime/etl_job_mgr.h"
 #include "util/debug_util.h"
@@ -55,21 +54,17 @@ AgentServer::AgentServer(ExecEnv* exec_env,
         _exec_env(exec_env),
         _master_info(master_info),
         _topic_subscriber(new TopicSubscriber()) {
-
-    // clean dpp download dir
-    _command_executor = new CommandExecutor();
-    vector root_paths_info;
-    _command_executor->get_all_root_path_info(&root_paths_info, false);
-    for (auto root_path_info: root_paths_info) {
+
+    for (auto& path : exec_env->store_paths()) {
         try {
-            string dpp_download_path_str = root_path_info.path + DPP_PREFIX;
+            string dpp_download_path_str = path.path + DPP_PREFIX;
             boost::filesystem::path dpp_download_path(dpp_download_path_str);
             if (boost::filesystem::exists(dpp_download_path)) {
                 boost::filesystem::remove_all(dpp_download_path);
             }
         } catch (...) {
-            OLAP_LOG_WARNING("boost exception when remove dpp download path. [path='%s']",
-                    root_path_info.path.c_str());
+            LOG(WARNING) << "boost exception when remove dpp download path. 
path=" + << path.path; } } @@ -93,6 +88,18 @@ AgentServer::AgentServer(ExecEnv* exec_env, TaskWorkerPool::TaskWorkerType::PUSH, _exec_env, master_info); + _publish_version_workers = new TaskWorkerPool( + TaskWorkerPool::TaskWorkerType::PUBLISH_VERSION, + _exec_env, + master_info); + _clear_alter_task_workers = new TaskWorkerPool( + TaskWorkerPool::TaskWorkerType::CLEAR_ALTER_TASK, + _exec_env, + master_info); + _clear_transaction_task_workers = new TaskWorkerPool( + TaskWorkerPool::TaskWorkerType::CLEAR_TRANSACTION_TASK, + exec_env, + master_info); _delete_workers = new TaskWorkerPool( TaskWorkerPool::TaskWorkerType::DELETE, _exec_env, @@ -149,10 +156,17 @@ AgentServer::AgentServer(ExecEnv* exec_env, TaskWorkerPool::TaskWorkerType::MOVE, _exec_env, master_info); + _recover_tablet_workers = new TaskWorkerPool( + TaskWorkerPool::TaskWorkerType::RECOVER_TABLET, + _exec_env, + master_info); #ifndef BE_TEST _create_table_workers->start(); _drop_table_workers->start(); _push_workers->start(); + _publish_version_workers->start(); + _clear_alter_task_workers->start(); + _clear_transaction_task_workers->start(); _delete_workers->start(); _alter_table_workers->start(); _clone_workers->start(); @@ -167,6 +181,7 @@ AgentServer::AgentServer(ExecEnv* exec_env, _make_snapshot_workers->start(); _release_snapshot_workers->start(); _move_dir_workers->start(); + _recover_tablet_workers->start(); // Add subscriber here and register listeners TopicListener* user_resource_listener = new UserResourceListener(exec_env, master_info); LOG(INFO) << "Register user resource listener"; @@ -175,9 +190,6 @@ AgentServer::AgentServer(ExecEnv* exec_env, } AgentServer::~AgentServer() { - if (_command_executor != NULL) { - delete _command_executor; - } if (_create_table_workers != NULL) { delete _create_table_workers; } @@ -187,6 +199,15 @@ AgentServer::~AgentServer() { if (_push_workers != NULL) { delete _push_workers; } + if (_publish_version_workers != NULL) { + delete _publish_version_workers; + } + if (_clear_alter_task_workers != NULL) { + delete _clear_alter_task_workers; + } + if (_clear_transaction_task_workers != NULL) { + delete _clear_transaction_task_workers; + } if (_delete_workers != NULL) { delete _delete_workers; } @@ -226,6 +247,9 @@ AgentServer::~AgentServer() { if (_move_dir_workers!= NULL) { delete _move_dir_workers; } + if (_recover_tablet_workers != NULL) { + delete _recover_tablet_workers; + } if (_release_snapshot_workers != NULL) { delete _release_snapshot_workers; } @@ -270,6 +294,7 @@ void AgentServer::submit_tasks( status_code = TStatusCode::ANALYSIS_ERROR; } break; + case TTaskType::REALTIME_PUSH: case TTaskType::PUSH: if (task.__isset.push_req) { if (task.push_req.push_type == TPushType::LOAD @@ -284,6 +309,27 @@ void AgentServer::submit_tasks( status_code = TStatusCode::ANALYSIS_ERROR; } break; + case TTaskType::PUBLISH_VERSION: + if (task.__isset.publish_version_req) { + _publish_version_workers->submit_task(task); + } else { + status_code = TStatusCode::ANALYSIS_ERROR; + } + break; + case TTaskType::CLEAR_ALTER_TASK: + if (task.__isset.clear_alter_task_req) { + _clear_alter_task_workers->submit_task(task); + } else { + status_code = TStatusCode::ANALYSIS_ERROR; + } + break; + case TTaskType::CLEAR_TRANSACTION_TASK: + if (task.__isset.clear_transaction_task_req) { + _clear_transaction_task_workers->submit_task(task); + } else { + status_code = TStatusCode::ANALYSIS_ERROR; + } + break; case TTaskType::ROLLUP: case TTaskType::SCHEMA_CHANGE: if (task.__isset.alter_tablet_req) { @@ -355,6 +401,13 
@@ void AgentServer::submit_tasks( status_code = TStatusCode::ANALYSIS_ERROR; } break; + case TTaskType::RECOVER_TABLET: + if (task.__isset.recover_tablet_req) { + _recover_tablet_workers->submit_task(task); + } else { + status_code = TStatusCode::ANALYSIS_ERROR; + } + break; default: status_code = TStatusCode::ANALYSIS_ERROR; break; @@ -378,7 +431,7 @@ void AgentServer::make_snapshot(TAgentResult& return_value, string snapshot_path; OLAPStatus make_snapshot_status = - _command_executor->make_snapshot(snapshot_request, &snapshot_path); + _exec_env->olap_engine()->make_snapshot(snapshot_request, &snapshot_path); if (make_snapshot_status != OLAP_SUCCESS) { status_code = TStatusCode::RUNTIME_ERROR; OLAP_LOG_WARNING("make_snapshot failed. tablet_id: %ld, schema_hash: %ld, status: %d", @@ -387,15 +440,17 @@ void AgentServer::make_snapshot(TAgentResult& return_value, error_msgs.push_back("make_snapshot failed. status: " + boost::lexical_cast(make_snapshot_status)); } else { - OLAP_LOG_INFO("make_snapshot success. tablet_id: %ld, schema_hash: %ld, snapshot_path: %s", - snapshot_request.tablet_id, snapshot_request.schema_hash, - snapshot_path.c_str()); + LOG(INFO) << "make_snapshot success. tablet_id: " << snapshot_request.tablet_id + << " schema_hash: " << snapshot_request.schema_hash << " snapshot_path: " << snapshot_path; return_value.__set_snapshot_path(snapshot_path); } status.__set_error_msgs(error_msgs); status.__set_status_code(status_code); return_value.__set_status(status); + if (snapshot_request.__isset.allow_incremental_clone) { + return_value.__set_allow_incremental_clone(snapshot_request.allow_incremental_clone); + } } void AgentServer::release_snapshot(TAgentResult& return_value, const std::string& snapshot_path) { @@ -403,16 +458,14 @@ void AgentServer::release_snapshot(TAgentResult& return_value, const std::string TStatusCode::type status_code = TStatusCode::OK; OLAPStatus release_snapshot_status = - _command_executor->release_snapshot(snapshot_path); + _exec_env->olap_engine()->release_snapshot(snapshot_path); if (release_snapshot_status != OLAP_SUCCESS) { status_code = TStatusCode::RUNTIME_ERROR; - OLAP_LOG_WARNING("release_snapshot failed. snapshot_path: %s, status: %d", - snapshot_path.c_str(), release_snapshot_status); + LOG(WARNING) << "release_snapshot failed. snapshot_path: " << snapshot_path << ", status: " << release_snapshot_status; error_msgs.push_back("release_snapshot failed. status: " + boost::lexical_cast(release_snapshot_status)); } else { - OLAP_LOG_INFO("release_snapshot success. snapshot_path: %s, status: %d", - snapshot_path.c_str(), release_snapshot_status); + LOG(INFO) << "release_snapshot success. 
snapshot_path: " << snapshot_path << ", status: " << release_snapshot_status; } return_value.status.__set_error_msgs(error_msgs); diff --git a/be/src/agent/agent_server.h b/be/src/agent/agent_server.h index a52d60ab1a..9e546f230a 100644 --- a/be/src/agent/agent_server.h +++ b/be/src/agent/agent_server.h @@ -23,7 +23,6 @@ #include "agent/utils.h" #include "gen_cpp/AgentService_types.h" #include "gen_cpp/Types_types.h" -#include "olap/command_executor.h" #include "olap/olap_define.h" #include "olap/utils.h" #include "runtime/exec_env.h" @@ -91,11 +90,13 @@ public: private: ExecEnv* _exec_env; const TMasterInfo& _master_info; - CommandExecutor* _command_executor; TaskWorkerPool* _create_table_workers; TaskWorkerPool* _drop_table_workers; TaskWorkerPool* _push_workers; + TaskWorkerPool* _publish_version_workers; + TaskWorkerPool* _clear_alter_task_workers; + TaskWorkerPool* _clear_transaction_task_workers; TaskWorkerPool* _delete_workers; TaskWorkerPool* _alter_table_workers; TaskWorkerPool* _clone_workers; @@ -110,6 +111,7 @@ private: TaskWorkerPool* _make_snapshot_workers; TaskWorkerPool* _release_snapshot_workers; TaskWorkerPool* _move_dir_workers; + TaskWorkerPool* _recover_tablet_workers; DISALLOW_COPY_AND_ASSIGN(AgentServer); diff --git a/be/src/agent/cgroups_mgr.cpp b/be/src/agent/cgroups_mgr.cpp index 7ccc2ddcc4..55e50133a3 100644 --- a/be/src/agent/cgroups_mgr.cpp +++ b/be/src/agent/cgroups_mgr.cpp @@ -25,7 +25,8 @@ #include #include "boost/filesystem.hpp" #include "common/logging.h" -#include "olap/olap_rootpath.h" +#include "olap/store.h" +#include "olap/olap_engine.h" #include "runtime/exec_env.h" #include "runtime/load_path_mgr.h" @@ -185,29 +186,26 @@ AgentStatus CgroupsMgr::_config_disk_throttle(std::string user_name, } // add olap engine data path here - vector data_paths(0); - OLAPRootPath::get_instance()->get_table_data_path(&data_paths); + auto stores = OLAPEngine::get_instance()->get_stores(); // buld load data path, it is alreay in data path // _exec_env->load_path_mgr()->get_load_data_path(&data_paths); stringstream ctrl_cmd; - for (vector::iterator it = data_paths.begin(); - it != data_paths.end(); - ++it) { + for (auto store : stores) { // check disk type int64_t read_iops = hdd_read_iops; int64_t write_iops = hdd_write_iops; int64_t read_mbps = hdd_read_mbps; int64_t write_mbps = hdd_write_mbps; // if user set hdd not ssd, then use hdd for ssd - if (OLAPRootPath::is_ssd_disk(*it)) { + if (store->is_ssd_disk()) { read_iops = ssd_read_iops == -1 ? hdd_read_iops : ssd_read_iops; write_iops = ssd_write_iops == -1 ? hdd_write_iops : ssd_write_iops; read_mbps = ssd_read_mbps == -1 ? hdd_read_mbps : ssd_read_mbps; write_mbps = ssd_write_mbps == -1 ? 
hdd_write_mbps : ssd_write_mbps; } struct stat file_stat; - if (stat(it->c_str(), &file_stat) != 0) { + if (stat(store->path().c_str(), &file_stat) != 0) { continue; } int major_number = major(file_stat.st_dev); diff --git a/be/src/agent/heartbeat_server.cpp b/be/src/agent/heartbeat_server.cpp index d8a0bb480c..389e7662dc 100644 --- a/be/src/agent/heartbeat_server.cpp +++ b/be/src/agent/heartbeat_server.cpp @@ -22,7 +22,6 @@ #include "gen_cpp/Status_types.h" #include "common/status.h" -#include "olap/olap_rootpath.h" #include "olap/olap_engine.h" #include "olap/utils.h" #include "service/backend_options.h" @@ -38,11 +37,11 @@ namespace palo { HeartbeatServer::HeartbeatServer(TMasterInfo* master_info) : _master_info(master_info), _epoch(0) { - _olap_rootpath_instance = OLAPRootPath::get_instance(); + _olap_engine = OLAPEngine::get_instance(); } void HeartbeatServer::init_cluster_id() { - _master_info->cluster_id = _olap_rootpath_instance->effective_cluster_id(); + _master_info->cluster_id = _olap_engine->effective_cluster_id(); } void HeartbeatServer::heartbeat( @@ -85,17 +84,14 @@ Status HeartbeatServer::_heartbeat( if (_master_info->cluster_id == -1) { OLAP_LOG_INFO("get first heartbeat. update cluster id"); // write and update cluster id - OLAPStatus res = _olap_rootpath_instance->set_cluster_id(master_info.cluster_id); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to set cluster id. [res=%d]", res); + auto st = _olap_engine->set_cluster_id(master_info.cluster_id); + if (!st.ok()) { + LOG(WARNING) << "fail to set cluster id. status=" << st.get_error_msg(); return Status("fail to set cluster id."); } else { _master_info->cluster_id = master_info.cluster_id; - OLAP_LOG_INFO("record cluster id." - "host: %s, port: %d, cluster id: %d", - master_info.network_address.hostname.c_str(), - master_info.network_address.port, - master_info.cluster_id); + LOG(INFO) << "record cluster id. host: " << master_info.network_address.hostname + << ". port: " << master_info.network_address.port << ". cluster id: " << master_info.cluster_id; } } else { if (_master_info->cluster_id != master_info.cluster_id) { @@ -110,16 +106,13 @@ Status HeartbeatServer::_heartbeat( _master_info->network_address.hostname = master_info.network_address.hostname; _master_info->network_address.port = master_info.network_address.port; _epoch = master_info.epoch; - OLAP_LOG_INFO("master change, new master host: %s, port: %d, epoch: %ld", - _master_info->network_address.hostname.c_str(), - _master_info->network_address.port, - _epoch); + LOG(INFO) << "master change. new master host: " << _master_info->network_address.hostname + << ". port: " << _master_info->network_address.port << ". epoch: " << _epoch; } else { - OLAP_LOG_WARNING("epoch is not greater than local. ignore heartbeat." - "host: %s, port: %d, local epoch: %ld, received epoch: %ld", - _master_info->network_address.hostname.c_str(), - _master_info->network_address.port, - _epoch, master_info.epoch); + LOG(WARNING) << "epoch is not greater than local. ignore heartbeat. host: " + << _master_info->network_address.hostname + << " port: " << _master_info->network_address.port + << " local epoch: " << _epoch << " received epoch: " << master_info.epoch; return Status("epoch is not greater than local. ignore heartbeat."); } } @@ -127,7 +120,7 @@ Status HeartbeatServer::_heartbeat( if (master_info.__isset.token) { if (!_master_info->__isset.token) { _master_info->__set_token(master_info.token); - OLAP_LOG_INFO("get token. 
token: %s", _master_info->token.c_str()); + LOG(INFO) << "get token. token: " << _master_info->token; } else if (_master_info->token != master_info.token) { LOG(WARNING) << "invalid token. local_token:" << _master_info->token << ". token:" << master_info.token; diff --git a/be/src/agent/heartbeat_server.h b/be/src/agent/heartbeat_server.h index 9ebd2e52dd..45751d7672 100644 --- a/be/src/agent/heartbeat_server.h +++ b/be/src/agent/heartbeat_server.h @@ -22,13 +22,11 @@ #include "gen_cpp/HeartbeatService.h" #include "gen_cpp/Status_types.h" #include "olap/olap_define.h" -#include "olap/olap_rootpath.h" #include "runtime/exec_env.h" namespace palo { const uint32_t HEARTBEAT_INTERVAL = 10; - class OLAPEngine; class Status; @@ -53,7 +51,7 @@ private: const TMasterInfo& master_info); TMasterInfo* _master_info; - OLAPRootPath* _olap_rootpath_instance; + OLAPEngine* _olap_engine; int64_t _epoch; DISALLOW_COPY_AND_ASSIGN(HeartbeatServer); }; // class HeartBeatServer diff --git a/be/src/agent/pusher.cpp b/be/src/agent/pusher.cpp index d8b68763b4..febce3cdba 100644 --- a/be/src/agent/pusher.cpp +++ b/be/src/agent/pusher.cpp @@ -26,7 +26,6 @@ #include "agent/cgroups_mgr.h" #include "agent/file_downloader.h" #include "gen_cpp/AgentService_types.h" -#include "olap/command_executor.h" #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/olap_engine.h" @@ -39,16 +38,12 @@ using std::vector; namespace palo { -Pusher::Pusher(const TPushReq& push_req) : - _push_req(push_req) { - _command_executor = new CommandExecutor(); +Pusher::Pusher(OLAPEngine* engine, const TPushReq& push_req) : + _push_req(push_req), _engine(engine) { _download_status = PALO_SUCCESS; } Pusher::~Pusher() { - if (_command_executor != NULL) { - delete _command_executor; - } } AgentStatus Pusher::init() { @@ -60,8 +55,8 @@ AgentStatus Pusher::init() { } // Check replica exist - SmartOLAPTable olap_table; - olap_table = _command_executor->get_table( + OLAPTablePtr olap_table; + olap_table = _engine->get_table( _push_req.tablet_id, _push_req.schema_hash); if (olap_table.get() == NULL) { @@ -84,12 +79,12 @@ AgentStatus Pusher::init() { remote_full_path = _push_req.http_file_path; // Get local download path - OLAP_LOG_INFO("start get file. remote_full_path:%s", remote_full_path.c_str()); + LOG(INFO) << "start get file. remote_full_path: " << remote_full_path; string root_path = olap_table->storage_root_path_name(); status = _get_tmp_file_dir(root_path, &tmp_file_dir); if (PALO_SUCCESS != status) { - OLAP_LOG_WARNING("get local path failed. tmp file dir: %s", tmp_file_dir.c_str()); + LOG(WARNING) << "get local path failed. 
tmp file dir: " << tmp_file_dir; } } @@ -118,14 +113,14 @@ AgentStatus Pusher::_get_tmp_file_dir(const string& root_path, string* download_ boost::filesystem::path full_path(*download_path); if (!boost::filesystem::exists(full_path)) { - OLAP_LOG_INFO("download dir not exist: %s", download_path->c_str()); + LOG(INFO) << "download dir not exist: " << *download_path; boost::system::error_code error_code; boost::filesystem::create_directories(*download_path, error_code); if (0 != error_code) { status = PALO_ERROR; - OLAP_LOG_WARNING("create download dir failed.path: %s, error code: %d", - download_path->c_str(), error_code); + LOG(WARNING) << "create download dir failed.path: " + << *download_path << ", error code: " << error_code; } } @@ -155,10 +150,9 @@ AgentStatus Pusher::_download_file() { _downloader_param.remote_file_path.c_str(), _push_req.tablet_id, cost, _push_req.http_file_size, rate); } else { - OLAP_LOG_WARNING("down load file failed. remote_file=%s, tablet=%d, cost=%ld, " - "file size: %ld B", - _downloader_param.remote_file_path.c_str(), _push_req.tablet_id, cost, - _push_req.http_file_size); + LOG(WARNING) << "down load file failed. remote_file=" << _downloader_param.remote_file_path + << " tablet=" << _push_req.tablet_id + << " cost=" << cost << " file size: " << _push_req.http_file_size << " B"; } // todo check data length and mv name tmp @@ -262,10 +256,12 @@ AgentStatus Pusher::process(vector* tablet_infos) { if (status == PALO_SUCCESS) { // Load delta file time_t push_begin = time(NULL); - OLAPStatus push_status = _command_executor->push(_push_req, tablet_infos); + OLAPStatus push_status = _engine->push(_push_req, tablet_infos); time_t push_finish = time(NULL); OLAP_LOG_INFO("Push finish, cost time: %ld", push_finish - push_begin); - if (push_status != OLAPStatus::OLAP_SUCCESS) { + if (push_status == OLAPStatus::OLAP_ERR_PUSH_TRANSACTION_ALREADY_EXIST) { + status = PALO_PUSH_HAD_LOADED; + } else if (push_status != OLAPStatus::OLAP_SUCCESS) { status = PALO_ERROR; } } diff --git a/be/src/agent/pusher.h b/be/src/agent/pusher.h index 1cf36d931f..f944ac4a92 100644 --- a/be/src/agent/pusher.h +++ b/be/src/agent/pusher.h @@ -21,7 +21,7 @@ #include "agent/file_downloader.h" #include "agent/status.h" #include "gen_cpp/AgentService_types.h" -#include "olap/command_executor.h" +#include "gen_cpp/MasterService_types.h" #include "olap/olap_common.h" #include "olap/olap_define.h" @@ -29,10 +29,11 @@ namespace palo { const uint32_t MAX_RETRY = 3; const uint32_t DEFAULT_DOWNLOAD_TIMEOUT = 3600; +class OLAPEngine; class Pusher { public: - explicit Pusher(const TPushReq& push_req); + explicit Pusher(OLAPEngine* engine, const TPushReq& push_req); virtual ~Pusher(); // The initial function of pusher @@ -52,7 +53,7 @@ private: bool _is_init = false; TPushReq _push_req; FileDownloader::FileDownloaderParam _downloader_param; - CommandExecutor* _command_executor; + OLAPEngine* _engine; FileDownloader* _file_downloader; AgentStatus _download_status; diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index 7690bcc101..fc6e099156 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -35,6 +35,7 @@ #include "olap/olap_common.h" #include "olap/olap_engine.h" #include "olap/olap_table.h" +#include "olap/store.h" #include "olap/utils.h" #include "common/resource_tls.h" #include "common/status.h" @@ -60,6 +61,7 @@ namespace palo { const uint32_t DOWNLOAD_FILE_MAX_RETRY = 3; const uint32_t TASK_FINISH_MAX_RETRY = 3; const uint32_t 
PUSH_MAX_RETRY = 1; +const uint32_t PUBLISH_VERSION_MAX_RETRY = 3; const uint32_t REPORT_TASK_WORKER_COUNT = 1; const uint32_t REPORT_DISK_STATE_WORKER_COUNT = 1; const uint32_t REPORT_OLAP_TABLE_WORKER_COUNT = 1; @@ -69,15 +71,15 @@ const std::string HTTP_REQUEST_TOKEN_PARAM = "token="; const std::string HTTP_REQUEST_FILE_PARAM = "&file="; std::atomic_ulong TaskWorkerPool::_s_report_version(time(NULL) * 10000); -MutexLock TaskWorkerPool::_s_task_signatures_lock; -MutexLock TaskWorkerPool::_s_running_task_user_count_lock; +Mutex TaskWorkerPool::_s_task_signatures_lock; +Mutex TaskWorkerPool::_s_running_task_user_count_lock; map> TaskWorkerPool::_s_task_signatures; map> TaskWorkerPool::_s_running_task_user_count; map> TaskWorkerPool::_s_total_task_user_count; map TaskWorkerPool::_s_total_task_count; FrontendServiceClientCache TaskWorkerPool::_master_service_client_cache; -boost::mutex TaskWorkerPool::_disk_broken_lock; -boost::posix_time::time_duration TaskWorkerPool::_wait_duration; +std::mutex TaskWorkerPool::_disk_broken_lock; +std::chrono::seconds TaskWorkerPool::_wait_duration; TaskWorkerPool::TaskWorkerPool( const TaskWorkerType task_worker_type, @@ -88,7 +90,6 @@ TaskWorkerPool::TaskWorkerPool( _task_worker_type(task_worker_type) { _agent_utils = new AgentUtils(); _master_client = new MasterServerClient(_master_info, &_master_service_client_cache); - _command_executor = new CommandExecutor(); _env = env; _backend.__set_host(BackendOptions::get_localhost()); _backend.__set_be_port(config::be_port); @@ -104,10 +105,6 @@ TaskWorkerPool::~TaskWorkerPool() { delete _master_client; _master_client = NULL; } - if (_command_executor != NULL) { - delete _command_executor; - _command_executor = NULL; - } } void TaskWorkerPool::start() { @@ -122,10 +119,23 @@ void TaskWorkerPool::start() { _callback_function = _drop_table_worker_thread_callback; break; case TaskWorkerType::PUSH: + case TaskWorkerType::REALTIME_PUSH: _worker_count = config::push_worker_count_normal_priority + config::push_worker_count_high_priority; _callback_function = _push_worker_thread_callback; break; + case TaskWorkerType::PUBLISH_VERSION: + _worker_count = config::publish_version_worker_count; + _callback_function = _publish_version_worker_thread_callback; + break; + case TaskWorkerType::CLEAR_ALTER_TASK: + _worker_count = config::clear_alter_task_worker_count; + _callback_function = _clear_alter_task_worker_thread_callback; + break; + case TaskWorkerType::CLEAR_TRANSACTION_TASK: + _worker_count = config::clear_transaction_task_worker_count; + _callback_function = _clear_transaction_task_worker_thread_callback; + break; case TaskWorkerType::DELETE: _worker_count = config::delete_worker_count; _callback_function = _push_worker_thread_callback; @@ -155,12 +165,12 @@ void TaskWorkerPool::start() { _callback_function = _report_task_worker_thread_callback; break; case TaskWorkerType::REPORT_DISK_STATE: - _wait_duration = boost::posix_time::time_duration(0, 0, config::report_disk_state_interval_seconds, 0); + _wait_duration = std::chrono::seconds(config::report_disk_state_interval_seconds); _worker_count = REPORT_DISK_STATE_WORKER_COUNT; _callback_function = _report_disk_state_worker_thread_callback; break; case TaskWorkerType::REPORT_OLAP_TABLE: - _wait_duration = boost::posix_time::time_duration(0, 0, config::report_disk_state_interval_seconds, 0); + _wait_duration = std::chrono::seconds(config::report_disk_state_interval_seconds); _worker_count = REPORT_OLAP_TABLE_WORKER_COUNT; _callback_function = 
_report_olap_table_worker_thread_callback; break; @@ -184,6 +194,10 @@ void TaskWorkerPool::start() { _worker_count = 1; _callback_function = _move_dir_thread_callback; break; + case TaskWorkerType::RECOVER_TABLET: + _worker_count = 1; + _callback_function = _recover_tablet_thread_callback; + break; default: // pass break; @@ -208,7 +222,7 @@ void TaskWorkerPool::submit_task(const TAgentTaskRequest& task) { bool ret = _record_task_info(task_type, signature, user); if (ret == true) { { - lock_guard worker_thread_lock(_worker_thread_lock); + lock_guard worker_thread_lock(_worker_thread_lock); _tasks.push_back(task); _worker_thread_condition_lock.notify(); } @@ -220,17 +234,21 @@ bool TaskWorkerPool::_record_task_info( int64_t signature, const string& user) { bool ret = true; - lock_guard task_signatures_lock(_s_task_signatures_lock); + lock_guard task_signatures_lock(_s_task_signatures_lock); set& signature_set = _s_task_signatures[task_type]; + std::string task_name; + EnumToString(TTaskType, task_type, task_name); if (signature_set.count(signature) > 0) { - OLAP_LOG_INFO("type: %d, signature: %ld has exist. queue size: %d", - task_type, signature, signature_set.size()); + LOG(INFO) << "type: " << task_name << ", " + << "signature: " << signature << ", has been inserted." + << "queue size: " << signature_set.size(); ret = false; } else { signature_set.insert(signature); - OLAP_LOG_INFO("type: %d, signature: %ld insert success. queue size: %d", - task_type, signature, signature_set.size()); + LOG(INFO) << "type: " << task_name << ", " + << "signature: " << signature << ", has been inserted." + << "queue size: " << signature_set.size(); if (task_type == TTaskType::PUSH) { _s_total_task_user_count[task_type][user] += 1; _s_total_task_count[task_type] += 1; @@ -244,7 +262,7 @@ void TaskWorkerPool::_remove_task_info( const TTaskType::type task_type, int64_t signature, const string& user) { - lock_guard task_signatures_lock(_s_task_signatures_lock); + lock_guard task_signatures_lock(_s_task_signatures_lock); set& signature_set = _s_task_signatures[task_type]; signature_set.erase(signature); @@ -253,13 +271,16 @@ void TaskWorkerPool::_remove_task_info( _s_total_task_count[task_type] -= 1; { - lock_guard running_task_user_count_lock(_s_running_task_user_count_lock); + lock_guard running_task_user_count_lock(_s_running_task_user_count_lock); _s_running_task_user_count[task_type][user] -= 1; } } - OLAP_LOG_INFO("type: %d, signature: %ld has been erased. queue size: %d", - task_type, signature, signature_set.size()); + std::string task_name; + EnumToString(TTaskType, task_type, task_name); + LOG(INFO) << "type: " << task_name << ", " + << "signature: " << signature << ", has been erased." 
+ << "queue size: " << signature_set.size(); } void TaskWorkerPool::_spawn_callback_worker_thread(CALLBACK_FUNCTION callback_func) { @@ -343,7 +364,7 @@ uint32_t TaskWorkerPool::_get_next_task_index( float user_total_rate = 0; float user_running_rate = 0; { - lock_guard task_signatures_lock(_s_task_signatures_lock); + lock_guard task_signatures_lock(_s_task_signatures_lock); user_total_rate = _s_total_task_user_count[task.task_type][user] * 1.0 / _s_total_task_count[task.task_type]; user_running_rate = (_s_running_task_user_count[task.task_type][user] + 1) * 1.0 / @@ -382,7 +403,7 @@ uint32_t TaskWorkerPool::_get_next_task_index( } { - lock_guard running_task_user_count_lock(_s_running_task_user_count_lock); + lock_guard running_task_user_count_lock(_s_running_task_user_count_lock); _s_running_task_user_count[tasks[index].task_type][user] += 1; } return index; @@ -397,7 +418,7 @@ void* TaskWorkerPool::_create_table_worker_thread_callback(void* arg_this) { TAgentTaskRequest agent_task_req; TCreateTabletReq create_tablet_req; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -411,7 +432,8 @@ void* TaskWorkerPool::_create_table_worker_thread_callback(void* arg_this) { vector error_msgs; TStatus task_status; - OLAPStatus create_status = worker_pool_this->_command_executor->create_table(create_tablet_req); + OLAPStatus create_status = + worker_pool_this->_env->olap_engine()->create_table(create_tablet_req); if (create_status != OLAPStatus::OLAP_SUCCESS) { OLAP_LOG_WARNING("create table failed. status: %d, signature: %ld", create_status, agent_task_req.signature); @@ -448,7 +470,7 @@ void* TaskWorkerPool::_drop_table_worker_thread_callback(void* arg_this) { TAgentTaskRequest agent_task_req; TDropTabletReq drop_tablet_req; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -495,7 +517,7 @@ void* TaskWorkerPool::_alter_table_worker_thread_callback(void* arg_this) { TAgentTaskRequest agent_task_req; TAlterTabletReq alter_tablet_request; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -550,8 +572,10 @@ void TaskWorkerPool::_alter_table( process_name = "schema change"; break; default: - OLAP_LOG_WARNING("schema change type invalid. type: %d, signature: %ld.", - task_type, signature); + std::string task_name; + EnumToString(TTaskType, task_type, task_name); + LOG(WARNING) << "schema change type invalid. 
type: " << task_name << ", " + << "signature: " << signature; status = PALO_TASK_REQUEST_ERROR; break; } @@ -587,17 +611,17 @@ void TaskWorkerPool::_alter_table( } if (status == PALO_SUCCESS) { - if (alter_table_status == ALTER_TABLE_DONE + if (alter_table_status == ALTER_TABLE_FINISHED || alter_table_status == ALTER_TABLE_FAILED || alter_table_status == ALTER_TABLE_WAITING) { // Create rollup table OLAPStatus ret = OLAPStatus::OLAP_SUCCESS; switch (task_type) { case TTaskType::ROLLUP: - ret = _command_executor->create_rollup_table(alter_tablet_request); + ret = _env->olap_engine()->create_rollup_table(alter_tablet_request); break; case TTaskType::SCHEMA_CHANGE: - ret = _command_executor->schema_change(alter_tablet_request); + ret = _env->olap_engine()->schema_change(alter_tablet_request); break; default: // pass @@ -605,8 +629,7 @@ void TaskWorkerPool::_alter_table( } if (ret != OLAPStatus::OLAP_SUCCESS) { status = PALO_ERROR; - OLAP_LOG_WARNING("%s failed. signature: %ld, status: %d", - process_name.c_str(), signature, status); + LOG(WARNING) << process_name << " failed. signature: " << signature << " status: " << status; } } } @@ -614,7 +637,7 @@ void TaskWorkerPool::_alter_table( if (status == PALO_SUCCESS) { ++_s_report_version; - OLAP_LOG_INFO("%s finished. signature: %ld", process_name.c_str(), signature); + LOG(INFO) << process_name << " finished. signature: " << signature; } // Return result to fe @@ -646,7 +669,7 @@ void TaskWorkerPool::_alter_table( if (status == PALO_SUCCESS) { finish_task_request->__set_finish_tablet_infos(finish_tablet_infos); - OLAP_LOG_INFO("%s success. signature: %ld", process_name.c_str(), signature); + LOG(INFO) << process_name << " success. signature: " << signature; error_msgs.push_back(process_name + " success"); task_status.__set_status_code(TStatusCode::OK); } else if (status == PALO_TASK_REQUEST_ERROR) { @@ -655,7 +678,7 @@ void TaskWorkerPool::_alter_table( error_msgs.push_back("alter table request new tablet id or schema count invalid."); task_status.__set_status_code(TStatusCode::ANALYSIS_ERROR); } else { - OLAP_LOG_WARNING("%s failed. signature: %ld", process_name.c_str(), signature); + LOG(WARNING) << process_name << " failed. signature: " << signature; error_msgs.push_back(process_name + " failed"); error_msgs.push_back("status: " + _agent_utils->print_agent_status(status)); task_status.__set_status_code(TStatusCode::RUNTIME_ERROR); @@ -675,7 +698,7 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) { int32_t push_worker_count_high_priority = config::push_worker_count_high_priority; static uint32_t s_worker_count = 0; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); if (s_worker_count < push_worker_count_high_priority) { ++s_worker_count; priority = TPriority::HIGH; @@ -691,7 +714,7 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) { string user; int32_t index = 0; do { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -723,13 +746,12 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) { } #endif - OLAP_LOG_INFO("get push task. signature: %ld, user: %s, priority: %d", - agent_task_req.signature, user.c_str(), priority); - + LOG(INFO) << "get push task. 
signature: " << agent_task_req.signature + << " user: " << user << " priority: " << priority; vector tablet_infos; if (push_req.push_type == TPushType::LOAD || push_req.push_type == TPushType::LOAD_DELETE) { #ifndef BE_TEST - Pusher pusher(push_req); + Pusher pusher(worker_pool_this->_env->olap_engine(), push_req); status = pusher.init(); #else status = worker_pool_this->_pusher->init(); @@ -743,6 +765,12 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) { #else status = worker_pool_this->_pusher->process(&tablet_infos); #endif + if (status == PALO_PUSH_HAD_LOADED) { + OLAP_LOG_WARNING("transaction exists when realtime push, " + "but unfinished, do not report to fe, signature: %ld", + agent_task_req.signature); + break; // not retry any more + } // Internal error, need retry if (status == PALO_ERROR) { OLAP_LOG_WARNING("push internal error, need retry.signature: %ld", @@ -755,7 +783,7 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) { } } else if (push_req.push_type == TPushType::DELETE) { OLAPStatus delete_data_status = - worker_pool_this->_command_executor->delete_data(push_req, &tablet_infos); + worker_pool_this->_env->olap_engine()->delete_data(push_req, &tablet_infos); if (delete_data_status != OLAPStatus::OLAP_SUCCESS) { OLAP_LOG_WARNING("delete data failed. status: %d, signature: %ld", delete_data_status, agent_task_req.signature); @@ -765,6 +793,14 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) { status = PALO_TASK_REQUEST_ERROR; } +#ifndef BE_TEST + if (status == PALO_PUSH_HAD_LOADED) { + // remove the task and not return to fe + worker_pool_this->_remove_task_info( + agent_task_req.task_type, agent_task_req.signature, user); + continue; + } +#endif // Return result to fe vector error_msgs; TStatus task_status; @@ -811,6 +847,173 @@ void* TaskWorkerPool::_push_worker_thread_callback(void* arg_this) { return (void*)0; } +void* TaskWorkerPool::_publish_version_worker_thread_callback(void* arg_this) { + + TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this; +#ifndef BE_TEST + while (true) { +#endif + TAgentTaskRequest agent_task_req; + TPublishVersionRequest publish_version_req; + { + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + while (worker_pool_this->_tasks.empty()) { + worker_pool_this->_worker_thread_condition_lock.wait(); + } + + agent_task_req = worker_pool_this->_tasks.front(); + publish_version_req = agent_task_req.publish_version_req; + worker_pool_this->_tasks.pop_front(); + } + OLAP_LOG_INFO("get publish version task, signature: %ld", agent_task_req.signature); + + TStatusCode::type status_code = TStatusCode::OK; + vector error_msgs; + TStatus task_status; + + vector error_tablet_ids; + uint32_t retry_time = 0; + OLAPStatus res = OLAP_SUCCESS; + while (retry_time < PUBLISH_VERSION_MAX_RETRY) { + error_tablet_ids.clear(); + res = worker_pool_this->_env->olap_engine()->publish_version( + publish_version_req, &error_tablet_ids); + if (res == OLAP_SUCCESS) { + break; + } else { + OLAP_LOG_WARNING("publish version error, retry. " + "[transaction_id=%ld, error_tablet_size=%d]", + publish_version_req.transaction_id, error_tablet_ids.size()); + retry_time += 1; + sleep(1); + } + } + + TFinishTaskRequest finish_task_request; + if (res != OLAP_SUCCESS) { + status_code = TStatusCode::RUNTIME_ERROR; + OLAP_LOG_WARNING("publish version failed. 
signature: %ld", agent_task_req.signature); + error_msgs.push_back("publish version failed"); + finish_task_request.__set_error_tablet_ids(error_tablet_ids); + } else { + OLAP_LOG_INFO("publish_version success. signature: %ld", agent_task_req.signature); + } + + task_status.__set_status_code(status_code); + task_status.__set_error_msgs(error_msgs); + + finish_task_request.__set_task_status(task_status); + finish_task_request.__set_backend(worker_pool_this->_backend); + finish_task_request.__set_task_type(agent_task_req.task_type); + finish_task_request.__set_signature(agent_task_req.signature); + + worker_pool_this->_finish_task(finish_task_request); + worker_pool_this->_remove_task_info(agent_task_req.task_type, agent_task_req.signature, ""); +#ifndef BE_TEST + } +#endif + return (void*)0; +} + +void* TaskWorkerPool::_clear_alter_task_worker_thread_callback(void* arg_this) { + + TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this; +#ifndef BE_TEST + while (true) { +#endif + TAgentTaskRequest agent_task_req; + TClearAlterTaskRequest clear_alter_task_req; + { + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + while (worker_pool_this->_tasks.empty()) { + worker_pool_this->_worker_thread_condition_lock.wait(); + } + + agent_task_req = worker_pool_this->_tasks.front(); + clear_alter_task_req = agent_task_req.clear_alter_task_req; + worker_pool_this->_tasks.pop_front(); + } + OLAP_LOG_INFO("get clear alter task task, signature: %ld", agent_task_req.signature); + + TStatusCode::type status_code = TStatusCode::OK; + vector error_msgs; + TStatus task_status; + + OLAPStatus clear_status = worker_pool_this->_env->olap_engine()-> + clear_alter_task(clear_alter_task_req.tablet_id, clear_alter_task_req.schema_hash); + if (clear_status != OLAPStatus::OLAP_SUCCESS) { + OLAP_LOG_WARNING("clear alter task failed. [signature: %ld status=%d]", + agent_task_req.signature, clear_status); + error_msgs.push_back("clear alter task failed"); + status_code = TStatusCode::RUNTIME_ERROR; + } else { + OLAP_LOG_INFO("clear alter task success. 
signature: %ld", agent_task_req.signature); + } + + task_status.__set_status_code(status_code); + task_status.__set_error_msgs(error_msgs); + + TFinishTaskRequest finish_task_request; + finish_task_request.__set_task_status(task_status); + finish_task_request.__set_backend(worker_pool_this->_backend); + finish_task_request.__set_task_type(agent_task_req.task_type); + finish_task_request.__set_signature(agent_task_req.signature); + + worker_pool_this->_finish_task(finish_task_request); + worker_pool_this->_remove_task_info(agent_task_req.task_type, agent_task_req.signature, ""); +#ifndef BE_TEST + } +#endif + return (void*)0; +} + +void* TaskWorkerPool::_clear_transaction_task_worker_thread_callback(void* arg_this) { + + TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this; +#ifndef BE_TEST + while (true) { +#endif + TAgentTaskRequest agent_task_req; + TClearTransactionTaskRequest clear_transaction_task_req; + { + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + while (worker_pool_this->_tasks.empty()) { + worker_pool_this->_worker_thread_condition_lock.wait(); + } + + agent_task_req = worker_pool_this->_tasks.front(); + clear_transaction_task_req = agent_task_req.clear_transaction_task_req; + worker_pool_this->_tasks.pop_front(); + } + OLAP_LOG_INFO("get clear transaction task task, signature: %ld, transaction_id: %ld", + agent_task_req.signature, clear_transaction_task_req.transaction_id); + + TStatusCode::type status_code = TStatusCode::OK; + vector error_msgs; + TStatus task_status; + + worker_pool_this->_env->olap_engine()->clear_transaction_task( + clear_transaction_task_req.transaction_id, clear_transaction_task_req.partition_id); + OLAP_LOG_INFO("finish to clear transaction task. signature: %ld, transaction_id: %ld", + agent_task_req.signature, clear_transaction_task_req.transaction_id); + + task_status.__set_status_code(status_code); + task_status.__set_error_msgs(error_msgs); + + TFinishTaskRequest finish_task_request; + finish_task_request.__set_task_status(task_status); + finish_task_request.__set_backend(worker_pool_this->_backend); + finish_task_request.__set_task_type(agent_task_req.task_type); + finish_task_request.__set_signature(agent_task_req.signature); + + worker_pool_this->_finish_task(finish_task_request); + worker_pool_this->_remove_task_info(agent_task_req.task_type, agent_task_req.signature, ""); +#ifndef BE_TEST + } +#endif + return (void*)0; +} + void* TaskWorkerPool::_clone_worker_thread_callback(void* arg_this) { TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this; @@ -822,7 +1025,7 @@ void* TaskWorkerPool::_clone_worker_thread_callback(void* arg_this) { TCloneReq clone_req; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -838,87 +1041,138 @@ void* TaskWorkerPool::_clone_worker_thread_callback(void* arg_this) { OLAP_LOG_INFO("get clone task. signature: %ld", agent_task_req.signature); vector error_msgs; + string src_file_path; + TBackend src_host; // Check local tablet exist or not - SmartOLAPTable tablet = - worker_pool_this->_command_executor->get_table( + OLAPTablePtr tablet = + worker_pool_this->_env->olap_engine()->get_table( clone_req.tablet_id, clone_req.schema_hash); if (tablet.get() != NULL) { - OLAP_LOG_INFO("clone tablet exist yet. 
tablet_id: %ld, schema_hash: %ld, " - "signature: %ld", - clone_req.tablet_id, clone_req.schema_hash, - agent_task_req.signature); - error_msgs.push_back("clone tablet exist yet."); - status = PALO_CREATE_TABLE_EXIST; - } + OLAP_LOG_INFO("clone tablet exist yet, begin to incremental clone. " + "signature: %ld, tablet_id: %ld, schema_hash: %ld, " + "committed_version: %d", agent_task_req.signature, + clone_req.tablet_id, clone_req.schema_hash, clone_req.committed_version); - // Get local disk from olap - string local_shard_root_path; - if (status == PALO_SUCCESS) { - OLAPStatus olap_status = worker_pool_this->_command_executor->obtain_shard_path( - clone_req.storage_medium, &local_shard_root_path); + // try to incremental clone + vector missing_versions; + string local_data_path = worker_pool_this->_env->olap_engine()-> + get_info_before_incremental_clone(tablet, clone_req.committed_version, &missing_versions); + + bool allow_incremental_clone = false; + status = worker_pool_this->_clone_copy(clone_req, + agent_task_req.signature, + local_data_path, + &src_host, + &src_file_path, + &error_msgs, + &missing_versions, + &allow_incremental_clone); + if (status == PALO_SUCCESS) { + OLAPStatus olap_status = worker_pool_this->_env->olap_engine()-> + finish_clone(tablet, local_data_path, clone_req.committed_version, allow_incremental_clone); + if (olap_status != OLAP_SUCCESS) { + LOG(WARNING) << "failed to finish incremental clone. [table=" << tablet->full_name() + << " res=" << olap_status << "]"; + error_msgs.push_back("incremental clone error."); + status = PALO_ERROR; + } + } else { + // begin to full clone if incremental failed + LOG(INFO) << "begin to full clone. [table=" << tablet->full_name(); + status = worker_pool_this->_clone_copy(clone_req, + agent_task_req.signature, + local_data_path, + &src_host, + &src_file_path, + &error_msgs, + NULL, NULL); + if (status == PALO_SUCCESS) { + LOG(INFO) << "download successfully when full clone. [table=" << tablet->full_name() + << " src_host=" << src_host.host << " src_file_path=" << src_file_path + << " local_data_path=" << local_data_path << "]"; + + OLAPStatus olap_status = worker_pool_this->_env->olap_engine()-> + finish_clone(tablet, local_data_path, clone_req.committed_version, false); + + if (olap_status != OLAP_SUCCESS) { + LOG(WARNING) << "fail to finish full clone. [table=" << tablet->full_name() + << " res=" << olap_status << "]"; + error_msgs.push_back("full clone error."); + status = PALO_ERROR; + } + } + } + } else { + + // Get local disk from olap + string local_shard_root_path; + OlapStore* store = nullptr; + OLAPStatus olap_status = worker_pool_this->_env->olap_engine()->obtain_shard_path( + clone_req.storage_medium, &local_shard_root_path, &store); if (olap_status != OLAP_SUCCESS) { OLAP_LOG_WARNING("clone get local root path failed. 
signature: %ld", agent_task_req.signature); error_msgs.push_back("clone get local root path failed."); status = PALO_ERROR; } - } - string src_file_path; - TBackend src_host; - if (status == PALO_SUCCESS) { - status = worker_pool_this->_clone_copy( - clone_req, - agent_task_req.signature, - local_shard_root_path, - &src_host, - &src_file_path, - &error_msgs); - } - - if (status == PALO_SUCCESS) { - OLAP_LOG_INFO("clone copy done, src_host: %s, src_file_path: %s", - src_host.host.c_str(), src_file_path.c_str()); - // Load header - OLAPStatus load_header_status = - worker_pool_this->_command_executor->load_header( - local_shard_root_path, - clone_req.tablet_id, - clone_req.schema_hash); - if (load_header_status != OLAP_SUCCESS) { - OLAP_LOG_WARNING("load header failed. local_shard_root_path: %s, schema_hash: %d, " - "status: %d, signature: %ld", - local_shard_root_path.c_str(), clone_req.schema_hash, - load_header_status, agent_task_req.signature); - error_msgs.push_back("load header failed."); - status = PALO_ERROR; + if (status == PALO_SUCCESS) { + stringstream tablet_dir_stream; + tablet_dir_stream << local_shard_root_path + << "/" << clone_req.tablet_id + << "/" << clone_req.schema_hash; + status = worker_pool_this->_clone_copy(clone_req, + agent_task_req.signature, + tablet_dir_stream.str(), + &src_host, + &src_file_path, + &error_msgs, + NULL, NULL); + } + + if (status == PALO_SUCCESS) { + LOG(INFO) << "clone copy done. src_host: " << src_host.host + << " src_file_path: " << src_file_path; + // Load header + OLAPStatus load_header_status = + worker_pool_this->_env->olap_engine()->load_header( + store, + local_shard_root_path, + clone_req.tablet_id, + clone_req.schema_hash); + if (load_header_status != OLAP_SUCCESS) { + LOG(WARNING) << "load header failed. local_shard_root_path: '" << local_shard_root_path + << "' schema_hash: " << clone_req.schema_hash << ". status: " << load_header_status + << ". signature: " << agent_task_req.signature; + error_msgs.push_back("load header failed."); + status = PALO_ERROR; + } } - } #ifndef BE_TEST - // Clean useless dir, if failed, ignore it. - if (status != PALO_SUCCESS && status != PALO_CREATE_TABLE_EXIST) { - stringstream local_data_path_stream; - local_data_path_stream << local_shard_root_path - << "/" << clone_req.tablet_id - << "/" << clone_req.schema_hash; - string local_data_path = local_data_path_stream.str(); - OLAP_LOG_INFO("clone failed. want to delete local dir: %s, signature: %ld", - local_data_path.c_str(), agent_task_req.signature); - try { - boost::filesystem::path local_path(local_data_path); - if (boost::filesystem::exists(local_path)) { - boost::filesystem::remove_all(local_path); + // Clean useless dir, if failed, ignore it. + if (status != PALO_SUCCESS && status != PALO_CREATE_TABLE_EXIST) { + stringstream local_data_path_stream; + local_data_path_stream << local_shard_root_path + << "/" << clone_req.tablet_id; + string local_data_path = local_data_path_stream.str(); + LOG(INFO) << "clone failed. want to delete local dir: " << local_data_path + << ". signature: " << agent_task_req.signature; + try { + boost::filesystem::path local_path(local_data_path); + if (boost::filesystem::exists(local_path)) { + boost::filesystem::remove_all(local_path); + } + } catch (boost::filesystem::filesystem_error e) { + // Ignore the error, OLAP will delete it + OLAP_LOG_WARNING("clone delete useless dir failed. 
" + "error: %s, local dir: %s, signature: %ld", + e.what(), local_data_path.c_str(), + agent_task_req.signature); } - } catch (boost::filesystem::filesystem_error e) { - // Ignore the error, OLAP will delete it - OLAP_LOG_WARNING("clone delete useless dir failed. " - "error: %s, local dir: %s, signature: %ld", - e.what(), local_data_path.c_str(), - agent_task_req.signature); } - } #endif + } // Get clone tablet info vector tablet_infos; @@ -1015,11 +1269,12 @@ AgentStatus TaskWorkerPool::_clone_copy( const string& local_data_path, TBackend* src_host, string* src_file_path, - vector* error_msgs) { + vector* error_msgs, + const vector* missing_versions, + bool* allow_incremental_clone) { AgentStatus status = PALO_SUCCESS; std::string token = _master_info.token; - for (auto src_backend : clone_req.src_backends) { stringstream http_host_stream; http_host_stream << "http://" << src_backend.host << ":" << src_backend.http_port; @@ -1032,10 +1287,19 @@ AgentStatus TaskWorkerPool::_clone_copy( TAgentResult make_snapshot_result; status = PALO_SUCCESS; - OLAP_LOG_INFO("pre make snapshot. backend_ip: %s", src_host->host.c_str()); + LOG(INFO) << "pre make snapshot. backend_ip: " << src_host->host; TSnapshotRequest snapshot_request; snapshot_request.__set_tablet_id(clone_req.tablet_id); snapshot_request.__set_schema_hash(clone_req.schema_hash); + if (missing_versions != NULL) { + // TODO: missing version composed of singleton delta. + // if not, this place should be rewrote. + vector snapshot_versions; + for (Version version : *missing_versions) { + snapshot_versions.push_back(version.first); + } + snapshot_request.__set_missing_version(snapshot_versions); + } #ifndef BE_TEST agent_client.make_snapshot( snapshot_request, @@ -1046,16 +1310,20 @@ AgentStatus TaskWorkerPool::_clone_copy( &make_snapshot_result); #endif + if (make_snapshot_result.__isset.allow_incremental_clone) { + // During upgrading, some BE nodes still be installed an old previous old. + // which incremental clone is not ready in those nodes. + // should add a symbol to indicate it. + *allow_incremental_clone = make_snapshot_result.allow_incremental_clone; + } if (make_snapshot_result.status.status_code == TStatusCode::OK) { if (make_snapshot_result.__isset.snapshot_path) { *src_file_path = make_snapshot_result.snapshot_path; if (src_file_path->at(src_file_path->length() - 1) != '/') { src_file_path->append("/"); } - OLAP_LOG_INFO("make snapshot success. backend_ip: %s, src_file_path: %s," - " signature: %ld", - src_host->host.c_str(), src_file_path->c_str(), - signature); + LOG(INFO) << "make snapshot success. backend_ip: " << src_host->host << ". src_file_path: " + << *src_file_path << ". signature: " << signature; } else { OLAP_LOG_WARNING("clone make snapshot success, " "but get src file path failed. signature: %ld", @@ -1064,11 +1332,9 @@ AgentStatus TaskWorkerPool::_clone_copy( continue; } } else { - OLAP_LOG_WARNING("make snapshot failed. tablet_id: %ld, schema_hash: %ld, " - "backend_ip: %s, backend_port: %d, signature: %ld", - clone_req.tablet_id, clone_req.schema_hash, - src_host->host.c_str(), src_host->be_port, - signature); + LOG(WARNING) << "make snapshot failed. tablet_id: " << clone_req.tablet_id + << ". schema_hash: " << clone_req.schema_hash << ". backend_ip: " << src_host->host + << ". backend_port: " << src_host->be_port << ". signature: " << signature; error_msgs->push_back("make snapshot failed. 
backend_ip: " + src_host->host); status = PALO_ERROR; continue; @@ -1082,9 +1348,7 @@ AgentStatus TaskWorkerPool::_clone_copy( src_file_full_path_stream << *src_file_path << "/" << clone_req.tablet_id << "/" << clone_req.schema_hash << "/"; - local_file_full_path_stream << local_data_path - << "/" << clone_req.tablet_id - << "/" << clone_req.schema_hash << "/"; + local_file_full_path_stream << local_data_path << "/"; } string src_file_full_path = src_file_full_path_stream.str(); string local_file_full_path = local_file_full_path_stream.str(); @@ -1324,8 +1588,8 @@ AgentStatus TaskWorkerPool::_clone_copy( &release_snapshot_result); #endif if (release_snapshot_result.status.status_code != TStatusCode::OK) { - OLAP_LOG_WARNING("release snapshot failed. src_file_path: %s, signature: %ld", - src_file_path->c_str(), signature); + LOG(WARNING) << "release snapshot failed. src_file_path: " << *src_file_path + << ". signature: " << signature; } if (status == PALO_SUCCESS) { @@ -1346,7 +1610,7 @@ void* TaskWorkerPool::_storage_medium_migrate_worker_thread_callback(void* arg_t TAgentTaskRequest agent_task_req; TStorageMediumMigrateReq storage_medium_migrate_req; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -1361,7 +1625,10 @@ void* TaskWorkerPool::_storage_medium_migrate_worker_thread_callback(void* arg_t TStatus task_status; OLAPStatus res = OLAPStatus::OLAP_SUCCESS; - res = worker_pool_this->_command_executor->storage_medium_migrate(storage_medium_migrate_req); + res = worker_pool_this->_env->olap_engine()->storage_medium_migrate( + storage_medium_migrate_req.tablet_id, + storage_medium_migrate_req.schema_hash, + storage_medium_migrate_req.storage_medium); if (res != OLAPStatus::OLAP_SUCCESS) { OLAP_LOG_WARNING("storage media migrate failed. status: %d, signature: %ld", res, agent_task_req.signature); @@ -1397,7 +1664,7 @@ void* TaskWorkerPool::_cancel_delete_data_worker_thread_callback(void* arg_this) TAgentTaskRequest agent_task_req; TCancelDeleteDataReq cancel_delete_data_req; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -1415,7 +1682,7 @@ void* TaskWorkerPool::_cancel_delete_data_worker_thread_callback(void* arg_this) OLAPStatus cancel_delete_data_status = OLAPStatus::OLAP_SUCCESS; cancel_delete_data_status = - worker_pool_this->_command_executor->cancel_delete(cancel_delete_data_req); + worker_pool_this->_env->olap_engine()->cancel_delete(cancel_delete_data_req); if (cancel_delete_data_status != OLAPStatus::OLAP_SUCCESS) { OLAP_LOG_WARNING("cancel delete data failed. 
statusta: %d, signature: %ld", cancel_delete_data_status, agent_task_req.signature); @@ -1454,7 +1721,7 @@ void* TaskWorkerPool::_check_consistency_worker_thread_callback(void* arg_this) TAgentTaskRequest agent_task_req; TCheckConsistencyReq check_consistency_req; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -1470,7 +1737,7 @@ void* TaskWorkerPool::_check_consistency_worker_thread_callback(void* arg_this) OLAPStatus res = OLAPStatus::OLAP_SUCCESS; uint32_t checksum = 0; - res = worker_pool_this->_command_executor->compute_checksum( + res = worker_pool_this->_env->olap_engine()->compute_checksum( check_consistency_req.tablet_id, check_consistency_req.schema_hash, check_consistency_req.version, @@ -1509,13 +1776,14 @@ void* TaskWorkerPool::_report_task_worker_thread_callback(void* arg_this) { TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this; TReportRequest request; + request.__set_force_recovery(config::force_recovery); request.__set_backend(worker_pool_this->_backend); #ifndef BE_TEST while (true) { #endif { - lock_guard task_signatures_lock(_s_task_signatures_lock); + lock_guard task_signatures_lock(_s_task_signatures_lock); request.__set_tasks(_s_task_signatures); } @@ -1542,6 +1810,7 @@ void* TaskWorkerPool::_report_disk_state_worker_thread_callback(void* arg_this) TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this; TReportRequest request; + request.__set_force_recovery(config::force_recovery); request.__set_backend(worker_pool_this->_backend); #ifndef BE_TEST @@ -1556,7 +1825,8 @@ void* TaskWorkerPool::_report_disk_state_worker_thread_callback(void* arg_this) #endif vector root_paths_info; - worker_pool_this->_command_executor->get_all_root_path_info(&root_paths_info); + + worker_pool_this->_env->olap_engine()->get_all_root_path_info(&root_paths_info); map disks; for (auto root_path_info : root_paths_info) { @@ -1586,9 +1856,10 @@ void* TaskWorkerPool::_report_disk_state_worker_thread_callback(void* arg_this) // wait disk_broken_cv awaken // if awaken, set is_report_disk_state_already to true, it will not notify again // if overtime, while will go to next cycle - boost::unique_lock lk(_disk_broken_lock); - if (OLAPRootPath::get_instance()->disk_broken_cv.timed_wait(lk, _wait_duration)) { - OLAPRootPath::get_instance()->is_report_disk_state_already = true; + std::unique_lock lk(_disk_broken_lock); + auto cv_status = OLAPEngine::get_instance()->disk_broken_cv.wait_for(lk, _wait_duration); + if (cv_status == std::cv_status::no_timeout) { + OLAPEngine::get_instance()->is_report_disk_state_already = true; } } } @@ -1601,6 +1872,7 @@ void* TaskWorkerPool::_report_olap_table_worker_thread_callback(void* arg_this) TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this; TReportRequest request; + request.__set_force_recovery(config::force_recovery); request.__set_backend(worker_pool_this->_backend); request.__isset.tablets = true; AgentStatus status = PALO_SUCCESS; @@ -1620,7 +1892,7 @@ void* TaskWorkerPool::_report_olap_table_worker_thread_callback(void* arg_this) request.__set_report_version(_s_report_version); OLAPStatus report_all_tablets_info_status = - worker_pool_this->_command_executor->report_all_tablets_info(&request.tablets); + worker_pool_this->_env->olap_engine()->report_all_tablets_info(&request.tablets); if (report_all_tablets_info_status != 
OLAP_SUCCESS) { OLAP_LOG_WARNING("report get all tablets info failed. status: %d", report_all_tablets_info_status); @@ -1628,9 +1900,10 @@ void* TaskWorkerPool::_report_olap_table_worker_thread_callback(void* arg_this) // wait disk_broken_cv awaken // if awaken, set is_report_olap_table_already to true, it will not notify again // if overtime, while will go to next cycle - boost::unique_lock lk(_disk_broken_lock); - if (OLAPRootPath::get_instance()->disk_broken_cv.timed_wait(lk, _wait_duration)) { - OLAPRootPath::get_instance()->is_report_olap_table_already = true; + std::unique_lock lk(_disk_broken_lock); + auto cv_status = OLAPEngine::get_instance()->disk_broken_cv.wait_for(lk, _wait_duration); + if (cv_status == std::cv_status::no_timeout) { + OLAPEngine::get_instance()->is_report_olap_table_already = true; } continue; #else @@ -1652,9 +1925,10 @@ void* TaskWorkerPool::_report_olap_table_worker_thread_callback(void* arg_this) // wait disk_broken_cv awaken // if awaken, set is_report_olap_table_already to true, it will not notify again // if overtime, while will go to next cycle - boost::unique_lock lk(_disk_broken_lock); - if (OLAPRootPath::get_instance()->disk_broken_cv.timed_wait(lk, _wait_duration)) { - OLAPRootPath::get_instance()->is_report_olap_table_already = true; + std::unique_lock lk(_disk_broken_lock); + auto cv_status = OLAPEngine::get_instance()->disk_broken_cv.wait_for(lk, _wait_duration); + if (cv_status == std::cv_status::no_timeout) { + OLAPEngine::get_instance()->is_report_olap_table_already = true; } } #endif @@ -1671,7 +1945,7 @@ void* TaskWorkerPool::_upload_worker_thread_callback(void* arg_this) { TAgentTaskRequest agent_task_req; TUploadReq upload_request; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -1734,7 +2008,7 @@ void* TaskWorkerPool::_download_worker_thread_callback(void* arg_this) { TAgentTaskRequest agent_task_req; TDownloadReq download_request; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -1800,7 +2074,7 @@ void* TaskWorkerPool::_make_snapshot_thread_callback(void* arg_this) { TAgentTaskRequest agent_task_req; TSnapshotRequest snapshot_request; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -1819,7 +2093,7 @@ void* TaskWorkerPool::_make_snapshot_thread_callback(void* arg_this) { string snapshot_path; std::vector snapshot_files; - OLAPStatus make_snapshot_status = worker_pool_this->_command_executor->make_snapshot( + OLAPStatus make_snapshot_status = worker_pool_this->_env->olap_engine()->make_snapshot( snapshot_request, &snapshot_path); if (make_snapshot_status != OLAP_SUCCESS) { status_code = TStatusCode::RUNTIME_ERROR; @@ -1886,7 +2160,7 @@ void* TaskWorkerPool::_release_snapshot_thread_callback(void* arg_this) { TAgentTaskRequest agent_task_req; TReleaseSnapshotRequest release_snapshot_request; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); while 
(worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -1905,16 +2179,16 @@ void* TaskWorkerPool::_release_snapshot_thread_callback(void* arg_this) { string& snapshot_path = release_snapshot_request.snapshot_path; OLAPStatus release_snapshot_status = - worker_pool_this->_command_executor->release_snapshot(snapshot_path); + worker_pool_this->_env->olap_engine()->release_snapshot(snapshot_path); if (release_snapshot_status != OLAP_SUCCESS) { status_code = TStatusCode::RUNTIME_ERROR; - OLAP_LOG_WARNING("release_snapshot failed. snapshot_path: %s, status: %d", - snapshot_path.c_str(), release_snapshot_status); + LOG(WARNING) << "release_snapshot failed. snapshot_path: " << snapshot_path + << ". status: " << release_snapshot_status; error_msgs.push_back("release_snapshot failed. status: " + boost::lexical_cast(release_snapshot_status)); } else { - OLAP_LOG_INFO("release_snapshot success. snapshot_path: %s, status: %d", - snapshot_path.c_str(), release_snapshot_status); + LOG(INFO) << "release_snapshot success. snapshot_path: " << snapshot_path + << ". status: " << release_snapshot_status; } task_status.__set_status_code(status_code); @@ -1938,14 +2212,14 @@ AlterTableStatus TaskWorkerPool::_show_alter_table_status( TTabletId tablet_id, TSchemaHash schema_hash) { AlterTableStatus alter_table_status = - _command_executor->show_alter_table_status(tablet_id, schema_hash); + _env->olap_engine()->show_alter_table_status(tablet_id, schema_hash); return alter_table_status; } -AgentStatus TaskWorkerPool::_drop_table(const TDropTabletReq drop_tablet_req) { +AgentStatus TaskWorkerPool::_drop_table(const TDropTabletReq& req) { AgentStatus status = PALO_SUCCESS; - OLAPStatus drop_status = _command_executor->drop_table(drop_tablet_req); - if (drop_status != OLAPStatus::OLAP_SUCCESS) { + OLAPStatus drop_status = _env->olap_engine()->drop_table(req.tablet_id, req.schema_hash); + if (drop_status != OLAP_SUCCESS && drop_status != OLAP_ERR_TABLE_NOT_FOUND) { status = PALO_ERROR; } return status; @@ -1960,8 +2234,7 @@ AgentStatus TaskWorkerPool::_get_tablet_info( tablet_info->__set_tablet_id(tablet_id); tablet_info->__set_schema_hash(schema_hash); - OLAPStatus olap_status = - _command_executor->report_tablet_info(tablet_info); + OLAPStatus olap_status = _env->olap_engine()->report_tablet_info(tablet_info); if (olap_status != OLAP_SUCCESS) { OLAP_LOG_WARNING("get tablet info failed. 
status: %d, signature: %ld", olap_status, signature); @@ -1979,7 +2252,7 @@ void* TaskWorkerPool::_move_dir_thread_callback(void* arg_this) { TAgentTaskRequest agent_task_req; TMoveDirReq move_dir_req; { - lock_guard worker_thread_lock(worker_pool_this->_worker_thread_lock); + MutexLock worker_thread_lock(&(worker_pool_this->_worker_thread_lock)); while (worker_pool_this->_tasks.empty()) { worker_pool_this->_worker_thread_condition_lock.wait(); } @@ -2043,7 +2316,7 @@ AgentStatus TaskWorkerPool::_move_dir( bool overwrite, std::vector* error_msgs) { - SmartOLAPTable tablet = _command_executor->get_table( + OLAPTablePtr tablet = _env->olap_engine()->get_table( tablet_id, schema_hash); if (tablet.get() == NULL) { OLAP_LOG_INFO("failed to get tablet: %ld, schema hash: %d", @@ -2053,9 +2326,10 @@ AgentStatus TaskWorkerPool::_move_dir( } std::string dest_tablet_dir = tablet->construct_dir_path(); + std::string store_path = tablet->store()->path(); SnapshotLoader* loader = _env->snapshot_loader(); - Status status = loader->move(src, dest_tablet_dir, job_id, overwrite); + Status status = loader->move(src, dest_tablet_dir, store_path, job_id, overwrite); if (!status.ok()) { OLAP_LOG_WARNING("move failed. job id: %ld, msg: %s", @@ -2067,4 +2341,60 @@ AgentStatus TaskWorkerPool::_move_dir( return PALO_SUCCESS; } +void* TaskWorkerPool::_recover_tablet_thread_callback(void* arg_this) { + TaskWorkerPool* worker_pool_this = (TaskWorkerPool*)arg_this; + + while (true) { + TAgentTaskRequest agent_task_req; + TRecoverTabletReq recover_tablet_req; + { + MutexLock worker_thread_lock(&(worker_pool_this->_worker_thread_lock)); + while (worker_pool_this->_tasks.empty()) { + worker_pool_this->_worker_thread_condition_lock.wait(); + } + + agent_task_req = worker_pool_this->_tasks.front(); + recover_tablet_req = agent_task_req.recover_tablet_req; + worker_pool_this->_tasks.pop_front(); + } + // Try to register to cgroups_mgr + CgroupsMgr::apply_system_cgroup(); + + TStatusCode::type status_code = TStatusCode::OK; + vector error_msgs; + TStatus task_status; + + LOG(INFO) << "begin to recover tablet." + << "table:" << recover_tablet_req.tablet_id << "." << recover_tablet_req.schema_hash << ", " + << "version:" << recover_tablet_req.version << "-" << recover_tablet_req.version_hash; + OLAPStatus status = worker_pool_this->_env->olap_engine()->recover_tablet_until_specfic_version(recover_tablet_req); + if (status != OLAP_SUCCESS) { + status_code = TStatusCode::RUNTIME_ERROR; + LOG(WARNING) << "failed to recover tablet." + << "signature:" << agent_task_req.signature << ", " + << "table:" << recover_tablet_req.tablet_id << "." << recover_tablet_req.schema_hash << ", " + << "version:" << recover_tablet_req.version << "-" << recover_tablet_req.version_hash; + } else { + LOG(INFO) << "succeed to recover tablet." + << "signature:" << agent_task_req.signature << ", " + << "table:" << recover_tablet_req.tablet_id << "." 
<< recover_tablet_req.schema_hash << ", " + << "version:" << recover_tablet_req.version << "-" << recover_tablet_req.version_hash; + } + + task_status.__set_status_code(status_code); + task_status.__set_error_msgs(error_msgs); + + TFinishTaskRequest finish_task_request; + finish_task_request.__set_backend(worker_pool_this->_backend); + finish_task_request.__set_task_type(agent_task_req.task_type); + finish_task_request.__set_signature(agent_task_req.signature); + finish_task_request.__set_task_status(task_status); + + worker_pool_this->_finish_task(finish_task_request); + worker_pool_this->_remove_task_info(agent_task_req.task_type, agent_task_req.signature, ""); + + } + return (void*)0; +} + } // namespace palo diff --git a/be/src/agent/task_worker_pool.h b/be/src/agent/task_worker_pool.h index a2531b0835..5c2762dd40 100644 --- a/be/src/agent/task_worker_pool.h +++ b/be/src/agent/task_worker_pool.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include "agent/pusher.h" @@ -28,9 +29,8 @@ #include "agent/utils.h" #include "gen_cpp/AgentService_types.h" #include "gen_cpp/HeartbeatService_types.h" -#include "olap/command_executor.h" #include "olap/olap_define.h" -#include "olap/olap_rootpath.h" +#include "olap/olap_engine.h" #include "olap/utils.h" namespace palo { @@ -43,6 +43,10 @@ public: CREATE_TABLE, DROP_TABLE, PUSH, + REALTIME_PUSH, + PUBLISH_VERSION, + CLEAR_ALTER_TASK, + CLEAR_TRANSACTION_TASK, DELETE, ALTER_TABLE, QUERY_SPLIT_KEY, @@ -57,7 +61,8 @@ public: DOWNLOAD, MAKE_SNAPSHOT, RELEASE_SNAPSHOT, - MOVE + MOVE, + RECOVER_TABLET }; typedef void* (*CALLBACK_FUNCTION)(void*); @@ -90,6 +95,9 @@ private: static void* _create_table_worker_thread_callback(void* arg_this); static void* _drop_table_worker_thread_callback(void* arg_this); static void* _push_worker_thread_callback(void* arg_this); + static void* _publish_version_worker_thread_callback(void* arg_this); + static void* _clear_alter_task_worker_thread_callback(void* arg_this); + static void* _clear_transaction_task_worker_thread_callback(void* arg_this); static void* _alter_table_worker_thread_callback(void* arg_this); static void* _clone_worker_thread_callback(void* arg_this); static void* _storage_medium_migrate_worker_thread_callback(void* arg_this); @@ -103,6 +111,7 @@ private: static void* _make_snapshot_thread_callback(void* arg_this); static void* _release_snapshot_thread_callback(void* arg_this); static void* _move_dir_thread_callback(void* arg_this); + static void* _recover_tablet_thread_callback(void* arg_this); AgentStatus _clone_copy( const TCloneReq& clone_req, @@ -110,7 +119,9 @@ private: const std::string& local_data_path, TBackend* src_host, std::string* src_file_path, - std::vector* error_msgs); + std::vector* error_msgs, + const std::vector* missing_versions, + bool* allow_incremental_clone); void _alter_table( const TAlterTabletReq& create_rollup_request, @@ -122,7 +133,7 @@ private: const TTabletId tablet_id, const TSchemaHash schema_hash); - AgentStatus _drop_table(const TDropTabletReq drop_tablet_req); + AgentStatus _drop_table(const TDropTabletReq& drop_tablet_req); AgentStatus _get_tablet_info( const TTabletId tablet_id, @@ -142,7 +153,6 @@ private: TBackend _backend; AgentUtils* _agent_utils; MasterServerClient* _master_client; - CommandExecutor* _command_executor; ExecEnv* _env; #ifdef BE_TEST AgentServerClient* _agent_client; @@ -151,7 +161,7 @@ private: #endif std::deque _tasks; - MutexLock _worker_thread_lock; + Mutex _worker_thread_lock; Condition 
_worker_thread_condition_lock; uint32_t _worker_count; TaskWorkerType _task_worker_type; @@ -161,12 +171,12 @@ private: static std::map> _s_running_task_user_count; static std::map> _s_total_task_user_count; static std::map _s_total_task_count; - static MutexLock _s_task_signatures_lock; - static MutexLock _s_running_task_user_count_lock; + static Mutex _s_task_signatures_lock; + static Mutex _s_running_task_user_count_lock; static FrontendServiceClientCache _master_service_client_cache; - static boost::mutex _disk_broken_lock; - static boost::posix_time::time_duration _wait_duration; + static std::mutex _disk_broken_lock; + static std::chrono::seconds _wait_duration; DISALLOW_COPY_AND_ASSIGN(TaskWorkerPool); }; // class TaskWorkerPool diff --git a/be/src/agent/utils.cpp b/be/src/agent/utils.cpp index 6c9dc1904d..6f4ae6e482 100644 --- a/be/src/agent/utils.cpp +++ b/be/src/agent/utils.cpp @@ -129,11 +129,9 @@ AgentStatus MasterServerClient::finish_task( &client_status); if (!client_status.ok()) { - OLAP_LOG_WARNING("master client, get client from cache failed." - "host: %s, port: %d, code: %d", - _master_info.network_address.hostname.c_str(), - _master_info.network_address.port, - client_status.code()); + LOG(WARNING) << "master client. get client from cache failed. host: " + << _master_info.network_address.hostname << ". port: " << _master_info.network_address.port + << ". code: " << client_status.code(); return PALO_ERROR; } @@ -212,11 +210,8 @@ AgentStatus MasterServerClient::report(const TReportRequest request, TMasterResu } } } catch (TException& e) { - OLAP_LOG_WARNING("master client, finish report failed." - "host: %s, port: %d, code: %d", - _master_info.network_address.hostname.c_str(), - _master_info.network_address.port, - client_status.code()); + LOG(WARNING) << "master client. finish report failed. host: " << _master_info.network_address.hostname + << ". port: " << _master_info.network_address.port << ". code: " << client_status.code(); return PALO_ERROR; } @@ -243,7 +238,7 @@ AgentStatus AgentUtils::rsync_from_remote( cmd_stream << " --timeout=" << timeout_second; } cmd_stream << " " << remote_host << ":" << remote_file_path << " " << local_file_path; - OLAP_LOG_INFO("rsync cmd: %s", cmd_stream.str().c_str()); + LOG(INFO) << "rsync cmd: " << cmd_stream.str(); FILE* fp = NULL; fp = popen(cmd_stream.str().c_str(), "r"); diff --git a/be/src/codegen/CMakeLists.txt b/be/src/codegen/CMakeLists.txt index d69e628c89..ad98ab709b 100644 --- a/be/src/codegen/CMakeLists.txt +++ b/be/src/codegen/CMakeLists.txt @@ -70,7 +70,7 @@ add_custom_command( COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} "-msse4.2" ${CLANG_INCLUDE_FLAGS} ${IR_INPUT_FILES} -o ${IR_SSE_TMP_OUTPUT_FILE} COMMAND ${LLVM_OPT_EXECUTABLE} --instnamer < ${IR_SSE_TMP_OUTPUT_FILE} > ${IR_SSE_OUTPUT_FILE} COMMAND rm ${IR_SSE_TMP_OUTPUT_FILE} - DEPENDS Exec Exprs Udf ${IR_INPUT_FILES} + DEPENDS Exprs Udf ${IR_INPUT_FILES} ) # Compile without sse enabled. 
@@ -79,7 +79,7 @@ add_custom_command( COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} ${CLANG_INCLUDE_FLAGS} ${IR_INPUT_FILES} -o ${IR_NO_SSE_TMP_OUTPUT_FILE} COMMAND ${LLVM_OPT_EXECUTABLE} --instnamer < ${IR_NO_SSE_TMP_OUTPUT_FILE} > ${IR_NO_SSE_OUTPUT_FILE} COMMAND rm ${IR_NO_SSE_TMP_OUTPUT_FILE} - DEPENDS Exec Exprs Udf ${IR_INPUT_FILES} + DEPENDS Exprs Udf ${IR_INPUT_FILES} ) add_custom_target(compile_to_ir_sse DEPENDS ${IR_SSE_OUTPUT_FILE}) diff --git a/be/src/codegen/llvm_codegen.cpp b/be/src/codegen/llvm_codegen.cpp index c29dc47669..c61a56b749 100644 --- a/be/src/codegen/llvm_codegen.cpp +++ b/be/src/codegen/llvm_codegen.cpp @@ -1066,6 +1066,7 @@ Value* LlvmCodeGen::codegen_array_at( void LlvmCodeGen::codegen_assign(LlvmBuilder* builder, llvm::Value* dst, llvm::Value* src, PrimitiveType type) { switch (type) { + case TYPE_CHAR: case TYPE_VARCHAR: case TYPE_HLL: { codegen_memcpy(builder, dst, src, sizeof(StringValue)); diff --git a/be/src/common/config.h b/be/src/common/config.h index ffe877cad4..e042cb9e48 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -65,6 +65,12 @@ namespace config { CONF_Int32(push_worker_count_normal_priority, "3"); // the count of thread to high priority batch load CONF_Int32(push_worker_count_high_priority, "3"); + // the count of thread to publish version + CONF_Int32(publish_version_worker_count, "2"); + // the count of thread to clear alter task + CONF_Int32(clear_alter_task_worker_count, "1"); + // the count of thread to clear transaction task + CONF_Int32(clear_transaction_task_worker_count, "1"); // the count of thread to delete CONF_Int32(delete_worker_count, "3"); // the count of thread to alter table @@ -209,6 +215,10 @@ namespace config { CONF_Int32(default_num_rows_per_data_block, "1024"); CONF_Int32(default_num_rows_per_column_file_block, "1024"); CONF_Int32(max_tablet_num_per_shard, "1024"); + // pending data policy + CONF_Int32(pending_data_expire_time_sec, "1800"); + // incremental delta policy + CONF_Int32(incremental_delta_expire_time_sec, "1800"); // garbage sweep policy CONF_Int32(max_garbage_sweep_interval, "86400"); CONF_Int32(min_garbage_sweep_interval, "200"); @@ -240,7 +250,6 @@ namespace config { CONF_Int64(cumulative_compaction_budgeted_bytes, "104857600"); CONF_Int32(cumulative_compaction_write_mbytes_per_sec, "100"); - CONF_Int32(delete_delta_expire_time, "1440"); // Port to start debug webserver on CONF_Int32(webserver_port, "8040"); // Interface to start debug webserver on. 
If blank, webserver binds to 0.0.0.0 @@ -255,6 +264,9 @@ namespace config { // Used for mini Load CONF_Int64(load_data_reserve_hours, "24"); CONF_Int64(mini_load_max_mb, "2048"); + CONF_Int32(number_tablet_writer_threads, "16"); + + CONF_Int64(streaming_load_max_mb, "10240"); // Fragment thread pool CONF_Int32(fragment_pool_thread_num, "64"); @@ -365,8 +377,17 @@ namespace config { // Aligement CONF_Int32(FLAGS_MEMORY_MAX_ALIGNMENT, "16"); + // write buffer size before flush + CONF_Int32(write_buffer_size, "104857600"); + + // update interval of tablet stat cache + CONF_Int32(tablet_stat_cache_update_interval_second, "300"); + // result buffer cancelled time (unit: second) CONF_Int32(result_buffer_cancelled_interval_time, "5"); + + // can perform recovering tablet + CONF_Bool(force_recovery, "false"); } // namespace config } // namespace palo diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index 82f5b8e105..37b6b1179b 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -45,15 +45,18 @@ #include "exprs/cast_functions.h" #include "exprs/math_functions.h" #include "exprs/encryption_functions.h" +#include "exprs/es_functions.h" #include "exprs/timestamp_functions.h" #include "exprs/decimal_operators.h" #include "exprs/utility_functions.h" #include "exprs/json_functions.h" #include "exprs/hll_hash_function.h" -#include "olap/olap_rootpath.h" +#include "olap/options.h" namespace palo { +bool k_palo_exit = false; + void* tcmalloc_gc_thread(void* dummy) { while (1) { sleep(10); @@ -105,19 +108,14 @@ void* memory_maintenance_thread(void* dummy) { return NULL; } -static void init_palo_metrics() { +static void init_palo_metrics(const std::vector& store_paths) { bool init_system_metrics = config::enable_system_metrics; std::set disk_devices; std::vector network_interfaces; if (init_system_metrics) { std::vector paths; - std::vector capacities; - auto res = OLAPRootPath::parse_root_paths_from_string( - config::storage_root_path.c_str(), &paths, &capacities); - if (res != OLAP_SUCCESS) { - LOG(WARNING) << "parse storage_root_path failed, res=" << res - << ", path=" << config::storage_root_path; - return; + for (auto& store_path : store_paths) { + paths.emplace_back(store_path.path); } auto st = DiskInfo::get_disk_devices(paths, &disk_devices); if (!st.ok()) { @@ -134,7 +132,37 @@ static void init_palo_metrics() { "palo_be", init_system_metrics, disk_devices, network_interfaces); } -void init_daemon(int argc, char** argv) { +void sigterm_handler(int signo) { + k_palo_exit = true; +} + +int install_signal(int signo, void(*handler)(int)) { + struct sigaction sa; + memset(&sa, 0, sizeof(struct sigaction)); + sa.sa_handler = handler; + sigemptyset(&sa.sa_mask); + auto ret = sigaction(signo, &sa, nullptr); + if (ret != 0) { + char buf[64]; + LOG(ERROR) << "install signal failed, signo=" << signo + << ", errno=" << errno + << ", errmsg=" << strerror_r(errno, buf, sizeof(buf)); + } + return ret; +} + +void init_signals() { + auto ret = install_signal(SIGINT, sigterm_handler); + if (ret < 0) { + exit(-1); + } + ret = install_signal(SIGTERM, sigterm_handler); + if (ret < 0) { + exit(-1); + } +} + +void init_daemon(int argc, char** argv, const std::vector& paths) { // google::SetVersionString(get_build_version(false)); // google::ParseCommandLineFlags(&argc, &argv, true); google::ParseCommandLineFlags(&argc, &argv, true); @@ -161,6 +189,7 @@ void init_daemon(int argc, char** argv) { CompoundPredicate::init(); JsonFunctions::init(); HllHashFunctions::init(); + 
ESFunctions::init(); pthread_t tc_malloc_pid; pthread_create(&tc_malloc_pid, NULL, tcmalloc_gc_thread, NULL); @@ -171,7 +200,8 @@ void init_daemon(int argc, char** argv) { LOG(INFO) << CpuInfo::debug_string(); LOG(INFO) << DiskInfo::debug_string(); LOG(INFO) << MemInfo::debug_string(); - init_palo_metrics(); + init_palo_metrics(paths); + init_signals(); } } diff --git a/be/src/common/daemon.h b/be/src/common/daemon.h index 53203e0082..319c27b5c0 100644 --- a/be/src/common/daemon.h +++ b/be/src/common/daemon.h @@ -21,12 +21,16 @@ #ifndef BDG_PALO_BE_SRC_COMMON_COMMON_DAEMON_H #define BDG_PALO_BE_SRC_COMMON_COMMON_DAEMON_H +#include + namespace palo { +class StorePath; + // Initialises logging, flags etc. Callers that want to override default gflags // variables should do so before calling this method; no logging should be // performed until after this method returns. -void init_daemon(int argc, char** argv); +void init_daemon(int argc, char** argv, const std::vector& paths); } diff --git a/be/src/exec/CMakeLists.txt b/be/src/exec/CMakeLists.txt index 347da037d0..64a6e1b2ab 100644 --- a/be/src/exec/CMakeLists.txt +++ b/be/src/exec/CMakeLists.txt @@ -27,65 +27,67 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/exec") set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/exec") set(EXEC_FILES - aggregation_node.cpp - #pre_aggregation_node.cpp - aggregation_node_ir.cpp - analytic_eval_node.cpp - blocking_join_node.cpp - broker_scan_node.cpp - broker_reader.cpp - broker_scanner.cpp - cross_join_node.cpp - data_sink.cpp - decompressor.cpp - empty_set_node.cpp - exec_node.cpp - exchange_node.cpp - hash_join_node.cpp - hash_join_node_ir.cpp - hash_table.cpp - local_file_reader.cpp - merge_node.cpp - merge_join_node.cpp - scan_node.cpp - select_node.cpp - text_converter.cpp - topn_node.cpp - sort_exec_exprs.cpp - sort_node.cpp - olap_rewrite_node.cpp - olap_scan_node.cpp - olap_scanner.cpp - olap_meta_reader.cpp - olap_common.cpp - plain_text_line_reader.cpp - mysql_scan_node.cpp - mysql_scanner.cpp - csv_scan_node.cpp - csv_scanner.cpp - spill_sort_node.cc - union_node.cpp - union_node_ir.cpp - schema_scanner.cpp - schema_scan_node.cpp - schema_scanner/schema_tables_scanner.cpp - schema_scanner/schema_dummy_scanner.cpp - schema_scanner/schema_schemata_scanner.cpp - schema_scanner/schema_variables_scanner.cpp - schema_scanner/schema_columns_scanner.cpp - schema_scanner/schema_charsets_scanner.cpp - schema_scanner/schema_collations_scanner.cpp - schema_scanner/frontend_helper.cpp - partitioned_hash_table.cc - partitioned_hash_table_ir.cc - partitioned_aggregation_node.cc - partitioned_aggregation_node_ir.cc - new_partitioned_hash_table.cc - new_partitioned_hash_table_ir.cc - new_partitioned_aggregation_node.cc - new_partitioned_aggregation_node_ir.cc - local_file_writer.cpp - broker_writer.cpp + aggregation_node.cpp + #pre_aggregation_node.cpp + aggregation_node_ir.cpp + analytic_eval_node.cpp + blocking_join_node.cpp + broker_scan_node.cpp + broker_reader.cpp + broker_scanner.cpp + cross_join_node.cpp + data_sink.cpp + decompressor.cpp + empty_set_node.cpp + exec_node.cpp + exchange_node.cpp + hash_join_node.cpp + hash_join_node_ir.cpp + hash_table.cpp + local_file_reader.cpp + merge_node.cpp + merge_join_node.cpp + scan_node.cpp + select_node.cpp + text_converter.cpp + topn_node.cpp + sort_exec_exprs.cpp + sort_node.cpp + olap_rewrite_node.cpp + olap_scan_node.cpp + olap_scanner.cpp + olap_meta_reader.cpp + olap_common.cpp + olap_table_info.cpp + olap_table_sink.cpp + plain_text_line_reader.cpp + 
mysql_scan_node.cpp + mysql_scanner.cpp + csv_scan_node.cpp + csv_scanner.cpp + spill_sort_node.cc + union_node.cpp + union_node_ir.cpp + schema_scanner.cpp + schema_scan_node.cpp + schema_scanner/schema_tables_scanner.cpp + schema_scanner/schema_dummy_scanner.cpp + schema_scanner/schema_schemata_scanner.cpp + schema_scanner/schema_variables_scanner.cpp + schema_scanner/schema_columns_scanner.cpp + schema_scanner/schema_charsets_scanner.cpp + schema_scanner/schema_collations_scanner.cpp + schema_scanner/frontend_helper.cpp + partitioned_hash_table.cc + partitioned_hash_table_ir.cc + partitioned_aggregation_node.cc + partitioned_aggregation_node_ir.cc + new_partitioned_hash_table.cc + new_partitioned_hash_table_ir.cc + new_partitioned_aggregation_node.cc + new_partitioned_aggregation_node_ir.cc + local_file_writer.cpp + broker_writer.cpp ) if(EXISTS "${BASE_DIR}/src/exec/kudu_util.cpp") diff --git a/be/src/exec/broker_scanner.cpp b/be/src/exec/broker_scanner.cpp index 2bde4a7f58..278aaad478 100644 --- a/be/src/exec/broker_scanner.cpp +++ b/be/src/exec/broker_scanner.cpp @@ -19,8 +19,11 @@ #include #include "runtime/descriptors.h" +#include "runtime/exec_env.h" #include "runtime/mem_tracker.h" #include "runtime/raw_value.h" +#include "runtime/load_stream_mgr.h" +#include "runtime/stream_load_pipe.h" #include "runtime/tuple.h" #include "exprs/expr.h" #include "exec/text_converter.h" @@ -87,9 +90,14 @@ BrokerScanner::BrokerScanner(RuntimeState* state, _skip_next_line(false), _src_tuple(nullptr), _src_tuple_row(nullptr), - _mem_pool(_state->instance_mem_tracker()), - _dest_tuple_desc(nullptr), +#if BE_TEST + _mem_tracker(new MemTracker()), + _mem_pool(_mem_tracker.get()), +#else _mem_tracker(new MemTracker(-1, "Broker Scanner", state->instance_mem_tracker())), + _mem_pool(_state->instance_mem_tracker()), +#endif + _dest_tuple_desc(nullptr), _counter(counter), _rows_read_counter(nullptr), _read_timer(nullptr), @@ -228,8 +236,13 @@ Status BrokerScanner::open_next_reader() { Status BrokerScanner::open_file_reader() { if (_cur_file_reader != nullptr) { - delete _cur_file_reader; - _cur_file_reader = nullptr; + if (_stream_load_pipe != nullptr) { + _stream_load_pipe.reset(); + _cur_file_reader = nullptr; + } else { + delete _cur_file_reader; + _cur_file_reader = nullptr; + } } const TBrokerRangeDesc& range = _ranges[_next_range]; @@ -251,6 +264,14 @@ Status BrokerScanner::open_file_reader() { _cur_file_reader = broker_reader; break; } + case TFileType::FILE_STREAM: { + _stream_load_pipe = _state->exec_env()->load_stream_mgr()->get(range.load_id); + if (_stream_load_pipe == nullptr) { + return Status("unknown stream load id"); + } + _cur_file_reader = _stream_load_pipe.get(); + break; + } default: { std::stringstream ss; ss << "Unknown file type, type=" << range.file_type; @@ -360,8 +381,13 @@ void BrokerScanner::close() { } if (_cur_file_reader != nullptr) { - delete _cur_file_reader; - _cur_file_reader = nullptr; + if (_stream_load_pipe != nullptr) { + _stream_load_pipe.reset(); + _cur_file_reader = nullptr; + } else { + delete _cur_file_reader; + _cur_file_reader = nullptr; + } } Expr::close(_dest_expr_ctx, _state); } diff --git a/be/src/exec/broker_scanner.h b/be/src/exec/broker_scanner.h index 522781f70f..ac5fed08f4 100644 --- a/be/src/exec/broker_scanner.h +++ b/be/src/exec/broker_scanner.h @@ -43,6 +43,7 @@ class TupleRow; class RowDescriptor; class MemTracker; class RuntimeProfile; +class StreamLoadPipe; struct BrokerScanCounter { BrokerScanCounter() : num_rows_returned(0), 
num_rows_filtered(0) { @@ -141,6 +142,7 @@ private: Tuple* _src_tuple; TupleRow* _src_tuple_row; + std::unique_ptr _mem_tracker; // Mem pool used to allocate _src_tuple and _src_tuple_row MemPool _mem_pool; @@ -148,7 +150,8 @@ private: const TupleDescriptor* _dest_tuple_desc; std::vector _dest_expr_ctx; - std::unique_ptr _mem_tracker; + // used to hold current StreamLoadPipe + std::shared_ptr _stream_load_pipe; // used for process stat BrokerScanCounter* _counter; diff --git a/be/src/exec/data_sink.cpp b/be/src/exec/data_sink.cpp index b56a306d2c..a486daffec 100644 --- a/be/src/exec/data_sink.cpp +++ b/be/src/exec/data_sink.cpp @@ -25,6 +25,7 @@ #include #include "exec/exec_node.h" +#include "exec/olap_table_sink.h" #include "exprs/expr.h" #include "gen_cpp/PaloInternalService_types.h" #include "runtime/data_stream_sender.h" @@ -105,6 +106,13 @@ Status DataSink::create_data_sink( sink->reset(export_sink.release()); break; } + case TDataSinkType::OLAP_TABLE_SINK: { + Status status; + DCHECK(thrift_sink.__isset.olap_table_sink); + sink->reset(new stream_load::OlapTableSink(pool, row_desc, output_exprs, &status)); + RETURN_IF_ERROR(status); + break; + } default: std::stringstream error_msg; diff --git a/be/src/exec/new_partitioned_aggregation_node.cc b/be/src/exec/new_partitioned_aggregation_node.cc index 2a57282c8b..3c0df412a9 100644 --- a/be/src/exec/new_partitioned_aggregation_node.cc +++ b/be/src/exec/new_partitioned_aggregation_node.cc @@ -165,7 +165,7 @@ Status NewPartitionedAggregationNode::init(const TPlanNode& tnode, RuntimeState* // Construct build exprs from intermediate_row_desc_ for (int i = 0; i < grouping_exprs_.size(); ++i) { SlotDescriptor* desc = intermediate_tuple_desc_->slots()[i]; - DCHECK(desc->type().type == TYPE_NULL || desc->type() == grouping_exprs_[i]->type()); + //DCHECK(desc->type().type == TYPE_NULL || desc->type() == grouping_exprs_[i]->type()); // Hack to avoid TYPE_NULL SlotRefs. SlotRef* build_expr = _pool->add(desc->type().type != TYPE_NULL ? new SlotRef(desc) : new SlotRef(desc, TYPE_BOOLEAN)); @@ -229,7 +229,7 @@ Status NewPartitionedAggregationNode::prepare(RuntimeState* state) { const RowDescriptor& row_desc = child(0)->row_desc(); RETURN_IF_ERROR(NewAggFnEvaluator::Create(agg_fns_, state, _pool, agg_fn_pool_.get(), &agg_fn_evals_, expr_mem_tracker(), row_desc)); - + expr_results_pool_.reset(new MemPool(_expr_mem_tracker.get())); if (!grouping_exprs_.empty()) { RowDescriptor build_row_desc(intermediate_tuple_desc_, false); @@ -695,7 +695,7 @@ Status NewPartitionedAggregationNode::close(RuntimeState* state) { // Close all the agg-fn-evaluators NewAggFnEvaluator::Close(agg_fn_evals_, state); - + if (expr_results_pool_.get() != nullptr) { expr_results_pool_->free_all(); } diff --git a/be/src/exec/new_partitioned_hash_table.cc b/be/src/exec/new_partitioned_hash_table.cc index 4fb1e4e29e..52875c6542 100644 --- a/be/src/exec/new_partitioned_hash_table.cc +++ b/be/src/exec/new_partitioned_hash_table.cc @@ -261,7 +261,10 @@ uint32_t NewPartitionedHashTableCtx::HashVariableLenRow(const uint8_t* expr_valu for (int i = 0; i < build_exprs_.size(); ++i) { // non-string and null slots are already part of 'expr_values'. 
// if (build_expr_ctxs_[i]->root()->type().type != TYPE_STRING - if (build_exprs_[i]->type().type != TYPE_VARCHAR) continue; + PrimitiveType type = build_exprs_[i]->type().type; + if (type != TYPE_CHAR && type != TYPE_VARCHAR) { + continue; + } const void* loc = expr_values_cache_.ExprValuePtr(expr_values, i); if (expr_values_null[i]) { diff --git a/be/src/exec/olap_meta_reader.cpp b/be/src/exec/olap_meta_reader.cpp index 75ade22ed4..1b76b76f3f 100644 --- a/be/src/exec/olap_meta_reader.cpp +++ b/be/src/exec/olap_meta_reader.cpp @@ -25,7 +25,6 @@ #include "olap_scanner.h" #include "olap_scan_node.h" #include "olap_utils.h" -#include "olap/olap_table.h" #include "runtime/descriptors.h" #include "runtime/runtime_state.h" #include "runtime/mem_pool.h" @@ -42,7 +41,8 @@ Status EngineMetaReader::get_hints( RuntimeProfile* profile) { auto tablet_id = scan_range->scan_range().tablet_id; int32_t schema_hash = strtoul(scan_range->scan_range().schema_hash.c_str(), NULL, 10); - auto table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); + OLAPTablePtr table = OLAPEngine::get_instance()->get_table( + tablet_id, schema_hash); if (table.get() == NULL) { LOG(WARNING) << "tablet does not exist. tablet_id=" << tablet_id << ", schema_hash=" << schema_hash; diff --git a/be/src/exec/olap_scan_node.cpp b/be/src/exec/olap_scan_node.cpp index 9f0b72159d..e4e4b8d0d3 100644 --- a/be/src/exec/olap_scan_node.cpp +++ b/be/src/exec/olap_scan_node.cpp @@ -677,7 +677,7 @@ Status OlapScanNode::normalize_in_predicate(SlotDescriptor* slot, ColumnValueRan continue; } - if (pred->get_child(0)->type() != slot->type()) { + if (pred->get_child(0)->type().type != slot->type().type) { if (!ignore_cast(slot, pred->get_child(0))) { continue; } @@ -759,7 +759,7 @@ Status OlapScanNode::normalize_in_predicate(SlotDescriptor* slot, ColumnValueRan != TExprNodeType::SLOT_REF) { continue; } - if (pred->get_child(child_idx)->type() != slot->type()) { + if (pred->get_child(child_idx)->type().type != slot->type().type) { if (!ignore_cast(slot, pred->get_child(child_idx))) { continue; } @@ -866,7 +866,7 @@ Status OlapScanNode::normalize_binary_predicate(SlotDescriptor* slot, ColumnValu if (Expr::type_without_cast(pred->get_child(child_idx)) != TExprNodeType::SLOT_REF) { continue; } - if (pred->get_child(child_idx)->type() != slot->type()) { + if (pred->get_child(child_idx)->type().type != slot->type().type) { if (!ignore_cast(slot, pred->get_child(child_idx))) { continue; } diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index 54035b45be..bed94eebc5 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -79,24 +79,32 @@ Status OlapScanner::_prepare( { _olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); if (_olap_table.get() == nullptr) { - OLAP_LOG_WARNING("table does not exists. [tablet_id=%ld schema_hash=%d]", + OLAP_LOG_WARNING("tablet does not exist. [tablet_id=%ld schema_hash=%d]", tablet_id, schema_hash); - return Status("table does not exists"); + + std::stringstream ss; + ss << "tablet does not exist: " << tablet_id; + return Status(ss.str()); } { - AutoRWLock auto_lock(_olap_table->get_header_lock_ptr(), true); - const FileVersionMessage* message = _olap_table->latest_version(); - if (message == NULL) { - OLAP_LOG_WARNING("fail to get latest version. 
[tablet_id=%ld]", tablet_id); - return Status("fail to get latest version"); + ReadLock rdlock(_olap_table->get_header_lock_ptr()); + const PDelta* delta = _olap_table->lastest_version(); + if (delta == NULL) { + std::stringstream ss; + ss << "fail to get latest version of tablet: " << tablet_id; + OLAP_LOG_WARNING(ss.str().c_str()); + return Status(ss.str()); } - if (message->end_version() == _version - && message->version_hash() != version_hash) { + if (delta->end_version() == _version + && delta->version_hash() != version_hash) { OLAP_LOG_WARNING("fail to check latest version hash. " "[tablet_id=%ld version_hash=%ld request_version_hash=%ld]", - tablet_id, message->version_hash(), version_hash); - return Status("fail to check version hash"); + tablet_id, delta->version_hash(), version_hash); + + std::stringstream ss; + ss << "fail to check version hash of tablet: " << tablet_id; + return Status(ss.str()); } } } @@ -134,7 +142,7 @@ Status OlapScanner::_init_params( RETURN_IF_ERROR(_init_return_columns()); _params.olap_table = _olap_table; - _params.reader_type = READER_FETCH; + _params.reader_type = READER_QUERY; _params.aggregation = _aggregation; _params.version = Version(0, _version); diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h index d57d9220f3..0a052900a8 100644 --- a/be/src/exec/olap_scanner.h +++ b/be/src/exec/olap_scanner.h @@ -113,7 +113,7 @@ private: ReaderParams _params; std::unique_ptr _reader; - SmartOLAPTable _olap_table; + OLAPTablePtr _olap_table; int64_t _version; std::vector _return_columns; diff --git a/be/src/exec/partitioned_aggregation_node.cc b/be/src/exec/partitioned_aggregation_node.cc index b04dabd90c..f20a3d42ec 100644 --- a/be/src/exec/partitioned_aggregation_node.cc +++ b/be/src/exec/partitioned_aggregation_node.cc @@ -139,7 +139,7 @@ Status PartitionedAggregationNode::prepare(RuntimeState* state) { for (int i = 0; i < _probe_expr_ctxs.size(); ++i) { SlotDescriptor* desc = _intermediate_tuple_desc->slots()[i]; DCHECK(desc->type().type == TYPE_NULL || - desc->type() == _probe_expr_ctxs[i]->root()->type()); + desc->type().type == _probe_expr_ctxs[i]->root()->type().type); // Hack to avoid TYPE_NULL SlotRefs. Expr* expr = desc->type().type != TYPE_NULL ? new SlotRef(desc) : new SlotRef(desc, TYPE_BOOLEAN); diff --git a/be/src/exec/schema_scan_node.cpp b/be/src/exec/schema_scan_node.cpp index c5bd743c6a..59713eb6a5 100644 --- a/be/src/exec/schema_scan_node.cpp +++ b/be/src/exec/schema_scan_node.cpp @@ -171,7 +171,7 @@ Status SchemaScanNode::prepare(RuntimeState* state) { return Status("no match column for this column."); } - if (_src_tuple_desc->slots()[j]->type() != _dest_tuple_desc->slots()[i]->type()) { + if (_src_tuple_desc->slots()[j]->type().type != _dest_tuple_desc->slots()[i]->type().type) { LOG(WARNING) << "schema not match. 
input is " << _src_tuple_desc->slots()[j]->type() << " and output is " << _dest_tuple_desc->slots()[i]->type(); return Status("schema not match."); @@ -314,6 +314,7 @@ Status SchemaScanNode::close(RuntimeState* state) { COUNTER_UPDATE(memory_used_counter(), _tuple_pool->peak_allocated_bytes()); } + _tuple_pool.reset(); return ExecNode::close(state); } diff --git a/be/src/exec/schema_scanner/frontend_helper.cpp b/be/src/exec/schema_scanner/frontend_helper.cpp index 08ccf88c34..f63042ba53 100644 --- a/be/src/exec/schema_scanner/frontend_helper.cpp +++ b/be/src/exec/schema_scanner/frontend_helper.cpp @@ -58,159 +58,91 @@ void FrontendHelper::setup(ExecEnv* exec_env) { Status FrontendHelper::get_db_names( const std::string& ip, const int32_t port, - const TGetDbsParams &db_params, - TGetDbsResult *db_result) { - Status status; - TNetworkAddress address = make_network_address(ip, port); - try { - // 500ms is enough - FrontendServiceConnection client( - _s_exec_env->frontend_client_cache(), - address, - 500, - &status); - if (!status.ok()) { - return status; - } - - try { - client->getDbNames(*db_result, db_params); - } catch (apache::thrift::transport::TTransportException& e) { - RETURN_IF_ERROR(client.reopen()); - client->getDbNames(*db_result, db_params); - } - } catch (apache::thrift::TException& e) { - std::stringstream ss; - ss << "getDbNames from " << address << " failed:" << e.what(); - return Status(TStatusCode::THRIFT_RPC_ERROR, ss.str(), false); - } - return Status::OK; + const TGetDbsParams &request, + TGetDbsResult *result) { + return rpc(ip, port, + [&request, &result] (FrontendServiceConnection& client) { + client->getDbNames(*result, request); + }); } Status FrontendHelper::get_table_names( const std::string& ip, const int32_t port, - const TGetTablesParams &table_params, - TGetTablesResult *table_result) { - Status status; - TNetworkAddress address = make_network_address(ip, port); - try { - // 500ms is enough - FrontendServiceConnection client( - _s_exec_env->frontend_client_cache(), - address, - 500, - &status); - if (!status.ok()) { - return status; - } - - try { - client->getTableNames(*table_result, table_params); - } catch (apache::thrift::transport::TTransportException& e) { - RETURN_IF_ERROR(client.reopen()); - client->getTableNames(*table_result, table_params); - } - } catch (apache::thrift::TException& e) { - std::stringstream ss; - ss << "getTableNames from " << address << " failed:" << e.what(); - return Status(TStatusCode::THRIFT_RPC_ERROR, ss.str(), false); - } - return Status::OK; + const TGetTablesParams &request, + TGetTablesResult *result) { + return rpc(ip, port, + [&request, &result] (FrontendServiceConnection& client) { + client->getTableNames(*result, request); + }); } Status FrontendHelper::list_table_status( const std::string& ip, const int32_t port, - const TGetTablesParams &table_params, - TListTableStatusResult *table_result) { - Status status; - TNetworkAddress address = make_network_address(ip, port); - try { - // 500ms is enough - FrontendServiceConnection client( - _s_exec_env->frontend_client_cache(), - address, - 500, - &status); - if (!status.ok()) { - return status; - } - - try { - client->listTableStatus(*table_result, table_params); - } catch (apache::thrift::transport::TTransportException& e) { - RETURN_IF_ERROR(client.reopen()); - client->listTableStatus(*table_result, table_params); - } - } catch (apache::thrift::TException& e) { - std::stringstream ss; - ss << "getTableNames from " << address << " failed:" << e.what(); - return 
Status(TStatusCode::THRIFT_RPC_ERROR, ss.str(), false); - } - return Status::OK; + const TGetTablesParams &request, + TListTableStatusResult *result) { + return rpc(ip, port, + [&request, &result] (FrontendServiceConnection& client) { + client->listTableStatus(*result, request); + }); } Status FrontendHelper::describe_table( const std::string& ip, const int32_t port, - const TDescribeTableParams &desc_params, - TDescribeTableResult *desc_result) { - Status status; - TNetworkAddress address = make_network_address(ip, port); - try { - // 500ms is enough - FrontendServiceConnection client( - _s_exec_env->frontend_client_cache(), - address, - 500, - &status); - if (!status.ok()) { - return status; - } - - try { - client->describeTable(*desc_result, desc_params); - } catch (apache::thrift::transport::TTransportException& e) { - RETURN_IF_ERROR(client.reopen()); - client->describeTable(*desc_result, desc_params); - } - } catch (apache::thrift::TException& e) { - std::stringstream ss; - ss << "describeTable from " << address << " failed:" << e.what(); - return Status(TStatusCode::THRIFT_RPC_ERROR, ss.str(), false); - } - return Status::OK; + const TDescribeTableParams &request, + TDescribeTableResult *result) { + return rpc(ip, port, + [&request, &result] (FrontendServiceConnection& client) { + client->describeTable(*result, request); + }); } Status FrontendHelper::show_varialbes( const std::string& ip, const int32_t port, - const TShowVariableRequest &var_params, - TShowVariableResult *var_result) { - Status status; + const TShowVariableRequest &request, + TShowVariableResult *result) { + return rpc(ip, port, + [&request, &result] (FrontendServiceConnection& client) { + client->showVariables(*result, request); + }); +} + +Status FrontendHelper::rpc( + const std::string& ip, + const int32_t port, + std::function callback, + int timeout_ms) { TNetworkAddress address = make_network_address(ip, port); try { - // 500ms is enough + Status status; FrontendServiceConnection client( - _s_exec_env->frontend_client_cache(), - address, - 500, - &status); + _s_exec_env->frontend_client_cache(), address, timeout_ms, &status); if (!status.ok()) { + LOG(WARNING) << "Connect frontend failed, address=" << address + << ", status=" << status.get_error_msg(); return status; } - try { - client->showVariables(*var_result, var_params); + callback(client); } catch (apache::thrift::transport::TTransportException& e) { - RETURN_IF_ERROR(client.reopen()); - client->showVariables(*var_result, var_params); + LOG(WARNING) << "retrying call frontend service, address=" + << address << ", reason=" << e.what(); + status = client.reopen(timeout_ms); + if (!status.ok()) { + LOG(WARNING) << "client reopen failed. 
address=" << address + << ", status=" << status.get_error_msg(); + return status; + } + callback(client); } } catch (apache::thrift::TException& e) { - std::stringstream ss; - ss << "showVariables from " << address << " failed:" << e.what(); - return Status(TStatusCode::THRIFT_RPC_ERROR, ss.str(), false); + LOG(WARNING) << "call frontend service failed, address=" << address + << ", reason=" << e.what(); + return Status(TStatusCode::THRIFT_RPC_ERROR, + "failed to call frontend service", false); } return Status::OK; } diff --git a/be/src/exec/schema_scanner/frontend_helper.h b/be/src/exec/schema_scanner/frontend_helper.h index b0ed1d9715..06ba362b51 100644 --- a/be/src/exec/schema_scanner/frontend_helper.h +++ b/be/src/exec/schema_scanner/frontend_helper.h @@ -58,6 +58,12 @@ public: TShowVariableResult *var_result); static std::string extract_db_name(const std::string& full_name); + + static Status rpc( + const std::string& ip, + const int32_t port, + std::function callback, + int timeout_ms = 5000); private: static ExecEnv* _s_exec_env; }; diff --git a/be/src/exprs/CMakeLists.txt b/be/src/exprs/CMakeLists.txt index 2670e332d5..a92250ed36 100644 --- a/be/src/exprs/CMakeLists.txt +++ b/be/src/exprs/CMakeLists.txt @@ -39,6 +39,7 @@ add_library(Exprs conditional_functions.cpp conditional_functions_ir.cpp decimal_operators.cpp + es_functions.cpp literal.cpp expr.cpp expr_ir.cpp diff --git a/be/src/exprs/aggregate_functions.cpp b/be/src/exprs/aggregate_functions.cpp index 9694af2025..6d30fd91a0 100644 --- a/be/src/exprs/aggregate_functions.cpp +++ b/be/src/exprs/aggregate_functions.cpp @@ -958,8 +958,8 @@ void AggregateFunctions::hll_union_parse_and_cal(HllSetResolver& resolver, Strin return; } if (resolver.get_hll_data_type() == HLL_DATA_EXPLICIT) { - for (int i = 0; i < resolver.get_expliclit_count(); i++) { - uint64_t hash_value = resolver.get_expliclit_value(i); + for (int i = 0; i < resolver.get_explicit_count(); i++) { + uint64_t hash_value = resolver.get_explicit_value(i); int idx = hash_value % dst->len; uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_PRECISION) + 1; dst->ptr[idx] = std::max(dst->ptr[idx], first_one_bit); diff --git a/be/src/exprs/anyval_util.cpp b/be/src/exprs/anyval_util.cpp index 6efd6ac846..a065ba6036 100755 --- a/be/src/exprs/anyval_util.cpp +++ b/be/src/exprs/anyval_util.cpp @@ -81,6 +81,7 @@ AnyVal* create_any_val(ObjectPool* pool, const TypeDescriptor& type) { case TYPE_DOUBLE: return pool->add(new DoubleVal); + case TYPE_CHAR: case TYPE_HLL: case TYPE_VARCHAR: return pool->add(new StringVal); @@ -93,7 +94,7 @@ AnyVal* create_any_val(ObjectPool* pool, const TypeDescriptor& type) { case TYPE_DATETIME: return pool->add(new DateTimeVal); -default: + default: DCHECK(false) << "Unsupported type: " << type.type; return NULL; } diff --git a/be/src/exprs/encryption_functions.cpp b/be/src/exprs/encryption_functions.cpp index 67d75e5dce..4875ceb05d 100644 --- a/be/src/exprs/encryption_functions.cpp +++ b/be/src/exprs/encryption_functions.cpp @@ -21,6 +21,7 @@ #include "exprs/encryption_functions.h" #include +#include "aes/my_aes.h" #include "exprs/anyval_util.h" #include "exprs/expr.h" #include "util/debug_util.h" @@ -33,6 +34,43 @@ namespace palo { void EncryptionFunctions::init() { } +StringVal EncryptionFunctions::aes_encrypt(FunctionContext* ctx, + const StringVal &src, const StringVal &key) { + if (src.len == 0) { + return StringVal::null(); + } + + // cipher_len = (clearLen/16 + 1) * 16; + int cipher_len = src.len + 16; + boost::scoped_array p; + p.reset(new 
char[cipher_len]); + + int ret_code = my_aes_encrypt((unsigned char *)src.ptr, src.len, + (unsigned char*)p.get(), (unsigned char *)key.ptr, key.len, my_aes_128_ecb, NULL); + if (ret_code < 0) { + return StringVal::null(); + } + return AnyValUtil::from_buffer_temp(ctx, p.get(), ret_code); +} + +StringVal EncryptionFunctions::aes_decrypt(FunctionContext* ctx, + const StringVal &src, const StringVal &key) { + if (src.len == 0) { + return StringVal::null(); + } + + int cipher_len = src.len; + boost::scoped_array p; + p.reset(new char[cipher_len]); + + int ret_code = my_aes_decrypt((unsigned char *)src.ptr, src.len, (unsigned char*)p.get(), + (unsigned char *)key.ptr, key.len, my_aes_128_ecb, NULL); + if (ret_code < 0) { + return StringVal::null(); + } + return AnyValUtil::from_buffer_temp(ctx, p.get(), ret_code); +} + StringVal EncryptionFunctions::from_base64(FunctionContext* ctx, const StringVal &src) { if (src.len == 0) { return StringVal::null(); diff --git a/be/src/exprs/encryption_functions.h b/be/src/exprs/encryption_functions.h index bb57e7b32c..98c86ea962 100644 --- a/be/src/exprs/encryption_functions.h +++ b/be/src/exprs/encryption_functions.h @@ -34,6 +34,10 @@ class TupleRow; class EncryptionFunctions { public: static void init(); + static palo_udf::StringVal aes_encrypt(palo_udf::FunctionContext* context, + const palo_udf::StringVal& val1, const palo_udf::StringVal& val2); + static palo_udf::StringVal aes_decrypt(palo_udf::FunctionContext* context, + const palo_udf::StringVal& val1, const palo_udf::StringVal& val2); static palo_udf::StringVal from_base64(palo_udf::FunctionContext* context, const palo_udf::StringVal& val1); static palo_udf::StringVal to_base64(palo_udf::FunctionContext* context, diff --git a/be/src/exprs/in_predicate.cpp b/be/src/exprs/in_predicate.cpp index 656a9d7eca..d8c5a3ced3 100644 --- a/be/src/exprs/in_predicate.cpp +++ b/be/src/exprs/in_predicate.cpp @@ -62,8 +62,14 @@ Status InPredicate::open( Expr::open(state, context, scope); for (int i = 1; i < _children.size(); ++i) { - if (_children[i]->type() != _children[0]->type()) { - return Status("InPredicate type not same"); + if (_children[0]->type().is_string_type()) { + if (!_children[i]->type().is_string_type()) { + return Status("InPredicate type not same"); + } + } else { + if (_children[i]->type().type != _children[0]->type().type) { + return Status("InPredicate type not same"); + } } void* value = context->get_value(_children[i], NULL); diff --git a/be/src/exprs/timestamp_functions.cpp b/be/src/exprs/timestamp_functions.cpp index aaefc3e63a..cbbd851c40 100644 --- a/be/src/exprs/timestamp_functions.cpp +++ b/be/src/exprs/timestamp_functions.cpp @@ -139,7 +139,7 @@ bool TimestampFunctions::check_format(const StringVal& format, DateTimeValue& t) void TimestampFunctions::report_bad_format(const StringVal* format) { std::string format_str((char *)format->ptr, format->len); // LOG(WARNING) << "Bad date/time conversion format: " << format_str - // << " Format must be: 'yyyy-MM-dd[ HH:mm:ss]'"; + // << " Format must be: 'yyyy-MM-dd[ HH:mm:ss]'"; } IntVal TimestampFunctions::year( diff --git a/be/src/gen_cpp/CMakeLists.txt b/be/src/gen_cpp/CMakeLists.txt index b2352c838a..9c95b07513 100644 --- a/be/src/gen_cpp/CMakeLists.txt +++ b/be/src/gen_cpp/CMakeLists.txt @@ -72,6 +72,7 @@ set(SRC_FILES ${GEN_CPP_DIR}/olap_file.pb.cc ${GEN_CPP_DIR}/column_data_file.pb.cc ${GEN_CPP_DIR}/data.pb.cc + ${GEN_CPP_DIR}/descriptors.pb.cc ${GEN_CPP_DIR}/internal_service.pb.cc ${GEN_CPP_DIR}/types.pb.cc ${GEN_CPP_DIR}/status.pb.cc 
diff --git a/be/src/http/CMakeLists.txt b/be/src/http/CMakeLists.txt index b0396ef63e..76ff9b5ff7 100644 --- a/be/src/http/CMakeLists.txt +++ b/be/src/http/CMakeLists.txt @@ -31,18 +31,22 @@ add_library(Webserver STATIC http_channel.cpp http_status.cpp http_parser.cpp + message_body_sink.cpp web_page_handler.cpp monitor_action.cpp default_path_handlers.cpp + utils.cpp ev_http_server.cpp - action/compaction_action.cpp action/mini_load.cpp action/health_action.cpp action/checksum_action.cpp action/snapshot_action.cpp action/reload_tablet_action.cpp + action/restore_tablet_action.cpp action/pprof_actions.cpp action/metrics_action.cpp + action/stream_load.cpp + action/meta_action.cpp # action/multi_start.cpp # action/multi_show.cpp # action/multi_commit.cpp diff --git a/be/src/http/action/checksum_action.cpp b/be/src/http/action/checksum_action.cpp index f921f86ff7..b722b74e72 100644 --- a/be/src/http/action/checksum_action.cpp +++ b/be/src/http/action/checksum_action.cpp @@ -20,12 +20,16 @@ #include "boost/lexical_cast.hpp" +#include "common/logging.h" #include "agent/cgroups_mgr.h" #include "http/http_channel.h" #include "http/http_headers.h" #include "http/http_request.h" #include "http/http_response.h" #include "http/http_status.h" +#include "olap/olap_define.h" +#include "olap/olap_engine.h" +#include "runtime/exec_env.h" namespace palo { @@ -38,7 +42,6 @@ const std::string SCHEMA_HASH = "schema_hash"; ChecksumAction::ChecksumAction(ExecEnv* exec_env) : _exec_env(exec_env) { - _command_executor = new CommandExecutor(); } void ChecksumAction::handle(HttpRequest *req) { @@ -122,7 +125,7 @@ int64_t ChecksumAction::do_checksum(int64_t tablet_id, int64_t version, int64_t OLAPStatus res = OLAPStatus::OLAP_SUCCESS; uint32_t checksum; - res = _command_executor->compute_checksum( + res = _exec_env->olap_engine()->compute_checksum( tablet_id, schema_hash, version, version_hash, &checksum); if (res != OLAPStatus::OLAP_SUCCESS) { LOG(WARNING) << "checksum failed. status: " << res @@ -136,10 +139,4 @@ int64_t ChecksumAction::do_checksum(int64_t tablet_id, int64_t version, int64_t return static_cast(checksum); } -ChecksumAction::~ChecksumAction() { - if (_command_executor != NULL) { - delete _command_executor; - } -} - } // end namespace palo diff --git a/be/src/http/action/checksum_action.h b/be/src/http/action/checksum_action.h index ed74b418d2..a2609f765c 100644 --- a/be/src/http/action/checksum_action.h +++ b/be/src/http/action/checksum_action.h @@ -19,7 +19,6 @@ #include #include "http/http_handler.h" -#include "olap/command_executor.h" namespace palo { @@ -29,7 +28,7 @@ class ChecksumAction : public HttpHandler { public: explicit ChecksumAction(ExecEnv* exec_env); - virtual ~ChecksumAction(); + virtual ~ChecksumAction() { } void handle(HttpRequest *req) override; private: @@ -37,7 +36,6 @@ private: int32_t schema_hash, HttpRequest *req); ExecEnv* _exec_env; - CommandExecutor* _command_executor; }; // end class ChecksumAction diff --git a/be/src/http/action/compaction_action.cpp b/be/src/http/action/compaction_action.cpp deleted file mode 100644 index 2fa12d8a1a..0000000000 --- a/be/src/http/action/compaction_action.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2018, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "http/action/compaction_action.h" - -#include -#include - -#include -#include "boost/lexical_cast.hpp" - -#include "agent/cgroups_mgr.h" -#include "http/http_channel.h" -#include "http/http_headers.h" -#include "http/http_request.h" -#include "http/http_response.h" -#include "http/http_status.h" -#include "olap/base_compaction.h" -#include "olap/cumulative_compaction.h" -#include "olap/olap_engine.h" - -namespace palo { - -//example: -// http://host:port/api/compaction?tablet_id=10001&schema_hash=10001&compaction_type=base_compaction -// http://host:port/api/compaction?tablet_id=10001&schema_hash=10001&compaction_type=cumulative_compaction -const std::string TABLET_ID = "tablet_id"; -const std::string SCHEMA_HASH = "schema_hash"; -const std::string COMPACTION_TYPE = "compaction_type"; - -void CompactionAction::handle(HttpRequest *req) { - LOG(INFO) << "accept one request " << req->debug_string(); - - // add tid to cgroup in order to limit read bandwidth - CgroupsMgr::apply_system_cgroup(); - // Get tablet id - const std::string tablet_id_str = req->param(TABLET_ID); - if (tablet_id_str.empty()) { - std::string error_msg = std::string( - "parameter " + TABLET_ID + " not specified in url."); - - HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, error_msg); - return; - } - - // Get schema hash - const std::string schema_hash_str = req->param(SCHEMA_HASH); - if (schema_hash_str.empty()) { - std::string error_msg = std::string( - "parameter " + SCHEMA_HASH + " not specified in url."); - HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, error_msg); - return; - } - - // valid str format - int64_t tablet_id; - int32_t schema_hash; - try { - tablet_id = boost::lexical_cast(tablet_id_str); - schema_hash = boost::lexical_cast(schema_hash_str); - } catch (boost::bad_lexical_cast& e) { - std::string error_msg = std::string("param format is invalid: ") + std::string(e.what()); - HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, error_msg); - return; - } - - // Get compaction type - const std::string& compaction_type = req->param(COMPACTION_TYPE); - if (!boost::iequals(compaction_type, "base_compaction") - && !boost::iequals(compaction_type, "cumulative_compaction")) { - std::string error_msg = std::string( - "parameter " + COMPACTION_TYPE + " not specified in url."); - HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, error_msg); - return; - } - - TableInfo tablet_info(tablet_id, schema_hash); - if (boost::iequals(compaction_type, "base_compaction")) { - OLAPEngine::get_instance()->add_tablet_to_base_compaction_queue(tablet_info); - } else { - OLAPEngine::get_instance()->add_tablet_to_cumulative_compaction_queue(tablet_info); - } - HttpChannel::send_reply(req, "succeed add compaction to queue"); -} - -} // end namespace palo diff --git a/be/src/http/action/compaction_action.h b/be/src/http/action/compaction_action.h deleted file mode 100644 index 062271fcf5..0000000000 --- a/be/src/http/action/compaction_action.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018, Baidu.com, Inc. 
All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef BDG_PALO_BE_SRC_HTTP_COMPACTION_ACTION_H -#define BDG_PALO_BE_SRC_HTTP_COMPACTION_ACTION_H - -#include - -#include "http/http_handler.h" - -namespace palo { - -class CompactionAction : public HttpHandler { -public: - void handle(HttpRequest *req) override; -}; // end class ChecksumAction - -} // end namespace palo - -#endif // BDG_PALO_BE_SRC_HTTP_COMPACTION_ACTION_H diff --git a/be/src/http/action/mini_load.cpp b/be/src/http/action/mini_load.cpp index a309011745..0786700600 100644 --- a/be/src/http/action/mini_load.cpp +++ b/be/src/http/action/mini_load.cpp @@ -90,6 +90,7 @@ const std::string LABEL_KEY = "label"; const std::string SUB_LABEL_KEY = "sub_label"; const std::string FILE_PATH_KEY = "file_path"; const char* k_100_continue = "100-continue"; +const int64_t THRIFT_RPC_TIMEOUT_MS = 3000; // 3 sec MiniLoadAction::MiniLoadAction(ExecEnv* exec_env) : _exec_env(exec_env) { @@ -100,7 +101,7 @@ static bool is_name_valid(const std::string& name) { } static Status check_request(HttpRequest* req) { - std::map& params = *req->params(); + auto& params = *req->params(); // check params if (!is_name_valid(params[DB_KEY])) { @@ -151,7 +152,7 @@ Status MiniLoadAction::_load( const TNetworkAddress& master_address = _exec_env->master_info()->network_address; Status status; FrontendServiceConnection client( - _exec_env->frontend_client_cache(), master_address, 500, &status); + _exec_env->frontend_client_cache(), master_address, THRIFT_RPC_TIMEOUT_MS, &status); if (!status.ok()) { std::stringstream ss; ss << "Connect master failed, with address(" @@ -189,12 +190,32 @@ Status MiniLoadAction::_load( LOG(WARNING) << "Retrying mini load from master(" << master_address.hostname << ":" << master_address.port << ") because: " << e.what(); - status = client.reopen(500); + status = client.reopen(THRIFT_RPC_TIMEOUT_MS); if (!status.ok()) { LOG(WARNING) << "Client repoen failed. with address(" << master_address.hostname << ":" << master_address.port << ")"; return status; } + // we may get timeout exception and the load job may already be summitted. + // set this request as 'retry', and Frontend will return success if job has been + // summitted. + req.__set_is_retry(true); + client->miniLoad(res, req); + } catch (apache::thrift::TApplicationException& e) { + LOG(WARNING) << "mini load request from master(" + << master_address.hostname << ":" << master_address.port + << ") got unknown result: " << e.what(); + + status = client.reopen(THRIFT_RPC_TIMEOUT_MS); + if (!status.ok()) { + LOG(WARNING) << "Client repoen failed. with address(" + << master_address.hostname << ":" << master_address.port << ")"; + return status; + } + // we may get timeout exception and the load job may already be summitted. + // set this request as 'retry', and Frontend will return success if job has been + // summitted. 
+ req.__set_is_retry(true); client->miniLoad(res, req); } } catch (apache::thrift::TException& e) { @@ -210,7 +231,7 @@ Status MiniLoadAction::_load( return Status(res.status); } -static bool parse_auth(const std::string& auth, std::string* user, +static bool parse_auth(const std::string& auth, std::string* user, std::string* passwd, std::string* cluster) { std::string decoded_auth; @@ -238,7 +259,7 @@ Status MiniLoadAction::check_auth( const TNetworkAddress& master_address = _exec_env->master_info()->network_address; Status status; FrontendServiceConnection client( - _exec_env->frontend_client_cache(), master_address, 500, &status); + _exec_env->frontend_client_cache(), master_address, THRIFT_RPC_TIMEOUT_MS, &status); if (!status.ok()) { std::stringstream ss; ss << "Connect master failed, with address(" @@ -255,7 +276,19 @@ Status MiniLoadAction::check_auth( LOG(WARNING) << "Retrying mini load from master(" << master_address.hostname << ":" << master_address.port << ") because: " << e.what(); - status = client.reopen(500); + status = client.reopen(THRIFT_RPC_TIMEOUT_MS); + if (!status.ok()) { + LOG(WARNING) << "Client repoen failed. with address(" + << master_address.hostname << ":" << master_address.port << ")"; + return status; + } + client->loadCheck(res, check_load_req); + } catch (apache::thrift::TApplicationException& e) { + LOG(WARNING) << "load check request from master(" + << master_address.hostname << ":" << master_address.port + << ") got unknown result: " << e.what(); + + status = client.reopen(THRIFT_RPC_TIMEOUT_MS); if (!status.ok()) { LOG(WARNING) << "Client repoen failed. with address(" << master_address.hostname << ":" << master_address.port << ")"; @@ -409,15 +442,19 @@ void MiniLoadAction::handle(HttpRequest *http_req) { } auto st = _load( http_req, ctx->file_path, ctx->load_check_req.user, ctx->load_check_req.cluster); + + std::string status_str = "Success"; + std::string msg = "OK"; if (!st.ok()) { - HttpChannel::send_reply(http_req, HttpStatus::INTERNAL_SERVER_ERROR, st.get_error_msg()); - return; + // we do not send 500 reply to client, send 200 with error msg + status_str = "FAILED"; + msg = st.get_error_msg(); } std::stringstream ss; ss << "{\n"; - ss << "\t\"status\": \"Success\",\n"; - ss << "\t\"msg\": \"OK\"\n"; + ss << "\t\"status\": \"" << status_str << "\",\n"; + ss << "\t\"msg\": \"" << msg << "\"\n"; ss << "}\n"; std::string str = ss.str(); HttpChannel::send_reply(http_req, str); diff --git a/be/src/http/action/reload_tablet_action.cpp b/be/src/http/action/reload_tablet_action.cpp index 54d9bb07b9..e9b42ef343 100644 --- a/be/src/http/action/reload_tablet_action.cpp +++ b/be/src/http/action/reload_tablet_action.cpp @@ -20,12 +20,16 @@ #include "boost/lexical_cast.hpp" +#include "common/logging.h" #include "agent/cgroups_mgr.h" #include "http/http_channel.h" #include "http/http_headers.h" #include "http/http_request.h" #include "http/http_response.h" #include "http/http_status.h" +#include "olap/olap_define.h" +#include "olap/olap_engine.h" +#include "runtime/exec_env.h" namespace palo { @@ -35,7 +39,6 @@ const std::string SCHEMA_HASH = "schema_hash"; ReloadTabletAction::ReloadTabletAction(ExecEnv* exec_env) : _exec_env(exec_env) { - _command_executor = new CommandExecutor(); } void ReloadTabletAction::handle(HttpRequest *req) { @@ -98,7 +101,7 @@ void ReloadTabletAction::reload( clone_req.__set_schema_hash(schema_hash); OLAPStatus res = OLAPStatus::OLAP_SUCCESS; - res = _command_executor->load_header(path, clone_req); + res = 
_exec_env->olap_engine()->load_header(path, clone_req); if (res != OLAPStatus::OLAP_SUCCESS) { LOG(WARNING) << "load header failed. status: " << res << ", signature: " << tablet_id; diff --git a/be/src/http/action/reload_tablet_action.h b/be/src/http/action/reload_tablet_action.h index b5bb1f156b..658c73801a 100644 --- a/be/src/http/action/reload_tablet_action.h +++ b/be/src/http/action/reload_tablet_action.h @@ -19,7 +19,6 @@ #include #include "http/http_handler.h" -#include "olap/command_executor.h" #include "gen_cpp/AgentService_types.h" namespace palo { @@ -30,11 +29,7 @@ class ReloadTabletAction : public HttpHandler { public: ReloadTabletAction(ExecEnv* exec_env); - virtual ~ReloadTabletAction() { - if (_command_executor != NULL) { - delete _command_executor; - } - } + virtual ~ReloadTabletAction() { } void handle(HttpRequest *req) override; private: @@ -42,7 +37,6 @@ private: HttpRequest *req); ExecEnv* _exec_env; - CommandExecutor* _command_executor; }; // end class ReloadTabletAction diff --git a/be/src/http/action/snapshot_action.cpp b/be/src/http/action/snapshot_action.cpp index e842f7e1d2..269b2f1432 100644 --- a/be/src/http/action/snapshot_action.cpp +++ b/be/src/http/action/snapshot_action.cpp @@ -18,14 +18,19 @@ #include #include -#include "boost/lexical_cast.hpp" +#include #include "agent/cgroups_mgr.h" +#include "common/logging.h" +#include "gen_cpp/AgentService_types.h" #include "http/http_channel.h" #include "http/http_headers.h" #include "http/http_request.h" #include "http/http_response.h" #include "http/http_status.h" +#include "runtime/exec_env.h" +#include "olap/olap_define.h" +#include "olap/olap_engine.h" namespace palo { @@ -34,7 +39,6 @@ const std::string SCHEMA_HASH = "schema_hash"; SnapshotAction::SnapshotAction(ExecEnv* exec_env) : _exec_env(exec_env) { - _command_executor = new CommandExecutor(); } void SnapshotAction::handle(HttpRequest *req) { @@ -93,9 +97,12 @@ void SnapshotAction::handle(HttpRequest *req) { int64_t SnapshotAction::make_snapshot(int64_t tablet_id, int32_t schema_hash, std::string* snapshot_path) { + TSnapshotRequest request; + request.tablet_id = tablet_id; + request.schema_hash = schema_hash; OLAPStatus res = OLAPStatus::OLAP_SUCCESS; - res = _command_executor->make_snapshot(tablet_id, schema_hash, snapshot_path); + res = _exec_env->olap_engine()->make_snapshot(request, snapshot_path); if (res != OLAPStatus::OLAP_SUCCESS) { LOG(WARNING) << "make snapshot failed. 
status: " << res << ", signature: " << tablet_id; @@ -108,10 +115,4 @@ int64_t SnapshotAction::make_snapshot(int64_t tablet_id, int32_t schema_hash, return 0L; } -SnapshotAction::~SnapshotAction() { - if (_command_executor != NULL) { - delete _command_executor; - } -} - } // end namespace palo diff --git a/be/src/http/action/snapshot_action.h b/be/src/http/action/snapshot_action.h index 915276ff55..4daaa9184c 100644 --- a/be/src/http/action/snapshot_action.h +++ b/be/src/http/action/snapshot_action.h @@ -19,7 +19,6 @@ #include #include "http/http_handler.h" -#include "olap/command_executor.h" namespace palo { @@ -31,14 +30,13 @@ class SnapshotAction : public HttpHandler { public: explicit SnapshotAction(ExecEnv* exec_env); - virtual ~SnapshotAction(); + virtual ~SnapshotAction() { } void handle(HttpRequest *req) override; private: int64_t make_snapshot(int64_t tablet_id, int schema_hash, std::string* snapshot_path); ExecEnv* _exec_env; - CommandExecutor* _command_executor; }; // end class SnapshotAction diff --git a/be/src/http/ev_http_server.cpp b/be/src/http/ev_http_server.cpp index 0c4bc3368f..643cce84ea 100644 --- a/be/src/http/ev_http_server.cpp +++ b/be/src/http/ev_http_server.cpp @@ -90,7 +90,7 @@ Status EvHttpServer::start() { RETURN_IF_ERROR(_bind()); for (int i = 0; i < _num_workers; ++i) { auto worker = [this, i] () { - LOG(INFO) << "EvHttpSerer worker start, id=" << i; + LOG(INFO) << "EvHttpServer worker start, id=" << i; std::shared_ptr base( event_base_new(), [] (event_base* base) { event_base_free(base); }); if (base == nullptr) { @@ -116,6 +116,7 @@ Status EvHttpServer::start() { event_base_dispatch(base.get()); }; _workers.emplace_back(worker); + _workers[i].detach(); } return Status::OK; } diff --git a/be/src/http/http_channel.h b/be/src/http/http_channel.h index ae2587ba54..65f757713f 100644 --- a/be/src/http/http_channel.h +++ b/be/src/http/http_channel.h @@ -26,7 +26,6 @@ struct mg_connection; namespace palo { class HttpRequest; -class HttpResponse; class HttpChannel { public: diff --git a/be/src/http/http_headers.cpp b/be/src/http/http_headers.cpp index 7b369d35c8..6e2c9bb776 100644 --- a/be/src/http/http_headers.cpp +++ b/be/src/http/http_headers.cpp @@ -91,4 +91,6 @@ const char* HttpHeaders::WEBSOCKET_ORIGIN = "WebSocket-Origin"; const char* HttpHeaders::WEBSOCKET_PROTOCOL = "WebSocket-Protocol"; const char* HttpHeaders::WWW_AUTHENTICATE = "WWW-Authenticate"; +const std::string HttpHeaders::JsonType = "application/json"; + } diff --git a/be/src/http/http_headers.h b/be/src/http/http_headers.h index acee169492..af4bfdfa83 100644 --- a/be/src/http/http_headers.h +++ b/be/src/http/http_headers.h @@ -16,6 +16,8 @@ #ifndef BDG_PALO_BE_SRC_COMMON_UTIL_HTTP_HEADERS_H #define BDG_PALO_BE_SRC_COMMON_UTIL_HTTP_HEADERS_H +#include + namespace palo { class HttpHeaders { @@ -93,6 +95,8 @@ public: static const char* WEBSOCKET_ORIGIN; static const char* WEBSOCKET_PROTOCOL; static const char* WWW_AUTHENTICATE; + + static const std::string JsonType; }; } diff --git a/be/src/http/http_parser.h b/be/src/http/http_parser.h index ca9bc635f5..9eb2db406f 100644 --- a/be/src/http/http_parser.h +++ b/be/src/http/http_parser.h @@ -23,8 +23,8 @@ namespace palo { struct HttpChunkParseCtx { int state; // Parse state - int64_t size; // Chunk size - int64_t length; // minimal length need to read + size_t size; // Chunk size + size_t length; // minimal length need to read HttpChunkParseCtx() : state(0), size(0), length(0) { } }; diff --git a/be/src/http/http_request.h b/be/src/http/http_request.h 
index ccd196409e..1260e1fa2f 100644 --- a/be/src/http/http_request.h +++ b/be/src/http/http_request.h @@ -19,7 +19,12 @@ #include #include +#include + +#include "http/http_common.h" +#include "http/http_headers.h" #include "http/http_method.h" +#include "util/string_util.h" struct mg_connection; struct evhttp_request; @@ -30,6 +35,9 @@ class HttpHandler; class HttpRequest { public: + // Only used for unit test + HttpRequest() { } + HttpRequest(evhttp_request* ev_req); ~HttpRequest(); @@ -55,7 +63,7 @@ public: const std::string& param(const std::string& key) const; // return params - const std::map& headers() { + const StringCaseUnorderedMap& headers() { return _headers; } @@ -88,7 +96,8 @@ private: HttpMethod _method; std::string _uri; std::string _raw_path; - std::map _headers; + + StringCaseUnorderedMap _headers; std::map _params; std::map _query_params; diff --git a/be/src/http/web_page_handler.cpp b/be/src/http/web_page_handler.cpp index 45eda3f25d..3d50bf0b56 100644 --- a/be/src/http/web_page_handler.cpp +++ b/be/src/http/web_page_handler.cpp @@ -58,7 +58,7 @@ void WebPageHandler::register_page( void WebPageHandler::handle(HttpRequest *req) { // Should we render with css styles? bool use_style = true; - std::map& params = *req->params(); + auto& params = *req->params(); if (params.find("raw") != params.end()) { use_style = false; } diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt index 3543d1dcaa..7b4a82510b 100644 --- a/be/src/olap/CMakeLists.txt +++ b/be/src/olap/CMakeLists.txt @@ -20,40 +20,45 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/olap") set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/olap") add_library(Olap STATIC - comparison_predicate.cpp - in_list_predicate.cpp - null_predicate.cpp - base_compaction.cpp - command_executor.cpp - cumulative_compaction.cpp - delete_handler.cpp aggregate_func.cpp - types.cpp + base_compaction.cpp + comparison_predicate.cpp + cumulative_compaction.cpp + delta_writer.cpp + delete_handler.cpp field.cpp field_info.cpp - hll.cpp file_helper.cpp + hll.cpp i_data.cpp + in_list_predicate.cpp lru_cache.cpp - olap_main.cpp + memtable.cpp merger.cpp + new_status.cpp + null_predicate.cpp olap_cond.cpp olap_data.cpp olap_engine.cpp olap_header.cpp olap_index.cpp - olap_rootpath.cpp olap_server.cpp olap_snapshot.cpp + options.cpp + store.cpp olap_table.cpp push_handler.cpp reader.cpp row_block.cpp row_cursor.cpp + rowset.cpp schema_change.cpp + types.cpp utils.cpp wrapper_field.cpp writer.cpp + olap_header_manager.cpp + olap_meta.cpp column_file/bit_field_reader.cpp column_file/bit_field_writer.cpp column_file/bloom_filter.hpp diff --git a/be/src/olap/aggregate_func.h b/be/src/olap/aggregate_func.h index 64a00eb5a9..aa0193d048 100644 --- a/be/src/olap/aggregate_func.h +++ b/be/src/olap/aggregate_func.h @@ -19,10 +19,11 @@ #include "olap/field_info.h" #include "olap/hll.h" #include "olap/types.h" +#include "util/arena.h" namespace palo { -using AggregateFunc = void (*)(char* left, char* right); +using AggregateFunc = void (*)(char* left, const char* right, Arena* arena); using FinalizeFunc = void (*)(char* data); template struct AggregateFuncTraits { - static void aggregate(char* left, char* right) {} + static void aggregate(char* left, const char* right, Arena* arena) {} }; template struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void aggregate(char* left, const char* right, Arena* arena) { typedef typename FieldTypeTraits::CppType CppType; bool l_null = *reinterpret_cast(left); - bool r_null = 
*reinterpret_cast(right); + bool r_null = *reinterpret_cast(right); if (l_null) { return; } else if (r_null) { *reinterpret_cast(left) = true; } else { CppType* l_val = reinterpret_cast(left + 1); - CppType* r_val = reinterpret_cast(right + 1); + const CppType* r_val = reinterpret_cast(right + 1); if (*r_val < *l_val) { *l_val = *r_val; } } } @@ -53,10 +54,10 @@ struct AggregateFuncTraits { template <> struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void aggregate(char* left, const char* right, Arena* arena) { typedef typename FieldTypeTraits::CppType CppType; bool l_null = *reinterpret_cast(left); - bool r_null = *reinterpret_cast(right); + bool r_null = *reinterpret_cast(right); if (l_null) { return; } else if (r_null) { @@ -74,16 +75,16 @@ struct AggregateFuncTraits template struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void aggregate(char* left, const char* right, Arena* arena) { typedef typename FieldTypeTraits::CppType CppType; bool l_null = *reinterpret_cast(left); - bool r_null = *reinterpret_cast(right); + bool r_null = *reinterpret_cast(right); if (r_null) { return; } CppType* l_val = reinterpret_cast(left + 1); - CppType* r_val = reinterpret_cast(right + 1); + const CppType* r_val = reinterpret_cast(right + 1); if (l_null) { *reinterpret_cast(left) = false; *l_val = *r_val; @@ -95,10 +96,10 @@ struct AggregateFuncTraits { template <> struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void aggregate(char* left, const char* right, Arena* arena) { typedef typename FieldTypeTraits::CppType CppType; bool l_null = *reinterpret_cast(left); - bool r_null = *reinterpret_cast(right); + bool r_null = *reinterpret_cast(right); if (r_null) { return; } @@ -119,16 +120,16 @@ struct AggregateFuncTraits template struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void aggregate(char* left, const char* right, Arena* arena) { typedef typename FieldTypeTraits::CppType CppType; bool l_null = *reinterpret_cast(left); - bool r_null = *reinterpret_cast(right); + bool r_null = *reinterpret_cast(right); if (r_null) { return; } CppType* l_val = reinterpret_cast(left + 1); - CppType* r_val = reinterpret_cast(right + 1); + const CppType* r_val = reinterpret_cast(right + 1); if (l_null) { *reinterpret_cast(left) = false; *l_val = *r_val; @@ -140,10 +141,10 @@ struct AggregateFuncTraits { template <> struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void aggregate(char* left, const char* right, Arena* arena) { typedef typename FieldTypeTraits::CppType CppType; bool l_null = *reinterpret_cast(left); - bool r_null = *reinterpret_cast(right); + bool r_null = *reinterpret_cast(right); if (r_null) { return; } @@ -163,14 +164,14 @@ struct AggregateFuncTraits template struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void aggregate(char* left, const char* right, Arena* arena) { typedef typename FieldTypeTraits::CppType CppType; - bool r_null = *reinterpret_cast(right); + bool r_null = *reinterpret_cast(right); *reinterpret_cast(left) = r_null; if (!r_null) { CppType* l_val = reinterpret_cast(left + 1); - CppType* r_val = reinterpret_cast(right + 1); + const CppType* r_val = reinterpret_cast(right + 1); *l_val = *r_val; } } @@ -178,9 +179,9 @@ struct AggregateFuncTraits { template <> struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void 
aggregate(char* left, const char* right, Arena* arena) { typedef typename FieldTypeTraits::CppType CppType; - bool r_null = *reinterpret_cast(right); + bool r_null = *reinterpret_cast(right); *reinterpret_cast(left) = r_null; if (!r_null) { @@ -191,30 +192,36 @@ struct AggregateFuncTraits struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { - bool r_null = *reinterpret_cast(right); + static void aggregate(char* left, const char* right, Arena* arena) { + bool r_null = *reinterpret_cast(right); *reinterpret_cast(left) = r_null; if (!r_null) { StringSlice* l_slice = reinterpret_cast(left + 1); - StringSlice* r_slice = reinterpret_cast(right + 1); - memory_copy(l_slice->data, r_slice->data, r_slice->size); - l_slice->size = r_slice->size; + const StringSlice* r_slice = reinterpret_cast(right + 1); + if (arena == nullptr || l_slice->size >= r_slice->size) { + memory_copy(l_slice->data, r_slice->data, r_slice->size); + l_slice->size = r_slice->size; + } else { + l_slice->data = arena->Allocate(r_slice->size); + memory_copy(l_slice->data, r_slice->data, r_slice->size); + l_slice->size = r_slice->size; + } } } }; template <> struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void aggregate(char* left, const char* right, Arena* arena) { //same with char aggregate AggregateFuncTraits::aggregate(left, right); + OLAP_FIELD_TYPE_CHAR>::aggregate(left, right, arena); } }; template <> struct AggregateFuncTraits { - static void aggregate(char* left, char* right) { + static void aggregate(char* left, const char* right, Arena* arena) { StringSlice* l_slice = reinterpret_cast(left + 1); size_t hll_ptr = *(size_t*)(l_slice->data - sizeof(HllContext*)); HllContext* context = (reinterpret_cast(hll_ptr)); @@ -250,7 +257,7 @@ struct AggregateFuncTraitsdata, index_to_value, result_len); } else if (context->hash64_set->size() > 0) { // expliclit set - HllSetHelper::set_expliclit(slice->data, *(context->hash64_set), result_len); + HllSetHelper::set_explicit(slice->data, *(context->hash64_set), result_len); } slice->size = result_len & 0xffff; diff --git a/be/src/olap/base_compaction.cpp b/be/src/olap/base_compaction.cpp index 1030f3cfbf..342e3eaab4 100644 --- a/be/src/olap/base_compaction.cpp +++ b/be/src/olap/base_compaction.cpp @@ -26,7 +26,7 @@ #include "olap/olap_data.h" #include "olap/olap_engine.h" #include "olap/olap_header.h" -#include "olap/olap_index.h" +#include "olap/rowset.h" #include "olap/olap_table.h" #include "olap/utils.h" #include "util/palo_metrics.h" @@ -38,26 +38,25 @@ using std::vector; namespace palo { -OLAPStatus BaseCompaction::init(SmartOLAPTable table, bool is_manual_trigger) { +OLAPStatus BaseCompaction::init(OLAPTablePtr table, bool is_manual_trigger) { // 表在首次查询或PUSHç­‰æ“作时,会被加载到内存 // å¦‚æžœè¡¨æ²¡æœ‰è¢«åŠ è½½ï¼Œè¡¨æ˜Žè¯¥è¡¨ä¸Šç›®å‰æ²¡æœ‰ä»»ä½•æ“作,所以ä¸è¿›è¡ŒBEæ“作 if (!table->is_loaded()) { return OLAP_ERR_INPUT_PARAMETER_ERROR; } - OLAP_LOG_TRACE("init base compaction handler. [table=%s]", table->full_name().c_str()); + LOG(INFO) << "init base compaction handler. [table=" << table->full_name() << "]"; _table = table; // 1. å°è¯•å–å¾—base compactionçš„é” if (!_try_base_compaction_lock()) { - OLAP_LOG_WARNING("another base compaction is running. [table=%s]", - table->full_name().c_str()); + LOG(WARNING) << "another base compaction is running. table=" << table->full_name(); return OLAP_ERR_BE_TRY_BE_LOCK_ERROR; } // 2. 
æ£€æŸ¥æ˜¯å¦æ»¡è¶³base compaction触å‘ç­–ç•¥ - OLAP_LOG_TRACE("check whether satisfy base compaction policy."); + VLOG(3) << "check whether satisfy base compaction policy."; bool is_policy_satisfied = false; vector candidate_versions; is_policy_satisfied = _check_whether_satisfy_policy(is_manual_trigger, &candidate_versions); @@ -65,14 +64,13 @@ OLAPStatus BaseCompaction::init(SmartOLAPTable table, bool is_manual_trigger) { // 2.1 å¦‚æžœä¸æ»¡è¶³è§¦å‘策略,则直接释放base compactioné”, è¿”å›žé”™è¯¯ç  if (!is_policy_satisfied) { _release_base_compaction_lock(); - return OLAP_ERR_BE_NO_SUITABLE_VERSION; } // 2.2 如果满足触å‘策略,触å‘base compaction // ä¸é‡Šæ”¾base compactioné”, 在run()完æˆä¹‹åŽå†é‡Šæ”¾ if (!_validate_need_merged_versions(candidate_versions)) { - OLAP_LOG_FATAL("error! invalid need merged versions"); + LOG(FATAL) << "error! invalid need merged versions"; _release_base_compaction_lock(); return OLAP_ERR_BE_INVALID_NEED_MERGED_VERSIONS; } @@ -91,8 +89,6 @@ OLAPStatus BaseCompaction::run() { OLAPStatus res = OLAP_SUCCESS; OlapStopWatch stage_watch; - _table->set_base_compaction_status(BASE_COMPACTION_RUNNING, _new_base_version.second); - // 1. 计算新baseçš„version hash VersionHash new_base_version_hash; res = _table->compute_all_versions_hash(_need_merged_versions, &new_base_version_hash); @@ -100,7 +96,7 @@ OLAPStatus BaseCompaction::run() { OLAP_LOG_WARNING("fail to calculate new base version hash.[table=%s; new_base_version=%d]", _table->full_name().c_str(), _new_base_version.second); - _cleanup(); + _garbage_collection(); return res; } @@ -113,7 +109,7 @@ OLAPStatus BaseCompaction::run() { OLAP_LOG_WARNING("fail to acquire need data sources. [table=%s; version=%d]", _table->full_name().c_str(), _new_base_version.second); - _cleanup(); + _garbage_collection(); return OLAP_ERR_BE_ACQUIRE_DATA_SOURCES_ERROR; } @@ -126,8 +122,6 @@ OLAPStatus BaseCompaction::run() { PaloMetrics::base_compaction_bytes_total.increment(merge_bytes); } - // ä¿å­˜ç”Ÿæˆbase文件时候计算的selectivities - vector selectivities; // ä¿å­˜ç”Ÿæˆbase文件时候累积的行数 uint64_t row_count = 0; @@ -136,7 +130,6 @@ OLAPStatus BaseCompaction::run() { stage_watch.reset(); res = _do_base_compaction(new_base_version_hash, &base_data_sources, - &selectivities, &row_count); // 释放ä¸å†ä½¿ç”¨çš„IData对象 _table->release_data_sources(&base_data_sources); @@ -144,25 +137,23 @@ OLAPStatus BaseCompaction::run() { OLAP_LOG_WARNING("fail to do base version. [table=%s; version=%d]", _table->full_name().c_str(), _new_base_version.second); - _cleanup(); + _garbage_collection(); return res; } - OLAP_LOG_TRACE("elapsed time of doing base version", "%ldus", - stage_watch.get_elapse_time_us()); - // 4. 使新生æˆçš„base生效,并删除ä¸å†éœ€è¦ç‰ˆæœ¬å¯¹åº”的文件 - _obtain_header_wrlock(); - vector unused_olap_indices; - // 使得新生æˆçš„å„个Version生效, å¦‚æžœå¤±è´¥æŽ‰åˆ™éœ€è¦æ¸…ç†æŽ‰å·²ç»ç”Ÿæˆçš„Version文件 - res = _update_header(selectivities, row_count, &unused_olap_indices); + VLOG(3) << "elapsed time of doing base compaction:" << stage_watch.get_elapse_time_us(); + + // 4. make new versions visable. + // If success, remove files belong to old versions; + // If fail, gc files belong to new versions. + vector unused_olap_indices; + res = _update_header(row_count, &unused_olap_indices); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to update header. [table=%s; version=%d]", - _table->full_name().c_str(), - _new_base_version.second); - _cleanup(); + LOG(WARNING) << "fail to update header. 
table=" << _table->full_name() << ", " + << "version=" << _new_base_version.first << "-" << _new_base_version.second; + _garbage_collection(); return res; } - _release_header_lock(); _delete_old_files(&unused_olap_indices); // validate that delete action is right @@ -179,47 +170,12 @@ OLAPStatus BaseCompaction::run() { sleep(60); } - _cleanup(); + _garbage_collection(); return OLAP_ERR_BE_ERROR_DELETE_ACTION; } - _table->set_base_compaction_status(BASE_COMPACTION_WAITING, -1); _release_base_compaction_lock(); - LOG(INFO) << "succeed to do base compaction. table=" << _table->full_name() << ", " - << "base_version=" << _new_base_version.first << "-" << _new_base_version.second; - return OLAP_SUCCESS; -} - -OLAPStatus BaseCompaction::_exclude_not_expired_delete( - const vector& need_merged_versions, - vector* candidate_versions) { - const int64_t delete_delta_expire_time = config::delete_delta_expire_time * 60; - OLAPStatus res = OLAP_SUCCESS; - for (unsigned int index = 0; index < need_merged_versions.size(); ++index) { - Version temp = need_merged_versions[index]; - int64_t file_creation_time = 0; - res = _table->version_creation_time(temp, &file_creation_time); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("table doesn't have version. [table=%s; version=%d-%d]", - _table->full_name().c_str(), temp.first, temp.second); - return res; - } - - int64_t file_existed_time = time(NULL) - file_creation_time; - // 从å°ç‰ˆæœ¬å·å¾€å¤§ç‰ˆæœ¬å·æŸ¥æ‰¾ï¼›æ‰¾åˆ°ç¬¬1个没有过期的delete版本时,退出 - if (_table->is_delete_data_version(temp) - && file_existed_time < delete_delta_expire_time) { - OLAP_LOG_INFO("delete version is not expired." - "[delete_version=%d; existed_time=%ld; expired_time=%ld]", - temp.first, file_existed_time, - delete_delta_expire_time); - break; - } - - candidate_versions->push_back(temp); - } - return OLAP_SUCCESS; } @@ -229,13 +185,11 @@ static bool version_comparator(const Version& lhs, const Version& rhs) { bool BaseCompaction::_check_whether_satisfy_policy(bool is_manual_trigger, vector* candidate_versions) { - _obtain_header_rdlock(); + ReadLock rdlock(_table->get_header_lock_ptr()); int32_t cumulative_layer_point = _table->cumulative_layer_point(); if (cumulative_layer_point == -1) { - OLAP_LOG_FATAL("tablet has an unreasonable cumulative layer point. " - "[tablet='%s' cumulative_layer_point=%d]", - _table->full_name().c_str(), cumulative_layer_point); - _release_header_lock(); + LOG(FATAL) << "tablet has an unreasonable cumulative layer point. [tablet='" << _table->full_name() + << "' cumulative_layer_point=" << cumulative_layer_point << "]"; return false; } @@ -247,7 +201,6 @@ bool BaseCompaction::_check_whether_satisfy_policy(bool is_manual_trigger, &path_versions)) { OLAP_LOG_WARNING("fail to select shortest version path. [start=%d end=%d]", 0, cumulative_layer_point); - _release_header_lock(); return false; } @@ -260,8 +213,8 @@ bool BaseCompaction::_check_whether_satisfy_policy(bool is_manual_trigger, // base文件 if (temp.first == 0) { _old_base_version = temp; - base_size = _table->get_version_entity_by_version(temp).data_size; - base_creation_time = _table->file_version(index).creation_time(); + base_size = _table->get_version_data_size(temp); + base_creation_time = _table->get_delta(index)->creation_time(); continue; } @@ -274,63 +227,40 @@ bool BaseCompaction::_check_whether_satisfy_policy(bool is_manual_trigger, // åªæœ‰1个base文件和1个delta文件 if (base_compaction_layer_point == -1) { - OLAP_LOG_TRACE("can't do base compaction: no cumulative files. 
" - "[table=%s; base_version=0-%d; cumulative_layer_point=%d]", - _table->full_name().c_str(), - _old_base_version.second, - cumulative_layer_point + 1); - - _release_header_lock(); + VLOG(3) << "can't do base compaction: no cumulative files." + << "table=" << _table->full_name() << ", " + << "base_version=0-" << _old_base_version.second << ", " + << "cumulative_layer_point=" << cumulative_layer_point + 1; return false; } // åªæœ‰1个cumulative文件 if (base_compaction_layer_point == _old_base_version.second) { - OLAP_LOG_TRACE("can't do base compaction: only one cumulative file. " - "[table=%s; base_version=0-%d; cumulative_layer_point=%d]", - _table->full_name().c_str(), - _old_base_version.second, - cumulative_layer_point + 1); - - _release_header_lock(); + VLOG(3) << "can't do base compaction: only one cumulative file." + << "table=" << _table->full_name() << ", " + << "base_version=0-" << _old_base_version.second << ", " + << "cumulative_layer_point=" << cumulative_layer_point + 1; return false; } // 使用最短路径算法,选择å¯åˆå¹¶çš„cumulative版本 - vector need_merged_versions; if (OLAP_SUCCESS != _table->select_versions_to_span(_new_base_version, - &need_merged_versions)) { - OLAP_LOG_WARNING("fail to select shortest version path. [start=%d end=%d]", - _new_base_version.first, _new_base_version.second); - _release_header_lock(); + candidate_versions)) { + LOG(WARNING) << "fail to select shortest version path." + << "start=" << _new_base_version.first << ", " + << "end=" << _new_base_version.second; return false; } - std::sort(need_merged_versions.begin(), need_merged_versions.end(), version_comparator); + std::sort(candidate_versions->begin(), candidate_versions->end(), version_comparator); - // 如果是手动执行START_BASE_COMPACTIONå‘½ä»¤ï¼Œåˆ™ä¸æ£€æŸ¥base compaction policy, + // 如果是手动执行START_BASE_COMPACTIONå‘½ä»¤ï¼Œåˆ™ä¸æ£€æŸ¥base compaction policy, // 也ä¸è€ƒè™‘删除版本过期问题, åªè¦æœ‰å¯ä»¥åˆå¹¶çš„cumulative,就执行base compaction if (is_manual_trigger) { - OLAP_LOG_TRACE("manual trigger base compaction. [table=%s]", _table->full_name().c_str()); - - *candidate_versions = need_merged_versions; - _release_header_lock(); + VLOG(3) << "manual trigger base compaction. table=" << _table->full_name(); return true; } - if (_exclude_not_expired_delete(need_merged_versions, candidate_versions) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to exclude not expired delete version."); - _release_header_lock(); - return false; - } - - if (candidate_versions->size() != need_merged_versions.size()) { - OLAP_LOG_INFO("reset new base version. 
" - "[previous_new_base_version=0-%d; new_base_version=0-%d]", - _new_base_version.second, candidate_versions->rbegin()->second); - _new_base_version = Version(0, candidate_versions->rbegin()->second); - } - // 统计å¯åˆå¹¶cumulativeç‰ˆæœ¬æ–‡ä»¶çš„æ€»å¤§å° size_t cumulative_total_size = 0; for (vector::const_iterator version_iter = candidate_versions->begin(); @@ -341,11 +271,9 @@ bool BaseCompaction::_check_whether_satisfy_policy(bool is_manual_trigger, continue; } // cumulative文件 - cumulative_total_size += _table->get_version_entity_by_version(temp).data_size; + cumulative_total_size += _table->get_version_data_size(temp); } - _release_header_lock(); - // æ£€æŸ¥æ˜¯å¦æ»¡è¶³base compactionçš„è§¦å‘æ¡ä»¶ // 满足以下æ¡ä»¶æ—¶è§¦å‘base compaction: è§¦å‘æ¡ä»¶1 || è§¦å‘æ¡ä»¶2 || è§¦å‘æ¡ä»¶3 // è§¦å‘æ¡ä»¶1:cumulative文件个数超过一个阈值 @@ -353,11 +281,9 @@ bool BaseCompaction::_check_whether_satisfy_policy(bool is_manual_trigger, = config::base_compaction_num_cumulative_deltas; // candidate_versions中包å«base文件,所以这里å‡1 if (candidate_versions->size() - 1 >= base_compaction_num_cumulative_deltas) { - OLAP_LOG_INFO("satisfy the base compaction policy. [table=%s; " - "num_cumualtive_deltas=%d; base_compaction_num_cumulative_deltas=%d]", - _table->full_name().c_str(), - candidate_versions->size() - 1, - base_compaction_num_cumulative_deltas); + LOG(INFO) << "satisfy the base compaction policy. table="<< _table->full_name() << ", " + << "num_cumulative_deltas=" << candidate_versions->size() - 1 << ", " + << "base_compaction_num_cumulative_deltas=" << base_compaction_num_cumulative_deltas; return true; } @@ -365,49 +291,42 @@ bool BaseCompaction::_check_whether_satisfy_policy(bool is_manual_trigger, const double base_cumulative_delta_ratio = config::base_cumulative_delta_ratio; double cumulative_base_ratio = static_cast(cumulative_total_size) / base_size; if (cumulative_base_ratio > base_cumulative_delta_ratio) { - OLAP_LOG_INFO("satisfy the base compaction policy. [table=%s; cumualtive_total_size=%d; " - "base_size=%d; cumulative_base_ratio=%f; policy_ratio=%f]", - _table->full_name().c_str(), - cumulative_total_size, - base_size, - cumulative_base_ratio, - base_cumulative_delta_ratio); + LOG(INFO) << "satisfy the base compaction policy. table=" << _table->full_name() << ", " + << "cumualtive_total_size=" << cumulative_total_size << ", " + << "base_size=" << base_size << ", " + << "cumulative_base_ratio=" << cumulative_base_ratio << ", " + << "policy_ratio=" << base_cumulative_delta_ratio; return true; } // è§¦å‘æ¡ä»¶3:è·ç¦»ä¸Šä¸€æ¬¡è¿›è¡Œbase compactionå·²ç»è¶…过设定的间隔时间 - const uint32_t interval_since_laste_operation = config::base_compaction_interval_seconds_since_last_operation; + const uint32_t interval_since_last_operation = config::base_compaction_interval_seconds_since_last_operation; int64_t interval_since_last_be = time(NULL) - base_creation_time; - if (interval_since_last_be > interval_since_laste_operation) { - OLAP_LOG_INFO("satisfy the base compaction policy. [table=%s; " - "interval_since_last_be=%ld; policy_interval=%ld]", - _table->full_name().c_str(), - interval_since_last_be, interval_since_laste_operation); + if (interval_since_last_be > interval_since_last_operation) { + LOG(INFO) << "satisfy the base compaction policy. table=" << _table->full_name() << ", " + << "interval_since_last_be=" << interval_since_last_be << ", " + << "policy_interval=" << interval_since_last_operation; return true; } - OLAP_LOG_TRACE( - "don't satisfy the base compaction policy." 
- "[cumulative_files_number=%d; cumulative_base_ratio=%f; interval_since_last_be=%ld]", - candidate_versions->size() - 1, - cumulative_base_ratio, - interval_since_last_be); + VLOG(3) << "don't satisfy the base compaction policy. table=" << _table->full_name() << ", " + << "cumulative_files_number=" << candidate_versions->size() - 1 << ", " + << "cumulative_base_ratio=" << cumulative_base_ratio << ", " + << "interval_since_last_be=" << interval_since_last_be; return false; } OLAPStatus BaseCompaction::_do_base_compaction(VersionHash new_base_version_hash, - vector* base_data_sources, - vector* selectivities, - uint64_t* row_count) { + vector* base_data_sources, + uint64_t* row_count) { // 1. ç”Ÿæˆæ–°base文件对应的olap index - OLAPIndex* new_base = new (std::nothrow) OLAPIndex(_table.get(), + Rowset* new_base = new (std::nothrow) Rowset(_table.get(), _new_base_version, new_base_version_hash, - false, - 0, 0); + false, 0, 0); if (new_base == NULL) { - OLAP_LOG_WARNING("fail to new OLAPIndex."); + OLAP_LOG_WARNING("fail to new Rowset."); return OLAP_ERR_MALLOC_ERROR; } @@ -430,18 +349,12 @@ OLAPStatus BaseCompaction::_do_base_compaction(VersionHash new_base_version_hash if (_table->data_file_type() == OLAP_DATA_FILE || _table->data_file_type() == COLUMN_ORIENTED_FILE) { _table->obtain_header_rdlock(); - bool use_simple_merge = true; - if (_table->delete_data_conditions_size() > 0) { - use_simple_merge = false; - } _table->release_header_lock(); Merger merger(_table, new_base, READER_BASE_COMPACTION); - res = merger.merge( - *base_data_sources, use_simple_merge, &merged_rows, &filted_rows); + res = merger.merge(*base_data_sources, &merged_rows, &filted_rows); if (res == OLAP_SUCCESS) { *row_count = merger.row_count(); - *selectivities = merger.selectivities(); } } else { OLAP_LOG_WARNING("unknown data file type. [type=%s]", @@ -488,23 +401,28 @@ OLAPStatus BaseCompaction::_do_base_compaction(VersionHash new_base_version_hash bool row_nums_check = config::row_nums_check; if (row_nums_check) { if (source_rows != new_base->num_rows() + merged_rows + filted_rows) { - OLAP_LOG_FATAL("fail to check row num! " - "[source_rows=%lu merged_rows=%lu filted_rows=%lu new_index_rows=%lu]", - source_rows, merged_rows, filted_rows, new_base->num_rows()); + LOG(WARNING) << "fail to check row num!" + << "source_rows=" << source_rows << ", " + << "merged_rows=" << merged_rows << ", " + << "filted_rows=" << filted_rows << ", " + << "new_index_rows=" << new_base->num_rows(); return OLAP_ERR_CHECK_LINES_ERROR; } } else { - OLAP_LOG_INFO("all row nums. " - "[source_rows=%lu merged_rows=%lu filted_rows=%lu new_index_rows=%lu]", - source_rows, merged_rows, filted_rows, new_base->num_rows()); + LOG(INFO) << "all row nums." + << "source_rows=" << source_rows << ", " + << "merged_rows=" << merged_rows << ", " + << "filted_rows=" << filted_rows << ", " + << "new_index_rows=" << new_base->num_rows(); } + LOG(INFO) << "succeed to do base compaction. 
table=" << _table->full_name() << ", " + << "base_version=" << _new_base_version.first << "-" << _new_base_version.second; return OLAP_SUCCESS; } -OLAPStatus BaseCompaction::_update_header(const vector& selectivities, - uint64_t row_count, - vector* unused_olap_indices) { +OLAPStatus BaseCompaction::_update_header(uint64_t row_count, vector* unused_olap_indices) { + WriteLock wrlock(_table->get_header_lock_ptr()); vector unused_versions; _get_unused_versions(&unused_versions); @@ -522,8 +440,6 @@ OLAPStatus BaseCompaction::_update_header(const vector& selectivities, return res; } - _table->set_selectivities(selectivities); - OLAP_LOG_INFO("BE remove delete conditions. [removed_version=%d]", _new_base_version.second); // Base Compaction完æˆä¹‹åŽï¼Œéœ€è¦åˆ é™¤header中版本å·å°äºŽç­‰äºŽæ–°base文件版本å·çš„删除æ¡ä»¶ @@ -546,30 +462,27 @@ OLAPStatus BaseCompaction::_update_header(const vector& selectivities, return OLAP_SUCCESS; } -void BaseCompaction::_delete_old_files(vector* unused_indices) { +void BaseCompaction::_delete_old_files(vector* unused_indices) { if (!unused_indices->empty()) { - OLAPUnusedIndex* unused_index = OLAPUnusedIndex::get_instance(); + OLAPEngine* unused_index = OLAPEngine::get_instance(); - for (vector::iterator it = unused_indices->begin(); + for (vector::iterator it = unused_indices->begin(); it != unused_indices->end(); ++it) { unused_index->add_unused_index(*it); } } } -void BaseCompaction::_cleanup() { +void BaseCompaction::_garbage_collection() { // æ¸…ç†æŽ‰å·²ç”Ÿæˆçš„版本文件 - for (vector::iterator it = _new_olap_indices.begin(); + for (vector::iterator it = _new_olap_indices.begin(); it != _new_olap_indices.end(); ++it) { (*it)->delete_all_files(); SAFE_DELETE(*it); } _new_olap_indices.clear(); - // é‡Šæ”¾æ‰“å¼€çš„é” - _release_header_lock(); _release_base_compaction_lock(); - _table->set_base_compaction_status(BASE_COMPACTION_WAITING, -1); } bool BaseCompaction::_validate_need_merged_versions( @@ -612,22 +525,20 @@ bool BaseCompaction::_validate_need_merged_versions( OLAPStatus BaseCompaction::_validate_delete_file_action() { // 1. acquire the latest version to make sure all is right after deleting files - _obtain_header_rdlock(); - const FileVersionMessage* latest_version = _table->latest_version(); - Version test_version = Version(0, latest_version->end_version()); + ReadLock rdlock(_table->get_header_lock_ptr()); + const PDelta* lastest_version = _table->lastest_version(); + Version test_version = Version(0, lastest_version->end_version()); vector test_sources; _table->acquire_data_sources(test_version, &test_sources); if (test_sources.size() == 0) { - OLAP_LOG_INFO("acquire data sources failed. version=%d-%d", - test_version.first, test_version.second); - _release_header_lock(); + LOG(INFO) << "acquire data sources failed. 
version=" + << test_version.first << "-" << test_version.second; return OLAP_ERR_BE_ERROR_DELETE_ACTION; } _table->release_data_sources(&test_sources); - OLAP_LOG_TRACE("delete file action is OK"); - _release_header_lock(); + VLOG(3) << "delete file action is OK"; return OLAP_SUCCESS; } diff --git a/be/src/olap/base_compaction.h b/be/src/olap/base_compaction.h index 2734f70a68..46316633c5 100644 --- a/be/src/olap/base_compaction.h +++ b/be/src/olap/base_compaction.h @@ -21,12 +21,12 @@ #include "olap/olap_common.h" #include "olap/olap_define.h" -#include "olap/olap_index.h" #include "olap/olap_table.h" namespace palo { class IData; +class Rowset; // @brief 实现对START_BASE_COMPACTION命令的处ç†é€»è¾‘,并返回处ç†ç»“æžœ class BaseCompaction { @@ -34,8 +34,7 @@ public: BaseCompaction() : _new_base_version(0, 0), _old_base_version(0, 0), - _base_compaction_locked(false), - _header_locked(false) {} + _base_compaction_locked(false) {} virtual ~BaseCompaction() { _release_base_compaction_lock(); @@ -54,7 +53,7 @@ public: // 返回值: // - 如果init执行æˆåŠŸï¼Œå³å¯ä»¥æ‰§è¡ŒBE,则返回OLAP_SUCCESSï¼› // - å…¶å®ƒæƒ…å†µä¸‹ï¼Œè¿”å›žç›¸åº”çš„é”™è¯¯ç  - OLAPStatus init(SmartOLAPTable table, bool is_manual_trigger); + OLAPStatus init(OLAPTablePtr table, bool is_manual_trigger = false); // 执行BaseCompaction, å¯èƒ½ä¼šæŒç»­å¾ˆé•¿æ—¶é—´ // @@ -64,10 +63,6 @@ public: OLAPStatus run(); private: - // 从need_merged_versions中剔除没过期的delete版本以åŠå¤§äºŽè¯¥delete版本的cumulative - OLAPStatus _exclude_not_expired_delete(const std::vector& need_merged_versions, - std::vector* candidate_versions); - // æ£€éªŒå½“å‰æƒ…å†µæ˜¯å¦æ»¡è¶³base compaction的触å‘ç­–ç•¥ // // è¾“å…¥å‚æ•°ï¼š @@ -86,7 +81,6 @@ private: // è¾“å…¥å‚æ•°ï¼š // - new_base_version_hash: æ–°Baseçš„VersionHash // - base_data_sources: ç”Ÿæˆæ–°Base需è¦çš„IData* - // - selectivities: 生æˆBase过程中产生的selectivities // - row_count: 生æˆBase过程中产生的row_count // // 返回值: @@ -94,37 +88,30 @@ private: // - å…¶å®ƒæƒ…å†µä¸‹ï¼Œè¿”å›žç›¸åº”çš„é”™è¯¯ç  OLAPStatus _do_base_compaction(VersionHash new_base_version_hash, std::vector* base_data_sources, - std::vector* selectivities, uint64_t* row_count); // æ›´æ–°Header使得修改对外å¯è§ - // - // è¾“å…¥å‚æ•°ï¼š - // - selectivities: 生æˆBase过程中产生的selectivities - // - row_count: 生æˆBase过程中产生的row_count - // // è¾“å‡ºå‚æ•°ï¼š - // - unused_olap_indices: 需è¦è¢«ç‰©ç†åˆ é™¤çš„OLAPIndex* + // - unused_olap_indices: 需è¦è¢«ç‰©ç†åˆ é™¤çš„Rowset* // // 返回值: // - 如果执行æˆåŠŸï¼Œåˆ™è¿”å›žOLAP_SUCCESSï¼› // - å…¶å®ƒæƒ…å†µä¸‹ï¼Œè¿”å›žç›¸åº”çš„é”™è¯¯ç  - OLAPStatus _update_header(const std::vector& selectivities, - uint64_t row_count, - std::vector* unused_olap_indices); + OLAPStatus _update_header(uint64_t row_count, + std::vector* unused_olap_indices); - // 删除ä¸å†ä½¿ç”¨çš„OLAPIndex文件 + // 删除ä¸å†ä½¿ç”¨çš„Rowset文件 // // è¾“å…¥å‚æ•°ï¼š - // - unused_olap_indices: 需è¦è¢«ç‰©ç†åˆ é™¤çš„OLAPIndex* + // - unused_olap_indices: 需è¦è¢«ç‰©ç†åˆ é™¤çš„Rowset* // // 返回值: // - 如果执行æˆåŠŸï¼Œåˆ™è¿”å›žOLAP_SUCCESSï¼› // - å…¶å®ƒæƒ…å†µä¸‹ï¼Œè¿”å›žç›¸åº”çš„é”™è¯¯ç  - void _delete_old_files(std::vector* unused_indices); + void _delete_old_files(std::vector* unused_indices); // 其它函数执行失败时,调用该函数进行清ç†å·¥ä½œ - void _cleanup(); + void _garbage_collection(); // 验è¯å¾—到的candidate_versionsæ˜¯å¦æ­£ç¡® // @@ -174,25 +161,8 @@ private: } } - void _obtain_header_rdlock() { - _table->obtain_header_rdlock(); - _header_locked = true; - } - - void _obtain_header_wrlock() { - _table->obtain_header_wrlock(); - _header_locked = true; - } - - void _release_header_lock() { - if (_header_locked) { - _table->release_header_lock(); - 
_header_locked = false; - } - } - // 需è¦è¿›è¡Œæ“作的Table指针 - SmartOLAPTable _table; + OLAPTablePtr _table; // æ–°baseçš„version Version _new_base_version; // 现有baseçš„version @@ -201,11 +171,10 @@ private: Version _latest_cumulative; // 在此次base compaction执行过程中,将被åˆå¹¶çš„cumulative文件版本 std::vector _need_merged_versions; - // éœ€è¦æ–°å¢žçš„版本对应的OLAPIndex - std::vector _new_olap_indices; + // éœ€è¦æ–°å¢žçš„版本对应的Rowset + std::vector _new_olap_indices; bool _base_compaction_locked; - bool _header_locked; DISALLOW_COPY_AND_ASSIGN(BaseCompaction); }; diff --git a/be/src/olap/column_file/column_data.cpp b/be/src/olap/column_file/column_data.cpp index c39f59ecec..0b452009a0 100644 --- a/be/src/olap/column_file/column_data.cpp +++ b/be/src/olap/column_file/column_data.cpp @@ -23,7 +23,7 @@ namespace palo { namespace column_file { -ColumnData::ColumnData(OLAPIndex* olap_index) : +ColumnData::ColumnData(Rowset* olap_index) : IData(COLUMN_ORIENTED_FILE, olap_index), _is_using_cache(false), _segment_reader(NULL) { @@ -114,9 +114,7 @@ OLAPStatus ColumnData::_seek_to_block(const RowBlockPosition& block_pos, bool wi } SAFE_DELETE(_segment_reader); std::string file_name; - file_name = _table->construct_data_file_path(olap_index()->version(), - olap_index()->version_hash(), - block_pos.segment); + file_name = olap_index()->construct_data_file_path(olap_index()->rowset_id(), block_pos.segment); _segment_reader = new(std::nothrow) SegmentReader( file_name, _table, olap_index(), block_pos.segment, _seek_columns, _load_bf_columns, _conditions, @@ -272,8 +270,8 @@ OLAPStatus ColumnData::_seek_to_row(const RowCursor& key, bool find_last_key, bo } if (res != OLAP_SUCCESS) { if (res != OLAP_ERR_DATA_EOF) { - OLAP_LOG_WARNING("Fail to find the key.[res=%d key=%s find_last_key=%d]", - res, key.to_string().c_str(), find_last_key); + LOG(WARNING) << "Fail to find the key.[res=" << res << " key=" << key.to_string() + << " find_last_key=" << find_last_key << "]"; } return res; } @@ -290,8 +288,8 @@ OLAPStatus ColumnData::_seek_to_row(const RowCursor& key, bool find_last_key, bo res = _get_block(without_filter); if (res != OLAP_SUCCESS) { if (res != OLAP_ERR_DATA_EOF) { - OLAP_LOG_WARNING("Fail to find the key.[res=%d key=%s find_last_key=%d]", - res, key.to_string().c_str(), find_last_key); + LOG(WARNING) << "Fail to find the key.[res=" << res + << " key=" << key.to_string() << " find_last_key=" << find_last_key << "]"; } return res; } @@ -478,7 +476,7 @@ OLAPStatus ColumnData::get_first_row_block(RowBlock** row_block) { _eof = true; return res; } - OLAP_LOG_WARNING("fail to find first row block with OLAPIndex."); + OLAP_LOG_WARNING("fail to find first row block with Rowset."); return res; } diff --git a/be/src/olap/column_file/column_data.h b/be/src/olap/column_file/column_data.h index 824355b86e..d6a9e240cb 100644 --- a/be/src/olap/column_file/column_data.h +++ b/be/src/olap/column_file/column_data.h @@ -33,7 +33,7 @@ class SegmentReader; // This class is column data reader. this class will be used in two case. 
class ColumnData : public IData { public: - explicit ColumnData(OLAPIndex* olap_index); + explicit ColumnData(Rowset* olap_index); virtual ~ColumnData(); virtual OLAPStatus init(); @@ -144,7 +144,7 @@ public: ColumnDataComparator( RowBlockPosition position, ColumnData* olap_data, - const OLAPIndex* index) + const Rowset* index) : _start_block_position(position), _olap_data(olap_data), _index(index) {} @@ -186,7 +186,7 @@ private: const RowBlockPosition _start_block_position; ColumnData* _olap_data; - const OLAPIndex* _index; + const Rowset* _index; }; } // namespace column_file diff --git a/be/src/olap/column_file/column_reader.cpp b/be/src/olap/column_file/column_reader.cpp index 1526d3c8ce..2aac2cb0f5 100644 --- a/be/src/olap/column_file/column_reader.cpp +++ b/be/src/olap/column_file/column_reader.cpp @@ -640,7 +640,7 @@ ColumnReader* ColumnReader::create(uint32_t column_id, field_info.default_value, field_info.type, field_info.length); } } else if (field_info.is_allow_null) { - OLAP_LOG_DEBUG("create NullValueReader: %s", field_info.name.c_str()); + LOG(WARNING) << "create NullValueReader: " << field_info.name; return new(std::nothrow) NullValueReader(column_id, column_unique_id); } else { OLAP_LOG_WARNING("not null field has no default value"); @@ -775,8 +775,8 @@ ColumnReader* ColumnReader::create(uint32_t column_id, case OLAP_FIELD_TYPE_LIST: case OLAP_FIELD_TYPE_MAP: default: { - OLAP_LOG_WARNING("unspported filed type. [field=%s type=%d]", - field_info.name.c_str(), field_info.type); + LOG(WARNING) << "unspported filed type. [field=" << field_info.name + << " type=" << field_info.type << "]"; break; } } diff --git a/be/src/olap/column_file/column_writer.cpp b/be/src/olap/column_file/column_writer.cpp index 829ff846de..ab4e01f1f0 100755 --- a/be/src/olap/column_file/column_writer.cpp +++ b/be/src/olap/column_file/column_writer.cpp @@ -16,8 +16,6 @@ #include "olap/column_file/column_writer.h" #include "olap/column_file/bit_field_writer.h" -#include "olap/column_file/run_length_byte_writer.h" -#include "olap/column_file/run_length_integer_writer.h" #include "olap/file_helper.h" namespace palo { @@ -496,27 +494,6 @@ OLAPStatus ByteColumnWriter::init() { return OLAP_SUCCESS; } -OLAPStatus ByteColumnWriter::write(RowCursor* row_cursor) { - OLAPStatus res = ColumnWriter::write(row_cursor); - - if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { - OLAP_LOG_WARNING("fail to write ColumnWriter."); - return res; - } - - const Field* field = row_cursor->get_field_by_index(column_id()); - - bool is_null = row_cursor->is_null(column_id()); - char* buf = field->get_field_ptr(row_cursor->get_buf()); - _block_statistics.add(buf); - if (!is_null) { - char value = *reinterpret_cast(buf + 1); - return _writer->write(value); - } - - return OLAP_SUCCESS; -} - OLAPStatus ByteColumnWriter::finalize(ColumnDataHeaderMessage* header) { OLAPStatus res = OLAP_SUCCESS; @@ -574,21 +551,6 @@ OLAPStatus IntegerColumnWriter::init() { return OLAP_SUCCESS; } -OLAPStatus IntegerColumnWriter::write(int64_t data) { - return _writer->write(data); -} - -OLAPStatus IntegerColumnWriter::finalize(ColumnDataHeaderMessage* header) { - return _writer->flush(); -} - -void IntegerColumnWriter::record_position(PositionEntryWriter* index_entry) { - _writer->get_position(index_entry, false); -} - -OLAPStatus IntegerColumnWriter::flush() { - return _writer->flush(); -} //////////////////////////////////////////////////////////////////////////////// VarStringColumnWriter::VarStringColumnWriter( @@ -642,26 +604,6 @@ OLAPStatus 
VarStringColumnWriter::init() { return OLAP_SUCCESS; } -OLAPStatus VarStringColumnWriter::write(RowCursor* row_cursor) { - OLAPStatus res = ColumnWriter::write(row_cursor); - - if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { - OLAP_LOG_WARNING("fail to write ColumnWriter."); - return res; - } - - const Field* field = row_cursor->get_field_by_index(column_id()); - bool is_null = row_cursor->is_null(column_id()); - char* buf = field->get_ptr(row_cursor->get_buf()); - - if (!is_null) { - StringSlice* slice = reinterpret_cast(buf); - return write(slice->data, slice->size); - } - - return OLAP_SUCCESS; -} - OLAPStatus VarStringColumnWriter::write(const char* str, uint32_t len) { OLAPStatus res = OLAP_SUCCESS; // zdb shield the dictionary coding @@ -867,27 +809,6 @@ FixLengthStringColumnWriter::FixLengthStringColumnWriter( FixLengthStringColumnWriter::~FixLengthStringColumnWriter() {} -OLAPStatus FixLengthStringColumnWriter::write(RowCursor* row_cursor) { - OLAPStatus res = ColumnWriter::write(row_cursor); - - if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { - OLAP_LOG_WARNING("fail to write ColumnWriter."); - return res; - } - - const Field* field = row_cursor->get_field_by_index(column_id()); - bool is_null = row_cursor->is_null(column_id()); - char* buf = field->get_ptr(row_cursor->get_buf()); - - if (!is_null) { - //const char* str = reinterpret_cast(buf); - StringSlice* slice = reinterpret_cast(buf); - return VarStringColumnWriter::write(slice->data, slice->size); - } - - return OLAP_SUCCESS; -} - //////////////////////////////////////////////////////////////////////////////// DecimalColumnWriter::DecimalColumnWriter(uint32_t column_id, @@ -934,37 +855,6 @@ OLAPStatus DecimalColumnWriter::init() { return OLAP_SUCCESS; } -OLAPStatus DecimalColumnWriter::write(RowCursor* row_cursor) { - OLAPStatus res = ColumnWriter::write(row_cursor); - - if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { - OLAP_LOG_WARNING("fail to write ColumnWriter."); - return res; - } - - const Field* field = row_cursor->get_field_by_index(column_id()); - bool is_null = row_cursor->is_null(column_id()); - char* buf = field->get_field_ptr(row_cursor->get_buf()); - _block_statistics.add(buf); - if (!is_null) { - decimal12_t value = *reinterpret_cast(buf + 1); - - res = _int_writer->write(value.integer); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to write integer of Decimal."); - return res; - } - - res = _frac_writer->write(value.fraction); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to write fraction of Decimal."); - return res; - } - } - - return OLAP_SUCCESS; -} - OLAPStatus DecimalColumnWriter::finalize(ColumnDataHeaderMessage* header) { OLAPStatus res; @@ -1041,37 +931,6 @@ OLAPStatus LargeIntColumnWriter::init() { return OLAP_SUCCESS; } -OLAPStatus LargeIntColumnWriter::write(RowCursor* row_cursor) { - OLAPStatus res = ColumnWriter::write(row_cursor); - - if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { - OLAP_LOG_WARNING("fail to write ColumnWriter."); - return res; - } - - const Field* field = row_cursor->get_field_by_index(column_id()); - bool is_null = row_cursor->is_null(column_id()); - char* buf = field->get_field_ptr(row_cursor->get_buf()); - _block_statistics.add(buf); - if (!is_null) { - - int64_t* value = reinterpret_cast(buf + 1); - res = _high_writer->write(*value); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to write integer of LargeInt."); - return res; - } - - res = _low_writer->write(*(++value)); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to write fraction of LargeInt."); - return res; - 
} - } - - return OLAP_SUCCESS; -} - OLAPStatus LargeIntColumnWriter::finalize(ColumnDataHeaderMessage* header) { OLAPStatus res; diff --git a/be/src/olap/column_file/column_writer.h b/be/src/olap/column_file/column_writer.h index a07373ed55..8fc428871c 100644 --- a/be/src/olap/column_file/column_writer.h +++ b/be/src/olap/column_file/column_writer.h @@ -24,9 +24,12 @@ #include "olap/column_file/bloom_filter_writer.h" #include "olap/column_file/out_stream.h" #include "olap/column_file/stream_index_writer.h" +#include "olap/column_file/run_length_byte_writer.h" +#include "olap/column_file/run_length_integer_writer.h" #include "olap/field.h" #include "olap/olap_common.h" #include "olap/olap_define.h" +#include "olap/row_block.h" #include "olap/row_cursor.h" namespace palo { @@ -55,7 +58,11 @@ public: double bf_fpp); virtual ~ColumnWriter(); virtual OLAPStatus init(); - virtual OLAPStatus write(RowCursor* row_cursor); + + OLAPStatus write(RowCursor* cursor); + + virtual OLAPStatus write_batch(RowBlock* block, RowCursor* cursor) = 0; + // 将之å‰è®°å½•çš„blockä½ç½®ä¿¡æ¯ä¸Žå½“å‰çš„统计信æ¯å†™å…¥åˆ°ä¸€ä¸ªæ–°çš„索引项中 OLAPStatus create_row_index_entry(); // 估算当å‰ç¼“存的内存大å°, ä¸åŒ…括已ç»è¾“出到OutStream的内存 @@ -143,7 +150,33 @@ public: double bf_fpp); virtual ~ByteColumnWriter(); virtual OLAPStatus init(); - virtual OLAPStatus write(RowCursor* row_cursor); + + OLAPStatus write_batch(RowBlock* block, RowCursor* cursor) override { + for (uint32_t i = 0; i < block->row_block_info().row_num; i++) { + block->get_row(i, cursor); + + OLAPStatus res = ColumnWriter::write(cursor); + if (OLAP_UNLIKELY(res != OLAP_SUCCESS)) { + OLAP_LOG_WARNING("fail to write ColumnWriter."); + return res; + } + + const Field* field = cursor->get_field_by_index(column_id()); + bool is_null = field->is_null(cursor->get_buf()); + char* buf = field->get_field_ptr(cursor->get_buf()); + _block_statistics.add(buf); + if (!is_null) { + char value = *reinterpret_cast(buf + 1); + res = _writer->write(value); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to write double, res=" << res; + return res; + } + } + } + return OLAP_SUCCESS; + } + virtual OLAPStatus finalize(ColumnDataHeaderMessage* header); virtual void record_position(); virtual OLAPStatus flush() { @@ -165,10 +198,18 @@ public: bool is_singed); ~IntegerColumnWriter(); OLAPStatus init(); - OLAPStatus write(int64_t data); - OLAPStatus finalize(ColumnDataHeaderMessage* header); - void record_position(PositionEntryWriter* index_entry); - OLAPStatus flush(); + OLAPStatus write(int64_t data) { + return _writer->write(data); + } + OLAPStatus finalize(ColumnDataHeaderMessage* header) { + return _writer->flush(); + } + void record_position(PositionEntryWriter* index_entry) { + _writer->get_position(index_entry, false); + } + OLAPStatus flush() { + return _writer->flush(); + } private: uint32_t _column_id; @@ -213,23 +254,28 @@ public: return OLAP_SUCCESS; } - virtual OLAPStatus write(RowCursor* row_cursor) { - OLAPStatus res = ColumnWriter::write(row_cursor); + OLAPStatus write_batch(RowBlock* block, RowCursor* cursor) override { + for (uint32_t i = 0; i < block->row_block_info().row_num; i++) { + block->get_row(i, cursor); + OLAPStatus res = ColumnWriter::write(cursor); + if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { + OLAP_LOG_WARNING("fail to write ColumnWriter. [res=%d]", res); + return res; + } - if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { - OLAP_LOG_WARNING("fail to write ColumnWriter. 
[res=%d]", res); - return res; + const Field* field = cursor->get_field_by_index(column_id()); + bool is_null = field->is_null(cursor->get_buf()); + char* buf = field->get_field_ptr(cursor->get_buf()); + _block_statistics.add(buf); + if (!is_null) { + T value = *reinterpret_cast(buf + 1); + res = _writer.write(static_cast(value)); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to write integer, res=" << res; + return res; + } + } } - - bool is_null = row_cursor->is_null(column_id()); - const Field* field = row_cursor->get_field_by_index(column_id()); - char* buf = field->get_field_ptr(row_cursor->get_buf()); - _block_statistics.add(buf); - if (!is_null) { - T value = *reinterpret_cast(buf + 1); - return _writer.write(static_cast(value)); - } - return OLAP_SUCCESS; } @@ -311,23 +357,29 @@ public: return OLAP_SUCCESS; } - virtual OLAPStatus write(RowCursor* row_cursor) { - OLAPStatus res = ColumnWriter::write(row_cursor); + OLAPStatus write_batch(RowBlock* block, RowCursor* cursor) override { + for (uint32_t i = 0; i < block->row_block_info().row_num; i++) { + block->get_row(i, cursor); + OLAPStatus res = ColumnWriter::write(cursor); + if (OLAP_UNLIKELY(res != OLAP_SUCCESS)) { + OLAP_LOG_WARNING("fail to write ColumnWriter. [res=%d]", res); + return res; + } - if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { - OLAP_LOG_WARNING("fail to write ColumnWriter. [res=%d]", res); - return res; + const Field* field = cursor->get_field_by_index(column_id()); + + bool is_null = field->is_null(cursor->get_buf()); + char* buf = field->get_field_ptr(cursor->get_buf()); + _block_statistics.add(buf); + if (!is_null) { + T* value = reinterpret_cast(buf + 1); + res = _stream->write(reinterpret_cast(value), sizeof(T)); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to write double, res=" << res; + return res; + } + } } - - bool is_null = row_cursor->is_null(column_id()); - const Field* field = row_cursor->get_field_by_index(column_id()); - char* buf = field->get_field_ptr(row_cursor->get_buf()); - _block_statistics.add(buf); - if (!is_null) { - T* value = reinterpret_cast(buf + 1); - return _stream->write(reinterpret_cast(value), sizeof(T)); - } - return OLAP_SUCCESS; } @@ -372,7 +424,30 @@ public: double bf_fpp); virtual ~VarStringColumnWriter(); virtual OLAPStatus init(); - virtual OLAPStatus write(RowCursor* row_cursor); + + OLAPStatus write_batch(RowBlock* block, RowCursor* cursor) override { + for (uint32_t i = 0; i < block->row_block_info().row_num; i++) { + block->get_row(i, cursor); + OLAPStatus res = ColumnWriter::write(cursor); + if (OLAP_UNLIKELY(res != OLAP_SUCCESS)) { + OLAP_LOG_WARNING("fail to write ColumnWriter."); + return res; + } + auto field = cursor->get_field_by_index(column_id()); + bool is_null = field->is_null(cursor->get_buf()); + if (!is_null) { + char* buf = field->get_ptr(cursor->get_buf()); + StringSlice* slice = reinterpret_cast(buf); + res = write(slice->data, slice->size); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to write varchar, res=" << res; + return res; + } + } + } + return OLAP_SUCCESS; + } + virtual uint64_t estimate_buffered_memory(); virtual OLAPStatus finalize(ColumnDataHeaderMessage* header); virtual void save_encoding(ColumnEncodingMessage* encoding); @@ -428,7 +503,34 @@ public: size_t num_rows_per_row_block, double bf_fpp); virtual ~FixLengthStringColumnWriter(); - virtual OLAPStatus write(RowCursor* row_cursor); + + OLAPStatus write_batch(RowBlock* block, RowCursor* cursor) override { + for (uint32_t i = 0; i < block->row_block_info().row_num; 
i++) { + block->get_row(i, cursor); + + OLAPStatus res = ColumnWriter::write(cursor); + if (OLAP_UNLIKELY(res != OLAP_SUCCESS)) { + OLAP_LOG_WARNING("fail to write ColumnWriter."); + return res; + } + + const Field* field = cursor->get_field_by_index(column_id()); + bool is_null = field->is_null(cursor->get_buf()); + char* buf = field->get_ptr(cursor->get_buf()); + + if (!is_null) { + //const char* str = reinterpret_cast(buf); + StringSlice* slice = reinterpret_cast(buf); + res = VarStringColumnWriter::write(slice->data, slice->size); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to write fix-length string, res=" << res; + return res; + } + } + } + return OLAP_SUCCESS; + } + virtual OLAPStatus flush() { return OLAP_SUCCESS; } @@ -451,7 +553,37 @@ public: double bf_fpp); virtual ~DecimalColumnWriter(); virtual OLAPStatus init(); - virtual OLAPStatus write(RowCursor* row_cursor); + + OLAPStatus write_batch(RowBlock* block, RowCursor* cursor) override { + for (uint32_t i = 0; i < block->row_block_info().row_num; i++) { + block->get_row(i, cursor); + OLAPStatus res = ColumnWriter::write(cursor); + if (OLAP_UNLIKELY(res != OLAP_SUCCESS)) { + OLAP_LOG_WARNING("fail to write ColumnWriter."); + return res; + } + + const Field* field = cursor->get_field_by_index(column_id()); + bool is_null = field->is_null(cursor->get_buf()); + char* buf = field->get_field_ptr(cursor->get_buf()); + _block_statistics.add(buf); + if (!is_null) { + decimal12_t value = *reinterpret_cast(buf + 1); + res = _int_writer->write(value.integer); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to write integer of Decimal."); + return res; + } + res = _frac_writer->write(value.fraction); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to write fraction of Decimal."); + return res; + } + } + } + return OLAP_SUCCESS; + } + virtual OLAPStatus finalize(ColumnDataHeaderMessage* header); virtual void record_position(); virtual OLAPStatus flush() { @@ -471,7 +603,36 @@ public: double bf_fpp); virtual ~LargeIntColumnWriter(); virtual OLAPStatus init(); - virtual OLAPStatus write(RowCursor* row_cursor); + + OLAPStatus write_batch(RowBlock* block, RowCursor* cursor) override { + for (uint32_t i = 0; i < block->row_block_info().row_num; i++) { + block->get_row(i, cursor); + OLAPStatus res = ColumnWriter::write(cursor); + if (OLAP_UNLIKELY(res != OLAP_SUCCESS)) { + OLAP_LOG_WARNING("fail to write ColumnWriter."); + return res; + } + const Field* field = cursor->get_field_by_index(column_id()); + bool is_null = field->is_null(cursor->get_buf()); + char* buf = field->get_field_ptr(cursor->get_buf()); + _block_statistics.add(buf); + if (!is_null) { + int64_t* value = reinterpret_cast(buf + 1); + res = _high_writer->write(*value); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to write integer of LargeInt."); + return res; + } + res = _low_writer->write(*(++value)); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to write fraction of LargeInt."); + return res; + } + } + } + return OLAP_SUCCESS; + } + virtual OLAPStatus finalize(ColumnDataHeaderMessage* header); virtual void record_position(); virtual OLAPStatus flush() { diff --git a/be/src/olap/column_file/data_writer.cpp b/be/src/olap/column_file/data_writer.cpp index 3a7113436a..95e6ce396f 100644 --- a/be/src/olap/column_file/data_writer.cpp +++ b/be/src/olap/column_file/data_writer.cpp @@ -18,22 +18,27 @@ #include #include "olap/column_file/segment_writer.h" -#include "olap/olap_index.h" +#include "olap/rowset.h" #include "olap/row_block.h" namespace palo { 
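Note on the column_writer hunks above: the per-row virtual write(RowCursor*) is replaced by a single write_batch(RowBlock*, RowCursor*) per writer, so the row loop, the null check and the statistics update sit next to the type-specific encode step instead of being driven one virtual call per row from SegmentWriter. A minimal sketch of that shape follows; the types and names here are simplified stand-ins for illustration, not the real RowBlock/RowCursor/ColumnWriter classes.

#include <cstdint>
#include <vector>

// Stand-in for one column of a row block: a batch of nullable int64 cells.
struct Cell { bool is_null; int64_t value; };
using Batch = std::vector<Cell>;

class ColumnWriterSketch {
public:
    virtual ~ColumnWriterSketch() = default;
    // One call per row block; each writer owns its own loop, null handling
    // and per-value encoding, mirroring the write_batch() overrides above.
    virtual bool write_batch(const Batch& batch) = 0;
};

class IntColumnWriterSketch : public ColumnWriterSketch {
public:
    bool write_batch(const Batch& batch) override {
        for (const Cell& cell : batch) {
            update_statistics(cell);            // plays the role of _block_statistics.add()
            if (!cell.is_null && !encode(cell.value)) {
                return false;                   // fail fast, like the OLAPStatus returns
            }
        }
        return true;
    }

private:
    void update_statistics(const Cell& cell) {
        if (cell.is_null) return;
        if (cell.value < _min) _min = cell.value;
        if (cell.value > _max) _max = cell.value;
    }
    bool encode(int64_t /*value*/) { return true; }  // real encoding (RLE streams) elided

    int64_t _min = INT64_MAX;
    int64_t _max = INT64_MIN;
};

int main() {
    IntColumnWriterSketch writer;
    Batch batch = {{false, 7}, {true, 0}, {false, 42}};
    return writer.write_batch(batch) ? 0 : 1;
}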
namespace column_file { -ColumnDataWriter::ColumnDataWriter(SmartOLAPTable table, OLAPIndex* index, bool is_push_write) : - IWriter(is_push_write, table), - _index(index), - _row_block(NULL), - _segment_writer(NULL), - _num_rows(0), - _block_id(0), - _max_segment_size(OLAP_MAX_SEGMENT_FILE_SIZE), - _segment(0) {} +ColumnDataWriter::ColumnDataWriter(OLAPTablePtr table, Rowset* index, bool is_push_write) + : IWriter(is_push_write, table), + _index(index), + _row_block(NULL), + _segment_writer(NULL), + _num_rows(0), + _block_id(0), + _max_segment_size(OLAP_MAX_SEGMENT_FILE_SIZE), + _segment(0), + _all_num_rows(0), + _new_segment_created(false) +{ + init(); +} ColumnDataWriter::~ColumnDataWriter() { SAFE_DELETE(_row_block); @@ -51,13 +56,12 @@ OLAPStatus ColumnDataWriter::init() { double size = static_cast(_table->segment_size()); size *= OLAP_COLUMN_FILE_SEGMENT_SIZE_SCALE; - _max_segment_size = (uint32_t)lround(size); + _max_segment_size = static_cast(lround(size)); _row_block = new(std::nothrow) RowBlock(_table->tablet_schema()); if (NULL == _row_block) { - OLAP_LOG_WARNING("fail to new RowBlock. [table='%s']", - _table->full_name().c_str()); + LOG(WARNING) << "fail to new RowBlock. [table='" << _table->full_name() << "']"; return OLAP_ERR_MALLOC_ERROR; } @@ -67,8 +71,8 @@ OLAPStatus ColumnDataWriter::init() { return res; } - OLAP_LOG_DEBUG("init ColumnData writer. [table='%s' block_row_size=%lu]", - _table->full_name().c_str(), _table->num_rows_per_row_block()); + VLOG(3) << "init ColumnData writer. [table='" << _table->full_name() + << "' block_row_size=" << _table->num_rows_per_row_block() << "]"; RowBlockInfo block_info(0U, _table->num_rows_per_row_block(), 0); block_info.data_file_type = DataFileType::COLUMN_ORIENTED_FILE; block_info.null_supported = true; @@ -78,8 +82,11 @@ OLAPStatus ColumnDataWriter::init() { OLAP_LOG_WARNING("fail to initiate row block. [res=%d]", res); return res; } + return OLAP_SUCCESS; +} - res = _add_segment(); +OLAPStatus ColumnDataWriter::_init_segment() { + OLAPStatus res = _add_segment(); if (OLAP_SUCCESS != res) { OLAP_LOG_WARNING("fail to add segment. 
[res=%d]", res); return res; @@ -91,6 +98,7 @@ OLAPStatus ColumnDataWriter::init() { return res; } + _new_segment_created = true; return res; } @@ -100,27 +108,42 @@ OLAPStatus ColumnDataWriter::attached_by(RowCursor* row_cursor) { OLAP_LOG_WARNING("failed to flush data while attaching row cursor."); return OLAP_ERR_OTHER_ERROR; } + RETURN_NOT_OK(_flush_segment_with_verfication()); } _row_block->get_row(_row_index, row_cursor); return OLAP_SUCCESS; } -OLAPStatus ColumnDataWriter::finalize() { - OLAPStatus res; +OLAPStatus ColumnDataWriter::write(const char* row) { + if (_row_index >= _table->num_rows_per_row_block()) { + if (OLAP_SUCCESS != _flush_row_block(false)) { + OLAP_LOG_WARNING("failed to flush data while attaching row cursor."); + return OLAP_ERR_OTHER_ERROR; + } + RETURN_NOT_OK(_flush_segment_with_verfication()); + } + _row_block->set_row(_row_index, row); + return OLAP_SUCCESS; +} - res = _flush_row_block(true); +OLAPStatus ColumnDataWriter::finalize() { + if (_all_num_rows == 0 && _row_index == 0) { + _index->set_empty(true); + return OLAP_SUCCESS; + } + OLAPStatus res = _flush_row_block(true); if (OLAP_SUCCESS != res) { OLAP_LOG_WARNING("failed to flush data while attaching row cursor.[res=%d]", res); return res; } - res = _finalize_segment(); + res = _finalize_segment(); if (OLAP_SUCCESS != res) { OLAP_LOG_WARNING("fail to finalize segment.[res=%d]", res); return res; } - res = _index->set_column_statistics(_column_statistics); + res = _index->add_column_statistics(_column_statistics); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("Fail to set delta pruning![res=%d]", res); return res; @@ -129,6 +152,43 @@ OLAPStatus ColumnDataWriter::finalize() { return OLAP_SUCCESS; } +OLAPStatus ColumnDataWriter::_flush_row_block(bool finalize) { + if (!_new_segment_created) { + RETURN_NOT_OK(_init_segment()); + } + + if (_row_index < 1) { return OLAP_SUCCESS; } + // 与OLAPDataWriterä¸åŒ,è¿™é‡Œä¸æ˜¯çœŸçš„写RowBlock,所以并ä¸éœ€è¦finalize RowBlock + // 但考虑到兼容Row Block的使用方å¼,还是调用了finalize + OLAPStatus res = _row_block->finalize(_row_index); + if (OLAP_SUCCESS != res) { + OLAP_LOG_WARNING("fail to finalize row block. [num_rows=%u res=%d]", + _row_index, res); + return OLAP_ERR_WRITER_ROW_BLOCK_ERROR; + } + + // 目标是将自己的block按æ¡å†™å…¥ç›®æ ‡block中。 + res = _segment_writer->write_batch(_row_block, &_cursor, finalize); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to write row to segment. 
[res=%d]", res); + return OLAP_ERR_WRITER_DATA_WRITE_ERROR; + } + + // 在Rowsetä¸­è®°å½•çš„ä¸æ˜¯æ•°æ®æ–‡ä»¶çš„åç§»,而是blockçš„ç¼–å· + if (OLAP_SUCCESS != _index->add_row_block(*_row_block, _block_id++)) { + OLAP_LOG_WARNING("fail to update index."); + return OLAP_ERR_WRITER_INDEX_WRITE_ERROR; + } + + // In order to reuse row_block, clear the row_block after finalize + _row_block->clear(); + _num_rows += _row_index; + _all_num_rows += _row_index; + _row_index = 0; + + return OLAP_SUCCESS; +} + OLAPStatus ColumnDataWriter::_add_segment() { std::string file_name; @@ -137,9 +197,7 @@ OLAPStatus ColumnDataWriter::_add_segment() { return OLAP_ERR_WRITER_SEGMENT_NOT_FINALIZED; } - file_name = _table->construct_data_file_path(_index->version(), - _index->version_hash(), - _segment); + file_name = _index->construct_data_file_path(_index->rowset_id(), _segment); _segment_writer = new(std::nothrow) SegmentWriter(file_name, _table, OLAP_DEFAULT_COLUMN_STREAM_BUFFER_SIZE); @@ -150,8 +208,7 @@ OLAPStatus ColumnDataWriter::_add_segment() { OLAPStatus res = OLAP_SUCCESS; if (_is_push_write) { - res = _segment_writer->init( - config::push_write_mbytes_per_sec); + res = _segment_writer->init(config::push_write_mbytes_per_sec); } else { res = _segment_writer->init( config::base_compaction_write_mbytes_per_sec); @@ -167,6 +224,23 @@ OLAPStatus ColumnDataWriter::_add_segment() { return OLAP_SUCCESS; } +OLAPStatus ColumnDataWriter::_flush_segment_with_verfication() { + uint64_t segment_size = _segment_writer->estimate_segment_size(); + if (UNLIKELY(segment_size < _max_segment_size)) { + return OLAP_SUCCESS; + } + + OLAPStatus res = _finalize_segment(); + if (OLAP_SUCCESS != res) { + OLAP_LOG_WARNING("fail to finalize segment. [res=%d]", res); + return OLAP_ERR_WRITER_DATA_WRITE_ERROR; + } + + _new_segment_created = false; + _num_rows = 0; + return OLAP_SUCCESS; +} + OLAPStatus ColumnDataWriter::_finalize_segment() { OLAPStatus res = OLAP_SUCCESS; uint32_t data_segment_size; @@ -185,94 +259,6 @@ OLAPStatus ColumnDataWriter::_finalize_segment() { return res; } -OLAPStatus ColumnDataWriter::_flush_row_block(RowBlock* row_block, bool is_finalized) { - OLAPStatus res; - - // 目标是将自己的block按æ¡å†™å…¥ç›®æ ‡block中。 - for (uint32_t i = 0; i < row_block->row_block_info().row_num; i++) { - row_block->get_row(i, &_cursor); - res = _segment_writer->write(&_cursor); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to write row to segment. [res=%d]", res); - return OLAP_ERR_WRITER_DATA_WRITE_ERROR; - } - } - - // 在OLAPIndexä¸­è®°å½•çš„ä¸æ˜¯æ•°æ®æ–‡ä»¶çš„åç§»,而是blockçš„ç¼–å· - if (OLAP_SUCCESS != _index->add_row_block(*row_block, _block_id++)) { - OLAP_LOG_WARNING("fail to update index."); - return OLAP_ERR_WRITER_INDEX_WRITE_ERROR; - } - - if ((_segment_writer->estimate_segment_size() >= _max_segment_size) && - _segment_writer->is_row_block_full() && !is_finalized) { - res = _finalize_segment(); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to finalize segment. [res=%d]", res); - return OLAP_ERR_WRITER_DATA_WRITE_ERROR; - } - - res = _add_segment(); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to add segment. [res=%d]", res); - return OLAP_ERR_WRITER_DATA_WRITE_ERROR; - } - - res = _index->add_segment(); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to add index segment. 
[res=%d]", res); - return res; - } - - _num_rows = 0; - } - - return OLAP_SUCCESS; -} - -OLAPStatus ColumnDataWriter::_flush_row_block(bool is_finalized) { - OLAPStatus res = OLAP_SUCCESS; - - if (_row_index < 1) { - return OLAP_SUCCESS; - } - - // 与OLAPDataWriterä¸åŒ,è¿™é‡Œä¸æ˜¯çœŸçš„写RowBlock,所以并ä¸éœ€è¦finalize RowBlock - // 但考虑到兼容Row Block的使用方å¼,还是调用了finalize - res = _row_block->finalize(_row_index); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to finalize row block. [num_rows=%u res=%d]", - _row_index, res); - return OLAP_ERR_WRITER_ROW_BLOCK_ERROR; - } - - res = _flush_row_block(_row_block, is_finalized); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to flush row block. [res=%d]", res); - return res; - } - - // In order to reuse row_block, clear the row_block after finalize - _row_block->clear(); - _num_rows += _row_index; - _row_index = 0U; - - return OLAP_SUCCESS; -} - -// 这个接å£ç›®å‰åªä½¿ç”¨åœ¨schema change的时候. 对于ColumnFile而言, æœªæ¥ -// çš„schema change应该是轻é‡çº§çš„Schema change, 这个接å£å°±åªä¼šç”¨æ¥è¿›è¡Œ -// Roll up. 从OLAP Data创建Column Fileçš„roll up应该是éžå¸¸å°‘çš„. -OLAPStatus ColumnDataWriter::write_row_block(RowBlock* row_block) { - OLAP_LOG_DEBUG("write block, block size = %d", row_block->row_block_info().row_num); - - if (NULL == row_block || 0 == row_block->row_block_info().row_num) { - return OLAP_SUCCESS; - } - - return _flush_row_block(row_block, false); -} - uint64_t ColumnDataWriter::written_bytes() { uint64_t size = _segment * _max_segment_size + _segment_writer->estimate_segment_size(); return size; diff --git a/be/src/olap/column_file/data_writer.h b/be/src/olap/column_file/data_writer.h index 210d50d040..588a0ee5f3 100644 --- a/be/src/olap/column_file/data_writer.h +++ b/be/src/olap/column_file/data_writer.h @@ -27,21 +27,22 @@ class SegmentWriter; // 列文件格å¼çš„Writer,接å£å‚考IWriter中的定义 class ColumnDataWriter : public IWriter { public: - ColumnDataWriter(SmartOLAPTable table, OLAPIndex* index, bool is_push_write); + ColumnDataWriter(OLAPTablePtr table, Rowset* index, bool is_push_write); virtual ~ColumnDataWriter(); virtual OLAPStatus init(); virtual OLAPStatus attached_by(RowCursor* row_cursor); + virtual OLAPStatus write(const char* row); virtual OLAPStatus finalize(); - virtual OLAPStatus write_row_block(RowBlock* row_block); virtual uint64_t written_bytes(); virtual MemPool* mem_pool(); private: OLAPStatus _add_segment(); + OLAPStatus _flush_segment_with_verfication(); OLAPStatus _finalize_segment(); - OLAPStatus _flush_row_block(RowBlock* row_block, bool is_finalized); - OLAPStatus _flush_row_block(bool is_finalized); + OLAPStatus _flush_row_block(bool finalize); + OLAPStatus _init_segment(); - OLAPIndex* _index; + Rowset* _index; RowBlock* _row_block; // 使用RowBlcok缓存è¦å†™å…¥çš„æ•°æ® RowCursor _cursor; SegmentWriter* _segment_writer; @@ -49,6 +50,8 @@ private: uint32_t _block_id; // 当å‰Segment内的blockç¼–å· uint32_t _max_segment_size; uint32_t _segment; + int64_t _all_num_rows; + bool _new_segment_created; DISALLOW_COPY_AND_ASSIGN(ColumnDataWriter); }; diff --git a/be/src/olap/column_file/in_stream.h b/be/src/olap/column_file/in_stream.h index 94b44b5496..ae7971d7eb 100644 --- a/be/src/olap/column_file/in_stream.h +++ b/be/src/olap/column_file/in_stream.h @@ -46,8 +46,8 @@ public: // 上层使用者应该ä¿è¯ä¸è¯»å–ByteBuffer // 之间没有数æ®çš„空洞ä½ç½®. 
// - // 当使用mmap的时候,è¿™é‡Œä¼šé€€åŒ–ä¸ºåªæœ‰ä¸€ä¸ªByteBuffer, 是å - // ¦ä½¿ç”¨mmapå–决于在性能 + // 当使用mmap的时候,è¿™é‡Œä¼šé€€åŒ–ä¸ºåªæœ‰ä¸€ä¸ªByteBuffer, 是 + // ??使用mmapå–决于在性能 // 调优阶段的测试结果 // // Input: diff --git a/be/src/olap/column_file/out_stream.cpp b/be/src/olap/column_file/out_stream.cpp index 972746f2cf..1b4a01b853 100644 --- a/be/src/olap/column_file/out_stream.cpp +++ b/be/src/olap/column_file/out_stream.cpp @@ -250,34 +250,6 @@ OLAPStatus OutStream::_spill() { return OLAP_SUCCESS; } -OLAPStatus OutStream::write(char byte) { - OLAPStatus res = OLAP_SUCCESS; - - if (NULL == _current) { - res = _create_new_input_buffer(); - if (OLAP_SUCCESS != res) { - return res; - } - } - - if (_current->remaining() < 1) { - res = _spill(); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to spill current buffer."); - return res; - } - - if (NULL == _current) { - res = _create_new_input_buffer(); - if (OLAP_SUCCESS != res) { - return res; - } - } - } - - return _current->put(byte); -} - OLAPStatus OutStream::write(const char* buffer, uint64_t length) { OLAPStatus res = OLAP_SUCCESS; uint64_t offset = 0; diff --git a/be/src/olap/column_file/out_stream.h b/be/src/olap/column_file/out_stream.h index effa26b894..3a6cec9ee7 100644 --- a/be/src/olap/column_file/out_stream.h +++ b/be/src/olap/column_file/out_stream.h @@ -49,7 +49,29 @@ public: ~OutStream(); // 呿µè¾“出一个字节 - OLAPStatus write(char byte); + inline OLAPStatus write(char byte) { + OLAPStatus res = OLAP_SUCCESS; + if (_current == nullptr) { + res = _create_new_input_buffer(); + if (res != OLAP_SUCCESS) { + return res; + } + } + if (_current->remaining() < 1) { + res = _spill(); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to spill current buffer."); + return res; + } + if (_current == nullptr) { + res = _create_new_input_buffer(); + if (res != OLAP_SUCCESS) { + return res; + } + } + } + return _current->put(byte); + } // 呿µè¾“å‡ºä¸€æ®µæ•°æ® OLAPStatus write(const char* buffer, uint64_t length); diff --git a/be/src/olap/column_file/run_length_integer_writer.cpp b/be/src/olap/column_file/run_length_integer_writer.cpp index f890dec2da..ea11c7b314 100644 --- a/be/src/olap/column_file/run_length_integer_writer.cpp +++ b/be/src/olap/column_file/run_length_integer_writer.cpp @@ -138,8 +138,11 @@ void RunLengthIntegerWriter::_determined_encoding() { } // use DIRECT for delta overflows - _zz_bits_90p = ser::percentile_bits(_zig_zag_literals, _num_literals, 0.9); - _zz_bits_100p = ser::percentile_bits(_zig_zag_literals, _num_literals, 1.0); + uint16_t hists[65]; + ser::compute_hists(_zig_zag_literals, _num_literals, hists); + + _zz_bits_90p = ser::percentile_bits_with_hist(hists, _num_literals, 0.9); + _zz_bits_100p = ser::percentile_bits_with_hist(hists, _num_literals, 1.0); if (!ser::is_safe_subtract(max, _min)) { _encoding = DIRECT; return; @@ -179,12 +182,13 @@ void RunLengthIntegerWriter::_determined_encoding() { _base_reduced_literals[i] = _literals[i] - _min; } + ser::compute_hists(_base_reduced_literals, _num_literals, hists); // 95th percentile width is used to determine max allowed value // after which patching will be done - _br_bits_95p = ser::percentile_bits(_base_reduced_literals, _num_literals, 0.95); + _br_bits_95p = ser::percentile_bits_with_hist(hists, _num_literals, 0.95); // 100th percentile is used to compute the max patch width - _br_bits_100p = ser::percentile_bits(_base_reduced_literals, _num_literals, 1.0); + _br_bits_100p = ser::percentile_bits_with_hist(hists, _num_literals, 1.0); // after base reducing the values, if the 
difference in bits between // 95th percentile and 100th percentile value is zero then there diff --git a/be/src/olap/column_file/run_length_integer_writer.h b/be/src/olap/column_file/run_length_integer_writer.h index 9c5dfd3135..81fb78b422 100644 --- a/be/src/olap/column_file/run_length_integer_writer.h +++ b/be/src/olap/column_file/run_length_integer_writer.h @@ -260,12 +260,12 @@ private: OLAPStatus _write_patched_base_values(); OLAPStatus _write_delta_values(); - static const uint32_t MAX_SCOPE = 512; - static const uint32_t MIN_REPEAT = 3; // NOTE ä¸è¦ä¿®æ”¹è¿™ä¸ªå€¼, å¦åˆ™ç¨‹åºå‡ºé”™ - static const uint32_t MAX_SHORT_REPEAT_LENGTH = 10; + static const uint16_t MAX_SCOPE = 512; + static const uint16_t MIN_REPEAT = 3; // NOTE ä¸è¦ä¿®æ”¹è¿™ä¸ªå€¼, å¦åˆ™ç¨‹åºå‡ºé”™ + static const uint16_t MAX_SHORT_REPEAT_LENGTH = 10; // MAX_PATCH_LIST原本åªéœ€è¦0.05*MAX_SCOPE, 然而为了防止percentile_bits()里 // 与这里的浮点计算产生的误差, 这里直接放大两å€, 请ä¸è¦ä¿®æ”¹è¿™ä¸ªå€¼ - static const uint32_t MAX_PATCH_LIST = uint32_t(MAX_SCOPE * 0.1); + static const uint16_t MAX_PATCH_LIST = uint16_t(MAX_SCOPE * 0.1); OutStream* _output; bool _is_signed; uint32_t _fixed_run_length; @@ -273,7 +273,7 @@ private: int64_t _prev_delta; int64_t _literals[MAX_SCOPE]; EncodingType _encoding; - uint32_t _num_literals; + uint16_t _num_literals; int64_t _zig_zag_literals[MAX_SCOPE]; // for direct encoding int64_t _base_reduced_literals[MAX_SCOPE]; // for for patched base encoding int64_t _adj_deltas[MAX_SCOPE - 1]; // for delta encoding diff --git a/be/src/olap/column_file/segment_reader.cpp b/be/src/olap/column_file/segment_reader.cpp index 0eb7c4dd92..a6bdef2efc 100644 --- a/be/src/olap/column_file/segment_reader.cpp +++ b/be/src/olap/column_file/segment_reader.cpp @@ -24,6 +24,7 @@ #include "olap/column_file/out_stream.h" #include "olap/olap_cond.h" #include "olap/row_block.h" +#include "olap/rowset.h" namespace palo { namespace column_file { @@ -33,7 +34,7 @@ static const uint32_t MIN_FILTER_BLOCK_NUM = 10; SegmentReader::SegmentReader( const std::string file, OLAPTable* table, - OLAPIndex* index, + Rowset* index, uint32_t segment_id, const std::vector& used_columns, const std::set& load_bf_columns, @@ -131,11 +132,11 @@ OLAPStatus SegmentReader::_load_segment_file() { res = _file_handler.open_with_cache(_file_name, O_RDONLY); if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to open segment file. [file='%s']", _file_name.c_str()); + LOG(WARNING) << "fail to open segment file. [file='" << _file_name << "']"; return res; } - //OLAP_LOG_DEBUG("seg file : %s", _file_name.c_str()); + //VLOG(3) << "seg file : " << _file_name; // In file_header.unserialize(), it validates file length, signature, checksum of protobuf. 
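Note on the run_length_integer_writer hunk above: instead of scanning the literal array once per percentile, the writer now builds one bit-length histogram with compute_hists() and answers every percentile query (0.9, 0.95, 1.0) from it via percentile_bits_with_hist(). A small standalone sketch of that idea follows; the function names follow the diff, the sample values are made up, and the return value is left as the raw bit length (the real code rounds it up with get_closet_fixed_bits()).

#include <cstdint>
#include <cstdio>
#include <cstring>

// Number of significant bits in a value (0 for value == 0), mirroring used_bits().
static uint32_t used_bits(uint64_t value) {
    return value == 0 ? 0 : 64 - __builtin_clzll(value);
}

// Bucket the values by bit length once; buckets 0..64. The writer feeds
// non-negative data here (zig-zag or base-reduced literals).
static void compute_hists(const int64_t* data, uint16_t count, uint16_t hists[65]) {
    std::memset(hists, 0, sizeof(uint16_t) * 65);
    for (uint16_t i = 0; i < count; ++i) {
        hists[used_bits(static_cast<uint64_t>(data[i]))]++;
    }
}

// Widest bit length needed by the shortest p-fraction of the values.
static uint32_t percentile_bits_with_hist(const uint16_t hists[65], uint16_t count, double p) {
    int32_t per_len = static_cast<int32_t>(count * (1.0 - p));
    for (int32_t i = 64; i >= 0; --i) {
        per_len -= hists[i];
        if (per_len < 0) {
            return i;   // the real implementation returns get_closet_fixed_bits(i)
        }
    }
    return 0;
}

int main() {
    int64_t values[] = {3, 17, 1, 250, 9, 70000, 12, 5};
    uint16_t hists[65];
    compute_hists(values, 8, hists);
    // One histogram pass now serves any number of percentile queries.
    std::printf("p50=%u bits, p100=%u bits\n",
                percentile_bits_with_hist(hists, 8, 0.5),
                percentile_bits_with_hist(hists, 8, 1.0));
    return 0;
}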
_file_header = _olap_index->get_seg_pb(_segment_id); _null_supported = _olap_index->get_null_supported(_segment_id); @@ -685,7 +686,7 @@ OLAPStatus SegmentReader::_load_index(bool is_using_cache) { res = index_message->init(stream_buffer, stream_length, type, is_using_cache, _null_supported); if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("init index from cahce fail"); + OLAP_LOG_WARNING("init index from cache fail"); return res; } @@ -717,6 +718,12 @@ OLAPStatus SegmentReader::_load_index(bool is_using_cache) { if (_block_count != expected_blocks) { OLAP_LOG_WARNING("something wrong while reading index, expected=%lu, actual=%lu", expected_blocks, _block_count); + OLAP_LOG_WARNING("_header_message().number_of_rows()=%d," + "_header_message().num_rows_per_block()=%d, table='%s', version='%d-%d'", + _header_message().number_of_rows(), _header_message().num_rows_per_block(), + _olap_index->table()->full_name().c_str(), + _olap_index->version().first, _olap_index->version().second); + LOG(WARNING) << "version:" << _olap_index->version().first << "-" << _olap_index->version().second; return OLAP_ERR_FILE_FORMAT_ERROR; } } @@ -917,8 +924,9 @@ OLAPStatus SegmentReader::_load_to_vectorized_row_batch( auto reader = _column_readers[cid]; auto res = reader->next_vector(batch->column(cid), size, mem_pool); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to read next, res = %d, column = %u", - res, reader->column_unique_id()); + LOG(WARNING) << "fail to read next, res=" << res + << ", column=" << reader->column_unique_id() + << ", size=" << size; return res; } } diff --git a/be/src/olap/column_file/segment_reader.h b/be/src/olap/column_file/segment_reader.h index 9d6e88f5e6..28c73413bc 100644 --- a/be/src/olap/column_file/segment_reader.h +++ b/be/src/olap/column_file/segment_reader.h @@ -35,7 +35,6 @@ #include "olap/olap_cond.h" #include "olap/olap_define.h" #include "olap/olap_engine.h" -#include "olap/olap_index.h" #include "olap/olap_table.h" #include "olap/row_cursor.h" #include "runtime/runtime_state.h" @@ -44,6 +43,9 @@ #include "olap/column_predicate.h" namespace palo { + +class Rowset; + namespace column_file { class ColumnReader; @@ -53,7 +55,7 @@ class SegmentReader { public: explicit SegmentReader(const std::string file, OLAPTable* table, - OLAPIndex* index, + Rowset* index, uint32_t segment_id, const std::vector& return_columns, const std::set& load_bf_columns, @@ -275,7 +277,7 @@ private: palo::FileHandler _file_handler; // 文件handler OLAPTable* _table; - OLAPIndex* _olap_index; + Rowset* _olap_index; uint32_t _segment_id; const Conditions* _conditions; // 列过滤æ¡ä»¶ diff --git a/be/src/olap/column_file/segment_writer.cpp b/be/src/olap/column_file/segment_writer.cpp index c5d14eac90..bc54bbe906 100644 --- a/be/src/olap/column_file/segment_writer.cpp +++ b/be/src/olap/column_file/segment_writer.cpp @@ -26,14 +26,13 @@ namespace column_file { SegmentWriter::SegmentWriter( const std::string& file_name, - SmartOLAPTable table, + OLAPTablePtr table, uint32_t stream_buffer_size) : _file_name(file_name), _table(table), _stream_buffer_size(stream_buffer_size), _stream_factory(NULL), _row_count(0), - _row_in_block(0), _block_count(0) {} SegmentWriter::~SegmentWriter() { @@ -59,7 +58,7 @@ OLAPStatus SegmentWriter::init(uint32_t write_mbytes_per_sec) { // 创建writer for (uint32_t i = 0; i < _table->tablet_schema().size(); i++) { if (_table->tablet_schema()[i].is_root_column) { - ColumnWriter* writer = ColumnWriter::create(i, _table->tablet_schema(), + ColumnWriter* writer = ColumnWriter::create(i, 
_table->tablet_schema(), _stream_factory, _table->num_rows_per_row_block(), _table->bloom_filter_fpp()); @@ -84,31 +83,25 @@ OLAPStatus SegmentWriter::init(uint32_t write_mbytes_per_sec) { return OLAP_SUCCESS; } -OLAPStatus SegmentWriter::write(RowCursor* row_cursor) { +OLAPStatus SegmentWriter::write_batch(RowBlock* block, RowCursor* cursor, bool is_finalize) { + DCHECK(block->row_block_info().row_num == _table->num_rows_per_row_block() || is_finalize) + << "write block not empty, num_rows=" << block->row_block_info().row_num + << ", table_num_rows=" << _table->num_rows_per_row_block(); OLAPStatus res = OLAP_SUCCESS; - - // OLAP_LOG_DEBUG("row_count = %lu, row_in_block = %lu, block = %lu", - // _row_count, _row_in_block, _block_count); - if (_row_in_block == _table->num_rows_per_row_block()) { - res = create_row_index_entry(); - - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to create row index entry"); - } - } - - for (std::vector::iterator it = _root_writers.begin(); - it != _root_writers.end(); ++it) { - res = (*it)->write(row_cursor); - - if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { + for (auto col_writer : _root_writers) { + res = col_writer->write_batch(block, cursor); + if (OLAP_UNLIKELY(res != OLAP_SUCCESS)) { OLAP_LOG_WARNING("fail to write row. [res=%d]", res); return res; } + res = col_writer->create_row_index_entry(); + if (OLAP_UNLIKELY(res != OLAP_SUCCESS)) { + OLAP_LOG_WARNING("fail to create row index. [res=%d]", res); + return res; + } } - - ++_row_count; - ++_row_in_block; + _row_count += block->row_block_info().row_num; + ++_block_count; return res; } @@ -215,17 +208,10 @@ OLAPStatus SegmentWriter::finalize(uint32_t* segment_file_size) { if (OLAP_SUCCESS != (res = file_handle.open_with_mode( _file_name, O_CREAT | O_EXCL | O_WRONLY , S_IRUSR | S_IWUSR))) { - OLAP_LOG_WARNING("fail to open file. [file_name=%s]", _file_name.c_str()); + LOG(WARNING) << "fail to open file. [file_name=" << _file_name << "]"; return res; } - if (_row_in_block > 0) { - res = create_row_index_entry(); - if (OLAP_SUCCESS != res) { - return res; - } - } - res = _make_file_header(file_header.mutable_message()); if (OLAP_SUCCESS != res) { OLAP_LOG_WARNING("fail to make file header. [res=%d]", res); @@ -288,25 +274,5 @@ OLAPStatus SegmentWriter::finalize(uint32_t* segment_file_size) { return res; } -OLAPStatus SegmentWriter::create_row_index_entry() { - OLAPStatus res = OLAP_SUCCESS; - - for (std::vector::iterator it = _root_writers.begin(); - it != _root_writers.end(); ++it) { - res = (*it)->create_row_index_entry(); - - if (OLAP_UNLIKELY(OLAP_SUCCESS != res)) { - OLAP_LOG_WARNING("fail to create row index. 
[res=%d]", res); - return res; - } - } - - OLAP_LOG_DEBUG("create row_index_entry, _block_count = %ld, _row_in_block = %ld", - _block_count, _row_in_block); - ++_block_count; - _row_in_block = 0; - return res; -} - } // namespace column_file } // namespace palo diff --git a/be/src/olap/column_file/segment_writer.h b/be/src/olap/column_file/segment_writer.h index e1e9c0d5a0..2da1f00059 100644 --- a/be/src/olap/column_file/segment_writer.h +++ b/be/src/olap/column_file/segment_writer.h @@ -30,33 +30,26 @@ class ColumnDataHeaderMessage; class SegmentWriter { public: explicit SegmentWriter(const std::string& file_name, - SmartOLAPTable table, + OLAPTablePtr table, uint32_t stream_buffer_size); ~SegmentWriter(); OLAPStatus init(uint32_t write_mbytes_per_sec); - // 写入一行数æ®, 使用row_cursorè¯»å–æ¯ä¸ªåˆ— - OLAPStatus write(RowCursor* row_cursor); - // 记录indexä¿¡æ¯ - OLAPStatus create_row_index_entry(); + OLAPStatus write_batch(RowBlock* block, RowCursor* cursor, bool is_finalize); // 通过对缓存的使用,预估最终segmentçš„å¤§å° uint64_t estimate_segment_size(); // ç”Ÿæˆæ–‡ä»¶å¹¶å†™å…¥ç¼“å­˜çš„æ•°æ® OLAPStatus finalize(uint32_t* segment_file_size); - bool is_row_block_full() { - return (_row_in_block >= _table->num_rows_per_row_block()) ? true : false; - } private: // Helper: ç”Ÿæˆæœ€ç»ˆçš„PB文件头 OLAPStatus _make_file_header(ColumnDataHeaderMessage* file_header); std::string _file_name; - SmartOLAPTable _table; + OLAPTablePtr _table; uint32_t _stream_buffer_size; // è¾“å‡ºç¼“å†²åŒºå¤§å° std::vector _root_writers; OutStreamFactory* _stream_factory; uint64_t _row_count; // å·²ç»å†™å…¥çš„行总数 - uint64_t _row_in_block; // 当å‰blockä¸­çš„æ•°æ® uint64_t _block_count; // å·²ç»å†™å…¥çš„block个数 // write limit diff --git a/be/src/olap/column_file/serialize.cpp b/be/src/olap/column_file/serialize.cpp index 97120ffc15..772206f206 100644 --- a/be/src/olap/column_file/serialize.cpp +++ b/be/src/olap/column_file/serialize.cpp @@ -59,38 +59,13 @@ OLAPStatus read_var_unsigned(ReadOnlyFileStream* stream, int64_t* value) { return OLAP_SUCCESS; } -uint32_t get_closet_fixed_bits(uint32_t n) { - if (n == 0) { - return 1; - } else if (n >= 1 && n <= 24) { - return n; - } else if (n > 24 && n <= 26) { - return 26; - } else if (n > 26 && n <= 28) { - return 28; - } else if (n > 28 && n <= 30) { - return 30; - } else if (n > 30 && n <= 32) { - return 32; - } else if (n > 32 && n <= 40) { - return 40; - } else if (n > 40 && n <= 48) { - return 48; - } else if (n > 48 && n <= 56) { - return 56; - } else { - return 64; - } -} - uint32_t find_closet_num_bits(int64_t value) { - uint32_t count = 0; - - while (value != 0) { - ++count; - value = ((uint64_t)value) >> 1; + // counting leading zero, builtin function, this will generate BSR(Bit Scan Reverse) + // instruction for X86 + if (value == 0) { + return 1; } - + auto count = 64 - __builtin_clzll(value); return get_closet_fixed_bits(count); } @@ -119,77 +94,61 @@ OLAPStatus bytes_to_long_be(ReadOnlyFileStream* stream, int32_t n, int64_t* valu } uint32_t encode_bit_width(uint32_t n) { - n = get_closet_fixed_bits(n); - - if (n >= 1 && n <= 24) { - return n - 1; - } else if (n > 24 && n <= 26) { - return TWENTYSIX; - } else if (n > 26 && n <= 28) { - return TWENTYEIGHT; - } else if (n > 28 && n <= 30) { - return THIRTY; - } else if (n > 30 && n <= 32) { - return THIRTYTWO; - } else if (n > 32 && n <= 40) { - return FORTY; - } else if (n > 40 && n <= 48) { - return FORTYEIGHT; - } else if (n > 48 && n <= 56) { - return FIFTYSIX; - } else { - return SIXTYFOUR; - } + static uint8_t bits_map[65] = { + ONE, 
// 0 + ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, // 1 - 8 + NINE, TEN, ELEVEN, TWELVE, THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, // 9 - 16 + // 17 - 24 + SEVENTEEN, EIGHTEEN, NINETEEN, TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, + // 25 - 32 + TWENTYSIX, TWENTYSIX, TWENTYEIGHT, TWENTYEIGHT, THIRTY, THIRTY, THIRTYTWO, THIRTYTWO, + // 33 - 40 + FORTY, FORTY, FORTY, FORTY, FORTY, FORTY, FORTY, FORTY, + // 41 - 48 + FORTYEIGHT, FORTYEIGHT, FORTYEIGHT, FORTYEIGHT, FORTYEIGHT, FORTYEIGHT, + FORTYEIGHT, FORTYEIGHT, + // 49 - 56 + FIFTYSIX, FIFTYSIX, FIFTYSIX, FIFTYSIX, FIFTYSIX, FIFTYSIX, FIFTYSIX, FIFTYSIX, + // 57 - 64 + SIXTYFOUR, SIXTYFOUR, SIXTYFOUR, SIXTYFOUR, SIXTYFOUR, SIXTYFOUR, SIXTYFOUR, SIXTYFOUR, + }; + return bits_map[n]; } uint32_t decode_bit_width(uint32_t n) { - if (n <= TWENTYFOUR) { - return n + 1; - } else if (n == TWENTYSIX) { - return 26; - } else if (n == TWENTYEIGHT) { - return 28; - } else if (n == THIRTY) { - return 30; - } else if (n == THIRTYTWO) { - return 32; - } else if (n == FORTY) { - return 40; - } else if (n == FORTYEIGHT) { - return 48; - } else if (n == FIFTYSIX) { - return 56; - } else { - return 64; - } + static uint8_t bits_map[SIXTYFOUR + 1] = { + 1, 2, 3, 4, 5, 6, 7, 8, // ONE - EIGHT + 9, 10, 11, 12, 13, 14, 15, 16, // NINE - SIXTEEN + 17, 18, 19, 20, 21, 22, 23, 24, // SEVENTEEN - TWENTYFOUR + 26, // TWENTYSIX + 28, // TWENTYEIGHT + 30, // THIRTY + 32, // THIRTYTWO + 40, // FORTY + 48, // FORTYEIGHT + 56, // FIFTYSIX + 64 // SIXTYFOUR + }; + return bits_map[n]; } -uint32_t percentile_bits(int64_t* data, uint32_t count, double p) { +uint32_t percentile_bits(int64_t* data, uint16_t count, double p) { // histogram that store the encoded bit requirement for each values. // maximum number of bits that can encoded is 32 (refer FixedBitSizes) - uint32_t hist[32]; - - for (uint32_t i = 0; i < 32; i++) { - hist[i] = 0; - } - + uint16_t hist[65]; + memset(hist, 0, sizeof(hist)); // compute the histogram for (uint32_t i = 0; i < count; i++) { - int idx = encode_bit_width(find_closet_num_bits(data[i])); - hist[idx] += 1; + hist[used_bits(data[i])]++; } - int32_t per_len = (int32_t)(count * (1.0 - p)); - // return the bits required by pth percentile length - for (int32_t i = 31; i >= 0; i--) { + for (int32_t i = 64; i >= 0; i--) { per_len -= hist[i]; - if (per_len < 0) { - return decode_bit_width(i); + return get_closet_fixed_bits(i); } } - return 0; } diff --git a/be/src/olap/column_file/serialize.h b/be/src/olap/column_file/serialize.h index c242c0bc2c..dbdd3f48dd 100644 --- a/be/src/olap/column_file/serialize.h +++ b/be/src/olap/column_file/serialize.h @@ -75,8 +75,50 @@ enum FixedBitSize { TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR }; +inline uint32_t used_bits(uint64_t value) { + // counting leading zero, builtin function, this will generate BSR(Bit Scan Reverse) + // instruction for X86 + if (value == 0) { + return 0; + } + return 64 - __builtin_clzll(value); +} + +inline void compute_hists(int64_t* data, uint16_t count, uint16_t hists[65]) { + memset(hists, 0, sizeof(uint16_t) * 65); + // compute the histogram + for (uint32_t i = 0; i < count; i++) { + hists[used_bits(data[i])]++; + } +} + // 返回大于等于n且最接近nçš„FixedBiteSize -uint32_t get_closet_fixed_bits(uint32_t n); +inline uint32_t get_closet_fixed_bits(uint32_t n) { + static uint8_t bits_map[65] = { + 1, // 0 + 1, 2, 3, 4, 5, 6, 7, 8, // 1 - 8 + 9, 10, 11, 12, 13, 14, 15, 16, // 9 - 16 + 17, 18, 19, 20, 21, 22, 23, 24, // 17 - 24 + 26, 26, 28, 28, 30, 30, 32, 32, // 
25 - 32 + 40, 40, 40, 40, 40, 40, 40, 40, // 33 - 40 + 48, 48, 48, 48, 48, 48, 48, 48, // 41 - 48 + 56, 56, 56, 56, 56, 56, 56, 56, // 49 - 56 + 64, 64, 64, 64, 64, 64, 64, 64, // 57 - 64 + }; + return bits_map[n]; +} + +inline uint32_t percentile_bits_with_hist(uint16_t hists[65], uint16_t count, double p) { + int32_t per_len = (int32_t)(count * (1.0 - p)); + // return the bits required by pth percentile length + for (int32_t i = 64; i >= 0; i--) { + per_len -= hists[i]; + if (per_len < 0) { + return get_closet_fixed_bits(i); + } + } + return 0; +} // 首先计算value的比特ä½é•¿(1所在的最高ä½), å†ä½¿ç”¨get_closet_fixed_bits // 返回最接近的FixedBiteSize @@ -95,7 +137,7 @@ uint32_t decode_bit_width(uint32_t n); // 例如: p == 1.0, 表示所有的数æ®çš„æœ€å¤§ä½é•¿ // p == 0.9, è¡¨ç¤ºæ¯”ç‰¹ä½æœ€çŸ­çš„90%的数æ®çš„æœ€å¤§ä½é•¿ // p == 0.5, è¡¨ç¤ºæ¯”ç‰¹ä½æœ€çŸ­çš„50%的数æ®çš„æœ€å¤§ä½é•¿ -uint32_t percentile_bits(int64_t* data, uint32_t count, double p); +uint32_t percentile_bits(int64_t* data, uint16_t count, double p); // 以紧致方å¼å‘output输出一组整数 OLAPStatus write_ints(OutStream* output, int64_t* data, uint32_t count, uint32_t bit_width); diff --git a/be/src/olap/column_file/stream_index_common.cpp b/be/src/olap/column_file/stream_index_common.cpp index bf3fb9546f..f2e7b41b28 100755 --- a/be/src/olap/column_file/stream_index_common.cpp +++ b/be/src/olap/column_file/stream_index_common.cpp @@ -62,20 +62,6 @@ void ColumnStatistics::reset() { } } -void ColumnStatistics::add(char* buf) { - if (_ignored) { - return; - } - - if (_maximum->cmp(buf) < 0) { - _maximum->copy(buf); - } - - if (_minimum->cmp(buf) > 0) { - _minimum->copy(buf); - } -} - void ColumnStatistics::merge(ColumnStatistics* other) { if (_ignored || other->ignored()) { return; diff --git a/be/src/olap/column_file/stream_index_common.h b/be/src/olap/column_file/stream_index_common.h index a1b386f617..ea19b93cfb 100755 --- a/be/src/olap/column_file/stream_index_common.h +++ b/be/src/olap/column_file/stream_index_common.h @@ -53,7 +53,17 @@ public: // åªæ˜¯reset最大和最å°å€¼ï¼Œå°†æœ€å°å€¼è®¾ç½®ä¸ºMAX,将最大值设置为MIN。 void reset(); // 增加一个值,根æ®ä¼ å…¥å€¼è°ƒæ•´æœ€å¤§æœ€å°å€¼ - void add(char* buf); + inline void add(char* buf) { + if (_ignored) { + return; + } + if (_maximum->cmp(buf) < 0) { + _maximum->copy(buf); + } + if (_minimum->cmp(buf) > 0) { + _minimum->copy(buf); + } + } // åˆå¹¶ï¼Œå°†å¦ä¸€ä¸ªç»Ÿè®¡ä¿¡æ¯å’Œå…¥å½“å‰ç»Ÿè®¡ä¸­ void merge(ColumnStatistics* other); // 返回最大最å°å€¼â€œè¾“出时â€å ç”¨çš„å†…å­˜ï¼Œè€Œâ€œä¸æ˜¯? diff --git a/be/src/olap/command_executor.cpp b/be/src/olap/command_executor.cpp deleted file mode 100755 index 3969556935..0000000000 --- a/be/src/olap/command_executor.cpp +++ /dev/null @@ -1,762 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
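Note on the serialize.cpp/serialize.h hunks above: the if/else ladder in get_closet_fixed_bits() and the shift loop in find_closet_num_bits() are replaced by a lookup table and __builtin_clzll. A short self-check sketch that the table rounds every bit length 0..64 to the same supported fixed size as the removed branchy code; the reference function below is rewritten here only for the comparison and is not part of the patch.

#include <cassert>
#include <cstdint>

// Table-driven rounding to the nearest supported fixed bit size, as in serialize.h.
static uint32_t closest_fixed_bits_table(uint32_t n) {
    static const uint8_t bits_map[65] = {
        1,                                   // 0
        1, 2, 3, 4, 5, 6, 7, 8,              // 1 - 8
        9, 10, 11, 12, 13, 14, 15, 16,       // 9 - 16
        17, 18, 19, 20, 21, 22, 23, 24,      // 17 - 24
        26, 26, 28, 28, 30, 30, 32, 32,      // 25 - 32
        40, 40, 40, 40, 40, 40, 40, 40,      // 33 - 40
        48, 48, 48, 48, 48, 48, 48, 48,      // 41 - 48
        56, 56, 56, 56, 56, 56, 56, 56,      // 49 - 56
        64, 64, 64, 64, 64, 64, 64, 64,      // 57 - 64
    };
    return bits_map[n];
}

// Branchy reference, equivalent to the removed get_closet_fixed_bits() ladder.
static uint32_t closest_fixed_bits_reference(uint32_t n) {
    if (n == 0) return 1;
    if (n <= 24) return n;
    if (n <= 26) return 26;
    if (n <= 28) return 28;
    if (n <= 30) return 30;
    if (n <= 32) return 32;
    if (n <= 40) return 40;
    if (n <= 48) return 48;
    if (n <= 56) return 56;
    return 64;
}

int main() {
    for (uint32_t n = 0; n <= 64; ++n) {
        assert(closest_fixed_bits_table(n) == closest_fixed_bits_reference(n));
    }
    return 0;
}

The same table shape is reused for encode_bit_width() and decode_bit_width(), turning the hot per-value classification in percentile_bits() into two array lookups.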
- -#include "olap/command_executor.h" - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "olap/base_compaction.h" -#include "olap/delete_handler.h" -#include "olap/field.h" -#include "olap/olap_common.h" -#include "olap/olap_define.h" -#include "olap/olap_engine.h" -#include "olap/olap_server.h" -#include "olap/olap_table.h" -#include "olap/push_handler.h" -#include "olap/reader.h" -#include "olap/schema_change.h" -#include "olap/utils.h" -#include "util/palo_metrics.h" -#include "util/pretty_printer.h" - -using apache::thrift::ThriftDebugString; -using std::map; -using std::make_pair; -using std::pair; -using std::set; -using std::string; -using std::stringstream; -using std::vector; -using std::list; -using google::protobuf::RepeatedPtrField; - -namespace palo { - -OLAPStatus CommandExecutor::compute_checksum( - TTabletId tablet_id, - TSchemaHash schema_hash, - TVersion version, - TVersionHash version_hash, - uint32_t* checksum) { - OLAP_LOG_INFO("begin to process compute checksum. " - "[tablet_id=%ld schema_hash=%d version=%ld]", - tablet_id, schema_hash, version); - OLAPStatus res = OLAP_SUCCESS; - - if (checksum == NULL) { - OLAP_LOG_WARNING("invalid output parameter which is null pointer."); - return OLAP_ERR_CE_CMD_PARAMS_ERROR; - } - - SmartOLAPTable tablet = - OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); - if (NULL == tablet.get()) { - OLAP_LOG_WARNING("can't find tablet. [tablet_id=%ld schema_hash=%d]", - tablet_id, schema_hash); - return OLAP_ERR_TABLE_NOT_FOUND; - } - - { - AutoRWLock auto_lock(tablet->get_header_lock_ptr(), true); - const FileVersionMessage* message = tablet->latest_version(); - if (message == NULL) { - OLAP_LOG_FATAL("fail to get latest version. [tablet_id=%ld]", tablet_id); - return OLAP_ERR_VERSION_NOT_EXIST; - } - - if (message->end_version() == version - && message->version_hash() != version_hash) { - OLAP_LOG_WARNING("fail to check latest version hash. " - "[res=%d tablet_id=%ld version_hash=%ld request_version_hash=%ld]", - res, tablet_id, message->version_hash(), version_hash); - return OLAP_ERR_CE_CMD_PARAMS_ERROR; - } - } - - Reader reader; - ReaderParams reader_params; - reader_params.olap_table = tablet; - reader_params.reader_type = READER_CHECKSUM; - reader_params.version = Version(0, version); - - // ignore float and double type considering to precision lose - for (size_t i = 0; i < tablet->tablet_schema().size(); ++i) { - FieldType type = tablet->get_field_type_by_index(i); - if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) { - continue; - } - - reader_params.return_columns.push_back(i); - } - - res = reader.init(reader_params); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("initiate reader fail. [res=%d]", res); - return res; - } - - RowCursor row; - res = row.init(tablet->tablet_schema(), reader_params.return_columns); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to init row cursor. [res=%d]", res); - return res; - } - row.allocate_memory_for_string_type(tablet->tablet_schema()); - - bool eof = false; - uint32_t row_checksum = 0; - while (true) { - OLAPStatus res = reader.next_row_with_aggregation(&row, &eof); - if (res == OLAP_SUCCESS && eof) { - OLAP_LOG_DEBUG("reader reads to the end."); - break; - } else if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to read in reader. 
[res=%d]", res); - return res; - } - - row_checksum = row.hash_code(row_checksum); - } - - OLAP_LOG_INFO("success to finish compute checksum. [checksum=%u]", row_checksum); - *checksum = row_checksum; - return OLAP_SUCCESS; -} - -OLAPStatus CommandExecutor::push( - const TPushReq& request, - vector* tablet_info_vec) { - OLAPStatus res = OLAP_SUCCESS; - OLAP_LOG_INFO("begin to process push. [tablet_id=%ld version=%ld]", - request.tablet_id, request.version); - - if (tablet_info_vec == NULL) { - OLAP_LOG_WARNING("invalid output parameter which is null pointer."); - PaloMetrics::push_requests_fail_total.increment(1); - return OLAP_ERR_CE_CMD_PARAMS_ERROR; - } - - SmartOLAPTable olap_table = OLAPEngine::get_instance()->get_table( - request.tablet_id, request.schema_hash); - if (NULL == olap_table.get()) { - OLAP_LOG_WARNING("false to find table. [table=%ld schema_hash=%d]", - request.tablet_id, request.schema_hash); - PaloMetrics::push_requests_fail_total.increment(1); - return OLAP_ERR_TABLE_NOT_FOUND; - } - - PushType type = PUSH_NORMAL; - if (request.push_type == TPushType::LOAD_DELETE) { - type = PUSH_FOR_LOAD_DELETE; - } - - int64_t duration_ns = 0; - PushHandler push_handler; - { - SCOPED_RAW_TIMER(&duration_ns); - res = push_handler.process(olap_table, request, type, tablet_info_vec); - } - if (res != OLAP_SUCCESS) { - LOG(WARNING) << "fail to push delta, table=" << olap_table->full_name().c_str() - << ",cost=" << PrettyPrinter::print(duration_ns, TUnit::TIME_NS); - PaloMetrics::push_requests_fail_total.increment(1); - } else { - LOG(INFO) << "success to push delta, table=" << olap_table->full_name().c_str() - << ",cost=" << PrettyPrinter::print(duration_ns, TUnit::TIME_NS); - PaloMetrics::push_requests_success_total.increment(1); - PaloMetrics::push_request_duration_us.increment(duration_ns / 1000); - PaloMetrics::push_request_write_bytes.increment(push_handler.write_bytes()); - PaloMetrics::push_request_write_rows.increment(push_handler.write_rows()); - } - return res; -} - -OLAPStatus CommandExecutor::base_compaction( - TTabletId tablet_id, - TSchemaHash schema_hash, - TVersion version) { - OLAP_LOG_INFO("begin to process base compaction. " - "[tablet_id=%ld schema_hash=%d version=%ld]", - tablet_id, schema_hash, version); - OLAPStatus res = OLAP_SUCCESS; - - SmartOLAPTable table = - OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); - if (NULL == table.get()) { - OLAP_LOG_WARNING("can't find olap table. [table=%ld schema_hash=%d]", - tablet_id, schema_hash); - return OLAP_ERR_TABLE_NOT_FOUND; - } - - BaseCompaction base_compaction; - res = base_compaction.init(table, true); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to init BaseCompactionHandler. [res=%d]", res); - return res; - } - - res = base_compaction.run(); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to process base_compaction. [res=%d]", res); - } - - OLAP_LOG_INFO("success to finish base compaction."); - return res; -} - -OLAPStatus CommandExecutor::create_table(const TCreateTabletReq& request) { - OLAPStatus res = OLAP_SUCCESS; - OLAPTable* olap_table = NULL; - bool is_table_added = false; - - OLAP_LOG_INFO("begin to process create table. [tablet=%ld, schema_hash=%d]", - request.tablet_id, request.tablet_schema.schema_hash); - - PaloMetrics::create_tablet_requests_total.increment(1); - - // 1. 
Make sure create_table operation is idempotent: - // return success if table with same tablet_id and schema_hash exist, - // false if table with same tablet_id but different schema_hash exist - if (OLAPEngine::get_instance()->check_tablet_id_exist(request.tablet_id)) { - SmartOLAPTable table = OLAPEngine::get_instance()->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - if (table.get() != NULL) { - OLAP_LOG_INFO("create table success for table already exist."); - return OLAP_SUCCESS; - } else { - OLAP_LOG_WARNING("table with different schema hash already exists."); - return OLAP_ERR_CE_TABLET_ID_EXIST; - } - } - - // 2. Lock to ensure that all create_table operation execute in serial - static MutexLock create_table_lock; - AutoMutexLock auto_lock(&create_table_lock); - - do { - // 3. Create table with only header, no deltas - olap_table = OLAPEngine::get_instance()->create_table(request, NULL, false, NULL); - if (olap_table == NULL) { - res = OLAP_ERR_CE_CMD_PARAMS_ERROR; - OLAP_LOG_WARNING("fail to create olap table. [res=%d]", res); - break; - } - - // 4. Add table to OlapEngine will make it visiable to user - res = OLAPEngine::get_instance()->add_table( - request.tablet_id, request.tablet_schema.schema_hash, olap_table); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to add table to OLAPEngine. [res=%d]", res); - break; - } - is_table_added = true; - - SmartOLAPTable olap_table_ptr = OLAPEngine::get_instance()->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - if (olap_table_ptr.get() == NULL) { - res = OLAP_ERR_TABLE_NOT_FOUND; - OLAP_LOG_WARNING("fail to get table. [res=%d]", res); - break; - } - - // 5. Register table into OLAPRootPath, so that we can manage table from - // the perspective of root path. - // Example: unregister all tables when a bad disk found. - res = OLAPRootPath::get_instance()->register_table_into_root_path(olap_table_ptr.get()); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to register table into OLAPRootPath. [res=%d, root_path=%s]", - res, olap_table_ptr->storage_root_path_name().c_str()); - break; - } - - // 6. Create init version if this is not a restore mode replica and request.version is set - // bool in_restore_mode = request.__isset.in_restore_mode && request.in_restore_mode; - // if (!in_restore_mode && request.__isset.version) { - res = _create_init_version(olap_table_ptr, request); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create initial version for table. [res=%d]", res); - } - // } - } while (0); - - // 7. clear environment - if (res != OLAP_SUCCESS) { - PaloMetrics::create_tablet_requests_failed.increment(1); - if (is_table_added) { - OLAPStatus status = OLAPEngine::get_instance()->drop_table( - request.tablet_id, request.tablet_schema.schema_hash); - if (status != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to drop table when create table failed. [res=%d]", res); - } - } else if (NULL != olap_table) { - olap_table->delete_all_files(); - SAFE_DELETE(olap_table); - } - } - - OLAP_LOG_INFO("finish to process create table. [res=%d]", res); - return res; -} - -SmartOLAPTable CommandExecutor::get_table(TTabletId tablet_id, TSchemaHash schema_hash) { - OLAP_LOG_DEBUG("begin to process get_table. [table=%ld schema_hash=%d]", - tablet_id, schema_hash); - return OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); -} - -OLAPStatus CommandExecutor::drop_table(const TDropTabletReq& request) { - OLAP_LOG_INFO("begin to process drop table. 
[table=%ld schema_hash=%d]", - request.tablet_id, request.schema_hash); - - PaloMetrics::drop_tablet_requests_total.increment(1); - - OLAPStatus res = OLAPEngine::get_instance()->drop_table( - request.tablet_id, request.schema_hash); - if (res != OLAP_SUCCESS && res != OLAP_ERR_TABLE_NOT_FOUND) { - OLAP_LOG_WARNING("fail to process drop table. [status=%d]", res); - return res; - } - - OLAP_LOG_INFO("success to process drop table."); - return OLAP_SUCCESS; -} - -OLAPStatus CommandExecutor::report_all_tablets_info( - map* tablets_info) { - OLAP_LOG_INFO("begin to process report all tablets info."); - - PaloMetrics::report_all_tablets_requests_total.increment(1); - - OLAPStatus res = OLAP_SUCCESS; - - res = OLAPEngine::get_instance()->report_all_tablets_info(tablets_info); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to process report all tablets info. [res=%d]", res); - PaloMetrics::report_all_tablets_requests_failed.increment(1); - return res; - } - - OLAP_LOG_INFO("success to process report all tablets info. [tablet_num=%u]", - tablets_info->size()); - return OLAP_SUCCESS; -} - -OLAPStatus CommandExecutor::report_tablet_info(TTabletInfo* tablet_info) { - OLAPStatus res = OLAP_SUCCESS; - OLAP_LOG_INFO("begin to process report tablet info. " - "[table=%ld schema_hash=%d]", - tablet_info->tablet_id, tablet_info->schema_hash); - - PaloMetrics::report_tablet_requests_total.increment(1); - - res = OLAPEngine::get_instance()->report_tablet_info(tablet_info); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get tablet info. [res=%d]", res); - PaloMetrics::report_tablet_requests_failed.increment(1); - return res; - } - - OLAP_LOG_INFO("success to process report tablet info."); - return OLAP_SUCCESS; -} - -OLAPStatus CommandExecutor::schema_change(const TAlterTabletReq& request) { - OLAP_LOG_INFO("begin to schema change. [base_table=%ld new_table=%ld]", - request.base_tablet_id, request.new_tablet_req.tablet_id); - - PaloMetrics::schema_change_requests_total.increment(1); - - OLAPStatus res = OLAP_SUCCESS; - - SchemaChangeHandler handler; - res = handler.process_alter_table(ALTER_TABLET_SCHEMA_CHANGE, request); - - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to do schema change. " - "[base_table=%ld new_table=%ld] [res=%d]", - request.base_tablet_id, request.new_tablet_req.tablet_id, res); - PaloMetrics::schema_change_requests_failed.increment(1); - return res; - } - - OLAP_LOG_INFO("success to submit schema change. " - "[base_table=%ld new_table=%ld]", - request.base_tablet_id, request.new_tablet_req.tablet_id); - return res; -} - -OLAPStatus CommandExecutor::create_rollup_table(const TAlterTabletReq& request) { - OLAP_LOG_INFO("begin to create rollup table. " - "[base_table=%ld new_table=%ld]", - request.base_tablet_id, request.new_tablet_req.tablet_id); - - PaloMetrics::create_rollup_requests_total.increment(1); - - OLAPStatus res = OLAP_SUCCESS; - - SchemaChangeHandler handler; - res = handler.process_alter_table(ALTER_TABLET_CREATE_ROLLUP_TABLE, request); - - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to do rollup. " - "[base_table=%ld new_table=%ld] [res=%d]", - request.base_tablet_id, request.new_tablet_req.tablet_id, res); - PaloMetrics::create_rollup_requests_failed.increment(1); - return res; - } - - OLAP_LOG_INFO("success to create rollup table. 
" - "[base_table=%ld new_table=%ld] [res=%d]", - request.base_tablet_id, request.new_tablet_req.tablet_id, res); - return res; -} - -AlterTableStatus CommandExecutor::show_alter_table_status( - TTabletId tablet_id, - TSchemaHash schema_hash) { - OLAP_LOG_INFO("begin to process show alter table status. " - "[table=%ld schema_hash=%d]", - tablet_id, schema_hash); - - AlterTableStatus status = ALTER_TABLE_DONE; - - SmartOLAPTable table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); - if (table.get() == NULL) { - OLAP_LOG_WARNING("fail to get table. [table=%ld schema_hash=%d]", - tablet_id, schema_hash); - status = ALTER_TABLE_FAILED; - } else { - status = table->schema_change_status().status; - } - - return status; -} - -OLAPStatus CommandExecutor::make_snapshot( - TTabletId tablet_id, - TSchemaHash schema_hash, - string* snapshot_path) { - TSnapshotRequest request; - request.tablet_id = tablet_id; - request.schema_hash = schema_hash; - return this->make_snapshot(request, snapshot_path); -} - -OLAPStatus CommandExecutor::make_snapshot( - const TSnapshotRequest& request, - std::string* snapshot_path) { - OLAP_LOG_INFO("begin to process make snapshot. " - "[table=%ld, schema_hash=%d]", - request.tablet_id, request.schema_hash); - - OLAPStatus res = OLAP_SUCCESS; - res = OLAPSnapshot::get_instance()->make_snapshot(request, snapshot_path); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to process make snapshot. [res=%d]", res); - return res; - } - - OLAP_LOG_INFO("success to process make snapshot. [path=%s]", - snapshot_path->c_str()); - return res; -} - -OLAPStatus CommandExecutor::release_snapshot(const string& snapshot_path) { - OLAP_LOG_INFO("begin to process release snapshot. [path='%s']", - snapshot_path.c_str()); - OLAPStatus res = OLAP_SUCCESS; - - res = OLAPSnapshot::get_instance()->release_snapshot(snapshot_path); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to process release snapshot. [res=%d]", res); - return res; - } - - OLAP_LOG_TRACE("success to process release snapshot[path=%s].", - snapshot_path.c_str()); - return res; -} - -OLAPStatus CommandExecutor::obtain_shard_path( - TStorageMedium::type storage_medium, std::string* shard_path) { - OLAP_LOG_INFO("begin to process obtain root path. [storage_medium=%d]", storage_medium); - OLAPStatus res = OLAP_SUCCESS; - - if (shard_path == NULL) { - OLAP_LOG_WARNING("invalid output parameter which is null pointer."); - return OLAP_ERR_CE_CMD_PARAMS_ERROR; - } - - OLAPRootPath::RootPathVec root_paths; - OLAPRootPath::get_instance()->get_root_path_for_create_table(storage_medium, &root_paths); - if (root_paths.size() == 0) { - OLAP_LOG_WARNING("no available disk can be used to create table."); - return OLAP_ERR_NO_AVAILABLE_ROOT_PATH; - } - - uint64_t shard = 0; - res = OLAPRootPath::get_instance()->get_root_path_shard(root_paths[0], &shard); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get root path shard. [res=%d]", res); - return res; - } - - stringstream root_path_stream; - root_path_stream << root_paths[0] << DATA_PREFIX << "/" << shard; - *shard_path = root_path_stream.str(); - - OLAP_LOG_INFO("success to process obtain root path. [path='%s']", - shard_path->c_str()); - return res; -} - -OLAPStatus CommandExecutor::load_header( - const string& shard_path, - const TCloneReq& request) { - OLAP_LOG_INFO("begin to process load headers. 
" - "[tablet_id=%ld schema_hash=%d]", - request.tablet_id, request.schema_hash); - OLAPStatus res = OLAP_SUCCESS; - - stringstream schema_hash_path_stream; - schema_hash_path_stream << shard_path - << "/" << request.tablet_id - << "/" << request.schema_hash; - res = OLAPEngine::get_instance()->load_one_tablet( - request.tablet_id, request.schema_hash, - schema_hash_path_stream.str()); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to process load headers. [res=%d]", res); - return res; - } - - OLAP_LOG_INFO("success to process load headers."); - return res; -} - -OLAPStatus CommandExecutor::load_header( - const string& shard_path, - TTabletId tablet_id, - TSchemaHash schema_hash) { - OLAP_LOG_INFO("begin to process load headers. [tablet_id=%ld schema_hash=%d]", - tablet_id, schema_hash); - OLAPStatus res = OLAP_SUCCESS; - - stringstream schema_hash_path_stream; - schema_hash_path_stream << shard_path - << "/" << tablet_id - << "/" << schema_hash; - res = OLAPEngine::get_instance()->load_one_tablet( - tablet_id, schema_hash, - schema_hash_path_stream.str()); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to process load headers. [res=%d]", res); - return res; - } - - OLAP_LOG_INFO("success to process load headers."); - return res; -} - -OLAPStatus CommandExecutor::storage_medium_migrate(const TStorageMediumMigrateReq& request) { - OLAP_LOG_INFO("begin to process storage media migrate. " - "[tablet_id=%ld schema_hash=%d dest_storage_medium=%d]", - request.tablet_id, request.schema_hash, request.storage_medium); - - PaloMetrics::storage_migrate_requests_total.increment(1); - - OLAPStatus res = OLAP_SUCCESS; - res = OLAPSnapshot::get_instance()->storage_medium_migrate( - request.tablet_id, request.schema_hash, request.storage_medium); - - OLAP_LOG_INFO("finish to process storage media migrate. [res=%d]", res); - return res; -} - -OLAPStatus CommandExecutor::reload_root_path(const string& root_paths) { - OLAP_LOG_INFO("begin to process reload root path. [path='%s']", - root_paths.c_str()); - OLAPStatus res = OLAP_SUCCESS; - - static MutexLock reload_root_path_lock; - reload_root_path_lock.lock(); - res = OLAPRootPath::get_instance()->reload_root_paths(root_paths.c_str()); - reload_root_path_lock.unlock(); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to process reload root path. [res=%d]", res); - return res; - } - - OLAP_LOG_INFO("success to finish reload root path."); - return res; -} - -OLAPStatus CommandExecutor::delete_data( - const TPushReq& request, - vector* tablet_info_vec) { - OLAP_LOG_INFO("begin to process delete data. [request='%s']", - ThriftDebugString(request).c_str()); - PaloMetrics::delete_requests_total.increment(1); - - OLAPStatus res = OLAP_SUCCESS; - - if (tablet_info_vec == NULL) { - OLAP_LOG_WARNING("invalid output parameter which is null pointer."); - return OLAP_ERR_CE_CMD_PARAMS_ERROR; - } - - // 1. Get all tablets with same tablet_id - SmartOLAPTable table = OLAPEngine::get_instance()->get_table( - request.tablet_id, request.schema_hash); - if (table.get() == NULL) { - OLAP_LOG_WARNING("can't find table. [table=%ld schema_hash=%d]", - request.tablet_id, request.schema_hash); - return OLAP_ERR_TABLE_NOT_FOUND; - } - - // 2. Process delete data by push interface - PushHandler push_handler; - res = push_handler.process( - table, request, PUSH_FOR_DELETE, tablet_info_vec); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to push empty version for delete data. 
" - "[res=%d table='%s']", - res, table->full_name().c_str()); - PaloMetrics::delete_requests_failed.increment(1); - return res; - } - - OLAP_LOG_INFO("finish to process delete data. [res=%d]", res); - return res; -} - -OLAPStatus CommandExecutor::cancel_delete(const TCancelDeleteDataReq& request) { - OLAP_LOG_INFO("begin to process cancel delete. [table=%ld version=%ld]", - request.tablet_id, request.version); - - PaloMetrics::cancel_delete_requests_total.increment(1); - - OLAPStatus res = OLAP_SUCCESS; - - // 1. Get all tablets with same tablet_id - list table_list; - res = OLAPEngine::get_instance()->get_tables_by_id(request.tablet_id, &table_list); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("can't find table. [table=%ld]", request.tablet_id); - return OLAP_ERR_TABLE_NOT_FOUND; - } - - // 2. Remove delete conditions from each tablet. - DeleteConditionHandler cond_handler; - for (SmartOLAPTable temp_table : table_list) { - temp_table->obtain_header_wrlock(); - res = cond_handler.delete_cond(temp_table, request.version, false); - if (res != OLAP_SUCCESS) { - temp_table->release_header_lock(); - OLAP_LOG_WARNING("cancel delete failed. [res=%d table=%s]", - res, temp_table->full_name().c_str()); - break; - } - - res = temp_table->save_header(); - if (res != OLAP_SUCCESS) { - temp_table->release_header_lock(); - OLAP_LOG_WARNING("fail to save header. [res=%d table=%s]", - res, temp_table->full_name().c_str()); - break; - } - temp_table->release_header_lock(); - } - - // Show delete conditions in tablet header. - for (SmartOLAPTable table : table_list) { - cond_handler.log_conds(table); - } - - OLAP_LOG_INFO("finish to process cancel delete. [res=%d]", res); - return res; -} - -OLAPStatus CommandExecutor::get_all_root_path_info( - std::vector* root_paths_info, - bool need_capacity) { - OLAP_LOG_INFO("begin to process get all root path info."); - OLAPStatus res = OLAP_SUCCESS; - - res = OLAPRootPath::get_instance()->get_all_root_path_info(root_paths_info, need_capacity); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to process get all root path info. [res=%d]", res); - return res; - } - - OLAP_LOG_INFO("success to process get all root path info."); - return res; -} - -OLAPStatus CommandExecutor::_create_init_version( - SmartOLAPTable olap_table, const TCreateTabletReq& request) { - OLAPStatus res = OLAP_SUCCESS; - - if (request.version < 1) { - OLAP_LOG_WARNING("init version of tablet should at least 1."); - return OLAP_ERR_CE_CMD_PARAMS_ERROR; - } else { - Version init_base_version(0, request.version); - res = OLAPEngine::get_instance()->create_init_version( - request.tablet_id, request.tablet_schema.schema_hash, - init_base_version, request.version_hash); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create init base version. [res=%d version=%ld]", - res, request.version); - return res; - } - - Version init_delta_version(request.version + 1, request.version + 1); - res = OLAPEngine::get_instance()->create_init_version( - request.tablet_id, request.tablet_schema.schema_hash, - init_delta_version, 0); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create init delta version. [res=%d version=%ld]", - res, request.version + 1); - return res; - } - } - - olap_table->obtain_header_wrlock(); - olap_table->set_cumulative_layer_point(request.version + 1); - res = olap_table->save_header(); - olap_table->release_header_lock(); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to save header. 
[table=%s]", olap_table->full_name().c_str()); - } - - return res; -} - -} // namespace palo diff --git a/be/src/olap/command_executor.h b/be/src/olap/command_executor.h deleted file mode 100644 index 96b6285173..0000000000 --- a/be/src/olap/command_executor.h +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef BDG_PALO_BE_SRC_OLAP_COMMAND_EXECUTOR_H -#define BDG_PALO_BE_SRC_OLAP_COMMAND_EXECUTOR_H - -#include -#include - -#include "gen_cpp/AgentService_types.h" -#include "gen_cpp/MasterService_types.h" -#include "olap/olap_common.h" -#include "olap/olap_define.h" -#include "olap/olap_rootpath.h" -#include "olap/olap_table.h" -#include "olap/utils.h" - -namespace palo { - -class CommandExecutor { -public: - // Empty constructor and destructor. - CommandExecutor() {} - virtual ~CommandExecutor() {} - // Create new tablet according to request info, - // note that this interface is idempotent. - // - // @param [in] request contains tablet id info and tablet schema - // @return error code - virtual OLAPStatus create_table(const TCreateTabletReq& request); - - // Get specified tablet. - // - // @param [in] tablet_id & schema_hash specify a tablet - // @return SmartOLAPTable point to NULL if tablet not exits - virtual SmartOLAPTable get_table(TTabletId tablet_id, TSchemaHash schema_hash); - - // Drop specified tablet. - // - // @param [in] request specify tablet_id and schema_hash which - // uniquely identify a tablet - // @return error code - virtual OLAPStatus drop_table(const TDropTabletReq& request); - - // Push local data file into specified tablet, - // note that this interface is idempotent. - // - // @param [in] request contains tablet id info and local data path - // @param [out] tablet_info_vec return tablet lastest status, which - // include version info, row count, data size, etc - // @return error code - virtual OLAPStatus push(const TPushReq& request, std::vector* tablet_info_vec); - - // Report tablet detail information including - // version info, row count, data size, etc. - // - // @param [in][out] tablet_info specify tablet_id and schema_hash, - // will be filled with tablet lastest status. - // @return error code - virtual OLAPStatus report_tablet_info(TTabletInfo* tablet_info); - - // Report all tablets detail info in current OLAPEngine. - // - // @param [out] tablets_info - // @return error code - virtual OLAPStatus report_all_tablets_info(std::map* tablets_info); - - // ######################### ALTER TABLE BEGIN ######################### - // The following interfaces are all about alter tablet operation, - // the main logical is that generating a new tablet with different - // schema on base tablet. - - // Create rollup tablet on base tablet, after create_rollup_table, - // both base tablet and new tablet is effective. 
- // - // @param [in] request specify base tablet, new tablet and its schema - // @return OLAP_SUCCESS if submit success - virtual OLAPStatus create_rollup_table(const TAlterTabletReq& request); - - // Do schema change on tablet, OLAPEngine support - // add column, drop column, alter column type and order, - // after schema_change, base tablet is abandoned. - // Note that the two tablets has same tablet_id but different schema_hash - // - // @param [in] request specify base tablet, new tablet and its schema - // @return OLAP_SUCCESS if submit success - virtual OLAPStatus schema_change(const TAlterTabletReq& request); - - // Show status of all alter table operation. - // - // @param [in] tablet_id & schema_hash specify a tablet - // @return alter table status - virtual AlterTableStatus show_alter_table_status(TTabletId tablet_id, TSchemaHash schema_hash); - - // ######################### ALTER TABLE END######################### - - // ######################### CLONE TABLE BEGIN ######################### - // The following interfaces are all about clone tablet operation, - // the main logical: - // 1. make a snapshot of base tablet; - // 2. obtain root path on another olapengine; - // 3. rsync the snapshot to root path by agent; - // 4. release the snapshot after clone process. - - // Make snapshot of base tablet. - // - // @param [in] tablet_id & schema_hash specify base tablet - // @param [out] snapshot_path return snapshot path - // @return error code - virtual OLAPStatus make_snapshot( - TTabletId tablet_id, - TSchemaHash schema_hash, - std::string* snapshot_path); - - virtual OLAPStatus make_snapshot( - const TSnapshotRequest& request, - std::string* snapshot_path); - - // Obtain shard path for new tablet. - // - // @param [out] shard_path choose an available root_path to clone new tablet - // @return error code - virtual OLAPStatus obtain_shard_path( - TStorageMedium::type storage_medium, - std::string* shared_path); - - // Load new tablet to make it effective. - // - // @param [in] root_path specify root path of new tablet - // @param [in] request specify new tablet info - // @return OLAP_SUCCESS if load tablet success - virtual OLAPStatus load_header(const std::string& shard_path, const TCloneReq& request); - virtual OLAPStatus load_header( - const std::string& shard_path, - TTabletId tablet_id, - TSchemaHash schema_hash); - - // Release snapshot of base tablet after clone finished. - // - // @param [in] snapshot_path - // @return error code - virtual OLAPStatus release_snapshot(const std::string& snapshot_path); - - // ######################### CLONE TABLE END ######################### - - // Migrate specified tablet to specified storage media. - // - // @param [in] request specify tablet and destination storage media - // @return error code - virtual OLAPStatus storage_medium_migrate(const TStorageMediumMigrateReq& request); - - // Delete data of specified tablet according to delete conditions, - // once delete_data command submit success, deleted data is not visible, - // but not actually deleted util delay_delete_time run out. - // - // @param [in] request specify tablet and delete conditions - // @param [out] tablet_info_vec return tablet lastest status, which - // include version info, row count, data size, etc - // @return OLAP_SUCCESS if submit delete_data success - virtual OLAPStatus delete_data( - const TPushReq& request, - std::vector* tablet_info_vec); - - // Cancel delete operation before delay_delete_time run out. 
- // - // @param [in] request specify tablet and delete version - // @return OLAP_SUCCESS if cancel success - virtual OLAPStatus cancel_delete(const TCancelDeleteDataReq& request); - - // Start base compaction to expand base delta to version manually. - // - // @param [in] tablet_id & schema_hash specify tablet - // @param [in] version specify base compaction range - // @return OLAP_SUCCESS if start be success - virtual OLAPStatus base_compaction(TTabletId tablet_id, - TSchemaHash schema_hash, - TVersion version); - - // Compute checksum of Version(0,version) to diff between 3 copies. - // - // @param [in] tablet_id & schema_hash specify tablet - // @param [in] version - // @param [out] checksum - // @return error code - virtual OLAPStatus compute_checksum( - TTabletId tablet_id, - TSchemaHash schema_hash, - TVersion version, - TVersionHash version_hash, - uint32_t* checksum); - - // Reload multiple root paths split by ';'. - // - // @param root_paths for example: "/home/disk1/data;/home/disk2/data" - // @return error code - virtual OLAPStatus reload_root_path(const std::string& root_paths); - - // Get all root path state information. - // - // @param root_paths_stat each root path stat including total/used/available capacity - // @return error code - virtual OLAPStatus get_all_root_path_info( - std::vector* root_paths_info, - bool need_capacity = true); - -private: - // Create initial base and delta version. - OLAPStatus _create_init_version(SmartOLAPTable olap_table, const TCreateTabletReq& request); - - DISALLOW_COPY_AND_ASSIGN(CommandExecutor); -}; - -} // namespace palo - -#endif // BDG_PALO_BE_SRC_OLAP_COMMAND_EXECUTOR_H diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index e1019ce430..99c1674dfb 100755 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -30,8 +30,8 @@ using std::vector; namespace palo { -OLAPStatus CumulativeCompaction::init(SmartOLAPTable table) { - OLAP_LOG_TRACE("init cumulative compaction handler. [table=%s]", table->full_name().c_str()); +OLAPStatus CumulativeCompaction::init(OLAPTablePtr table) { + LOG(INFO) << "init cumulative compaction handler. [table=" << table->full_name() << "]"; if (_is_init) { OLAP_LOG_WARNING("cumulative handler has been inited.[table=%s]", @@ -58,24 +58,14 @@ OLAPStatus CumulativeCompaction::init(SmartOLAPTable table) { // 如果为-1,则该table之剿²¡æœ‰è®¾ç½®è¿‡cumulative layer point // 我们在这里设置一下 if (_old_cumulative_layer_point == -1) { - OLAP_LOG_INFO("tablet has an unreasonable cumulative layer point. " - "[tablet='%s' cumulative_layer_point=%d]", - _table->full_name().c_str(), _old_cumulative_layer_point); + LOG(INFO) << "tablet has an unreasonable cumulative layer point. [tablet='" << _table->full_name() + << "' cumulative_layer_point=" << _old_cumulative_layer_point << "]"; _table->release_cumulative_lock(); return OLAP_ERR_CUMULATIVE_INVALID_PARAMETERS; } - OLAPStatus res = OLAP_SUCCESS; - _obtain_header_rdlock(); - res = _check_whether_satisfy_policy(); - _release_header_lock(); - if (res != OLAP_SUCCESS) { - _table->release_cumulative_lock(); - return res; - } - _obtain_header_wrlock(); - res = _calculate_need_merged_versions(); + OLAPStatus res = _calculate_need_merged_versions(); _release_header_lock(); if (res != OLAP_SUCCESS) { _table->release_cumulative_lock(); @@ -143,11 +133,10 @@ OLAPStatus CumulativeCompaction::run() { do { // 3. 
generate the olap index corresponding to the new cumulative file - _new_cumulative_index = new (nothrow) OLAPIndex(_table.get(), + _new_cumulative_index = new (nothrow) Rowset(_table.get(), _cumulative_version, _cumulative_version_hash, - false, - 0, 0); + false, 0, 0); if (_new_cumulative_index == NULL) { OLAP_LOG_WARNING("failed to malloc new cumulative olap index. " "[table=%s; cumulative_version=%d-%d]", _table->full_name().c_str(), @@ -186,31 +175,6 @@ OLAPStatus CumulativeCompaction::run() { return res; } -OLAPStatus CumulativeCompaction::_check_whether_satisfy_policy() { - OLAPStatus res = OLAP_SUCCESS; - - Versions delta_versions; - res = _get_delta_versions(&delta_versions); - if (res != OLAP_SUCCESS) { - OLAP_LOG_TRACE("get delta versions failed when check cumulative policy. [res=%d]", res); - return res; - } - - if (delta_versions.size() < config::cumulative_compaction_num_singleton_deltas) { - OLAP_LOG_TRACE("do not satisfy cumulative policy. " - "[num_existed_singleton_deltas=%d cumulative_compaction_num_singleton_deltas=%d]", - delta_versions.size(), - config::cumulative_compaction_num_singleton_deltas); - return OLAP_ERR_CUMULATIVE_NO_SUITABLE_VERSIONS; - } - - OLAP_LOG_INFO("satisfy cumulative policy." - "[num_existed_singleton_delta=%d cumulative_compaction_num_singleton_deltas=%d]", - delta_versions.size(), - config::cumulative_compaction_num_singleton_deltas); - return OLAP_SUCCESS; -} - OLAPStatus CumulativeCompaction::_calculate_need_merged_versions() { OLAPStatus res = OLAP_SUCCESS; @@ -242,7 +206,7 @@ OLAPStatus CumulativeCompaction::_calculate_need_merged_versions() { } Version delta = delta_versions[index]; - size_t delta_size = _table->get_version_entity_by_version(delta).data_size; + size_t delta_size = _table->get_version_data_size(delta); // if a large delta file or a delete-version file is encountered, then: if (delta_size >= _max_delta_file_size || _table->is_delete_data_version(delta) @@ -270,7 +234,7 @@ OLAPStatus CumulativeCompaction::_calculate_need_merged_versions() { if (need_merged_versions.size() == 1 || total_size == 0) { // if the delta version at the end of the range is relatively large, merge with it if (index < delta_number - && _table->get_version_entity_by_version(delta_versions[index]).data_size >= + && _table->get_version_data_size(delta_versions[index]) >= _max_delta_file_size) { need_merged_versions.push_back(delta_versions[index]); ++index; @@ -348,7 +312,19 @@ OLAPStatus CumulativeCompaction::_get_delta_versions(Versions* delta_versions) { return OLAP_ERR_CUMULATIVE_NO_SUITABLE_VERSIONS; } - sort(delta_versions->begin(), delta_versions->end(), version_comparator); + sort(delta_versions->begin(), delta_versions->end(), version_comparator); + + // can't do cumulative expansion if there is a hole + Versions versions_path; + OLAPStatus select_status = _table->select_versions_to_span( + Version(delta_versions->front().first, delta_versions->back().second), &versions_path); + if (select_status != OLAP_SUCCESS) { + OLAP_LOG_WARNING("can't do cumulative expansion if fail to select shortest version path. " + "[table=%s start=%d; end=%d]", + _table->full_name().c_str(), + delta_versions->front().first, delta_versions->back().second); + return select_status; + } return OLAP_SUCCESS; } @@ -377,7 +353,7 @@ bool CumulativeCompaction::_find_previous_version(const Version current_version, return false; } - size_t data_size = _table->get_version_entity_by_version(*version).data_size; + size_t data_size = _table->get_version_data_size(*version); if (data_size >= _max_delta_file_size) { return false; } @@ -397,7 +373,7 @@ OLAPStatus CumulativeCompaction::_do_cumulative_compaction() { // 1. 
merge delta files into new cumulative file uint64_t merged_rows = 0; uint64_t filted_rows = 0; - res = merger.merge(_data_source, true, &merged_rows, &filted_rows); + res = merger.merge(_data_source, &merged_rows, &filted_rows); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("failed to do cumulative merge. [table=%s; cumulative_version=%d-%d]", _table->full_name().c_str(), @@ -436,7 +412,7 @@ OLAPStatus CumulativeCompaction::_do_cumulative_compaction() { } // 3. add new cumulative file into table - vector unused_indices; + vector unused_indices; _obtain_header_wrlock(); res = _update_header(&unused_indices); if (res != OLAP_SUCCESS) { @@ -460,7 +436,7 @@ OLAPStatus CumulativeCompaction::_do_cumulative_compaction() { // if error happened, roll back OLAPStatus ret = _roll_back(unused_indices); if (ret != OLAP_SUCCESS) { - OLAP_LOG_FATAL("roll back failed. [table=%s]", _table->full_name().c_str()); + LOG(FATAL) << "roll back failed. [table=" << _table->full_name() << "]"; } _release_header_lock(); @@ -481,8 +457,8 @@ OLAPStatus CumulativeCompaction::_do_cumulative_compaction() { return res; } -OLAPStatus CumulativeCompaction::_update_header(vector* unused_indices) { - vector new_indices; +OLAPStatus CumulativeCompaction::_update_header(vector* unused_indices) { + vector new_indices; new_indices.push_back(_new_cumulative_index); OLAPStatus res = OLAP_SUCCESS; @@ -500,14 +476,14 @@ OLAPStatus CumulativeCompaction::_update_header(vector* unused_indic return res; } - return res; + return res; } -void CumulativeCompaction::_delete_unused_delta_files(vector* unused_indices) { +void CumulativeCompaction::_delete_unused_delta_files(vector* unused_indices) { if (!unused_indices->empty()) { - OLAPUnusedIndex* unused_index = OLAPUnusedIndex::get_instance(); + OLAPEngine* unused_index = OLAPEngine::get_instance(); - for (vector::iterator it = unused_indices->begin(); + for (vector::iterator it = unused_indices->begin(); it != unused_indices->end(); ++it) { unused_index->add_unused_index(*it); } @@ -547,23 +523,23 @@ OLAPStatus CumulativeCompaction::_validate_delete_file_action() { return OLAP_SUCCESS; } -OLAPStatus CumulativeCompaction::_roll_back(const vector& old_olap_indices) { +OLAPStatus CumulativeCompaction::_roll_back(const vector& old_olap_indices) { vector need_remove_version; need_remove_version.push_back(_cumulative_version); // unused_indices will only contain new cumulative index // we don't need to delete it here; we will delete new cumulative index in the end. - vector unused_indices; + vector unused_indices; OLAPStatus res = OLAP_SUCCESS; res = _table->replace_data_sources(&need_remove_version, &old_olap_indices, &unused_indices); if (res != OLAP_SUCCESS) { - OLAP_LOG_FATAL("failed to replace data sources. [table=%s]", _table->full_name().c_str()); + LOG(FATAL) << "failed to replace data sources. [table=" << _table->full_name() << "]"; return res; } res = _table->save_header(); if (res != OLAP_SUCCESS) { - OLAP_LOG_FATAL("failed to save header. [table=%s]", _table->full_name().c_str()); + LOG(FATAL) << "failed to save header. 
[table=" << _table->full_name() << "]"; return res; } diff --git a/be/src/olap/cumulative_compaction.h b/be/src/olap/cumulative_compaction.h index 6e17d0d6ec..6afc00b764 100755 --- a/be/src/olap/cumulative_compaction.h +++ b/be/src/olap/cumulative_compaction.h @@ -28,6 +28,8 @@ namespace palo { +class Rowset; + class CumulativeCompaction { public: CumulativeCompaction() : @@ -50,7 +52,7 @@ public: // 返回值: // - 如果触å‘cumulative compaction,返回OLAP_SUCCESS // - å¦åˆ™ï¼Œè¿”å›žå¯¹åº”é”™è¯¯ç  - OLAPStatus init(SmartOLAPTable table); + OLAPStatus init(OLAPTablePtr table); // 执行cumulative compaction // @@ -60,12 +62,6 @@ public: OLAPStatus run(); private: - // æ£€æŸ¥æ˜¯å¦æ»¡è¶³cumulative compaction触å‘ç­–ç•¥ - // - // 返回值: - // - 如果满足,返回OLAP_SUCCESS - // - å¦‚æžœä¸æ»¡è¶³ï¼Œè¿”回OLAP_ERR_CUMULATIVE_NO_SUITABLE_VERSIONS - OLAPStatus _check_whether_satisfy_policy(); // 计算å¯ä»¥åˆå¹¶çš„deltaæ–‡ä»¶ï¼Œä»¥åŠæ–°çš„cumulative层标识点 // @@ -112,13 +108,13 @@ private: // 返回值: // - 如果æˆåŠŸï¼Œè¿”å›žOLAP_SUCCESS // - å¦‚æžœä¸æˆåŠŸï¼Œè¿”å›žç›¸åº”é”™è¯¯ç  - OLAPStatus _update_header(std::vector* unused_indices); + OLAPStatus _update_header(std::vector* unused_indices); // 删除ä¸å†ä½¿ç”¨çš„delta文件 // // è¾“å…¥è¾“å‡ºå‚æ•° // - unused_indices: 待删除的ä¸å†ä½¿ç”¨çš„delta文件对应的olap index - void _delete_unused_delta_files(std::vector* unused_indices); + void _delete_unused_delta_files(std::vector* unused_indices); // 验è¯å¾—到的m_need_merged_versionsæ˜¯å¦æ­£ç¡® // @@ -135,7 +131,7 @@ private: OLAPStatus _validate_delete_file_action(); // æ¢å¤header头文件的文件版本和tableçš„data source - OLAPStatus _roll_back(const std::vector& old_olap_indices); + OLAPStatus _roll_back(const std::vector& old_olap_indices); void _obtain_header_rdlock() { _table->obtain_header_rdlock(); @@ -166,13 +162,13 @@ private: // 当delta文件的大å°è¶…过该值时,我们认为该delta文件是cumulative文件 size_t _max_delta_file_size; // 待执行cumulative compactionçš„olap table - SmartOLAPTable _table; + OLAPTablePtr _table; // æ–°cumulative文件的版本 Version _cumulative_version; // æ–°cumulative文件的version hash VersionHash _cumulative_version_hash; // æ–°cumulative文件对应的olap index - OLAPIndex* _new_cumulative_index; + Rowset* _new_cumulative_index; // å¯åˆå¹¶çš„delta文件的data文件 std::vector _data_source; // å¯åˆå¹¶çš„delta文件的版本 diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index d66e7ec7cf..87089481a9 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -48,7 +48,7 @@ namespace palo { // 1. 删除æ¡ä»¶çš„版本è¦ä¸æ˜¯å½“剿œ€å¤§çš„delta版本å·ï¼Œè¦ä¸æ˜¯æœ€å¤§çš„delta版本å·åŠ 1 // 2. 删除æ¡ä»¶ä¸­æŒ‡å®šçš„列在table中存在,必须是key列,且ä¸èƒ½æ˜¯double,float类型 OLAPStatus DeleteConditionHandler::store_cond( - SmartOLAPTable table, + OLAPTablePtr table, const int32_t version, const vector& conditions) { if (conditions.size() == 0 || _check_version_valid(table, version) != OLAP_SUCCESS) { @@ -60,7 +60,7 @@ OLAPStatus DeleteConditionHandler::store_cond( // 检查删除æ¡ä»¶æ˜¯å¦ç¬¦åˆè¦æ±‚ for (const TCondition& condition : conditions) { - if (_check_condition_valid(table, condition) != OLAP_SUCCESS) { + if (check_condition_valid(table, condition) != OLAP_SUCCESS) { OLAP_LOG_WARNING("invalid condition. 
[%s]", ThriftDebugString(condition).c_str()); return OLAP_ERR_DELETE_INVALID_CONDITION; @@ -68,7 +68,7 @@ OLAPStatus DeleteConditionHandler::store_cond( } int cond_index = _check_whether_condition_exist(table, version); - DeleteDataConditionMessage* del_cond = NULL; + DeleteConditionMessage* del_cond = NULL; if (cond_index == -1) { // 删除æ¡ä»¶ä¸å­˜åœ¨ del_cond = table->add_delete_data_conditions(); @@ -80,29 +80,34 @@ OLAPStatus DeleteConditionHandler::store_cond( // 存储删除æ¡ä»¶ for (const TCondition& condition : conditions) { - string op = condition.condition_op; - if (op == "<") { - op += "<"; - } else if (op == ">") { - op += ">"; - } - string condition_str = ""; - if ("IS" == op) { - condition_str = condition.column_name + " " + op + " " + condition.condition_values[0]; - } else { - condition_str = condition.column_name + op + condition.condition_values[0]; - } + string condition_str = construct_sub_conditions(condition); del_cond->add_sub_conditions(condition_str); - OLAP_LOG_INFO("store one sub-delete condition. [condition=%s]", - condition_str.c_str()); + OLAP_LOG_INFO("store one sub-delete condition. [condition='%s']", + condition_str.c_str()); } return OLAP_SUCCESS; } +string DeleteConditionHandler::construct_sub_conditions(const TCondition& condition) { + string op = condition.condition_op; + if (op == "<") { + op += "<"; + } else if (op == ">") { + op += ">"; + } + string condition_str = ""; + if ("IS" == op) { + condition_str = condition.column_name + " " + op + " " + condition.condition_values[0]; + } else { + condition_str = condition.column_name + op + condition.condition_values[0]; + } + return condition_str; +} + // 删除指定版本å·çš„删除æ¡ä»¶ï¼›éœ€è¦æ³¨æ„的是,如果table上没有任何删除æ¡ä»¶ï¼Œæˆ–者 // 指定版本å·çš„删除æ¡ä»¶ä¸å­˜åœ¨ï¼Œä¹Ÿä¼šè¿”回OLAP_SUCCESS。 -OLAPStatus DeleteConditionHandler::delete_cond(SmartOLAPTable table, +OLAPStatus DeleteConditionHandler::delete_cond(OLAPTablePtr table, const int32_t version, bool delete_smaller_version_conditions) { if (version < 0) { @@ -113,7 +118,6 @@ OLAPStatus DeleteConditionHandler::delete_cond(SmartOLAPTable table, del_cond_array* delete_conditions = table->mutable_delete_data_conditions(); if (delete_conditions->size() == 0) { - OLAP_LOG_INFO("empty delete conditions."); return OLAP_SUCCESS; } @@ -123,7 +127,7 @@ OLAPStatus DeleteConditionHandler::delete_cond(SmartOLAPTable table, // 1. 如果删除æ¡ä»¶çš„版本å·ç­‰äºŽå½¢å‚指定的版本å·ï¼Œåˆ™åˆ é™¤è¯¥ç‰ˆæœ¬çš„æ–‡ä»¶ï¼› // 2. 如果还指定了delete_smaller_version_conditions为trueï¼Œåˆ™åŒæ—¶åˆ é™¤ // 版本å·å°äºŽæŒ‡å®šç‰ˆæœ¬å·çš„删除æ¡ä»¶ï¼›å¦åˆ™ä¸åˆ é™¤ã€‚ - DeleteDataConditionMessage temp = delete_conditions->Get(index); + DeleteConditionMessage temp = delete_conditions->Get(index); if (temp.version() == version || (temp.version() < version && delete_smaller_version_conditions)) { @@ -151,14 +155,14 @@ OLAPStatus DeleteConditionHandler::delete_cond(SmartOLAPTable table, return OLAP_SUCCESS; } -OLAPStatus DeleteConditionHandler::log_conds(SmartOLAPTable table) { +OLAPStatus DeleteConditionHandler::log_conds(OLAPTablePtr table) { OLAP_LOG_INFO("display all delete condition. 
[full_name=%s]", table->full_name().c_str()); table->obtain_header_rdlock(); const del_cond_array& delete_conditions = table->delete_data_conditions(); for (int index = 0; index != delete_conditions.size(); ++index) { - DeleteDataConditionMessage temp = delete_conditions.Get(index); + DeleteConditionMessage temp = delete_conditions.Get(index); string del_cond_str; const RepeatedPtrField& sub_conditions = temp.sub_conditions(); @@ -175,8 +179,8 @@ OLAPStatus DeleteConditionHandler::log_conds(SmartOLAPTable table) { return OLAP_SUCCESS; } -OLAPStatus DeleteConditionHandler::_check_condition_valid( - SmartOLAPTable table, +OLAPStatus DeleteConditionHandler::check_condition_valid( + OLAPTablePtr table, const TCondition& cond) { // 检查指定列å的列是å¦å­˜åœ¨ int field_index = table->get_field_index(cond.column_name); @@ -247,12 +251,12 @@ OLAPStatus DeleteConditionHandler::_check_condition_valid( if (valid_condition) { return OLAP_SUCCESS; } else { - OLAP_LOG_WARNING("invalid condition value. [value=%s]", value_str.c_str()); + LOG(WARNING) << "invalid condition value. [value=" << value_str << "]"; return OLAP_ERR_DELETE_INVALID_CONDITION; } } -OLAPStatus DeleteConditionHandler::_check_version_valid(SmartOLAPTable table, +OLAPStatus DeleteConditionHandler::_check_version_valid(OLAPTablePtr table, const int32_t filter_version) { // æ‰¾åˆ°å½“å‰æœ€å¤§çš„deltaæ–‡ä»¶ç‰ˆæœ¬å· vector all_file_versions; @@ -275,7 +279,7 @@ OLAPStatus DeleteConditionHandler::_check_version_valid(SmartOLAPTable table, } } -int DeleteConditionHandler::_check_whether_condition_exist(SmartOLAPTable table, int cond_version) { +int DeleteConditionHandler::_check_whether_condition_exist(OLAPTablePtr table, int cond_version) { const del_cond_array& delete_conditions = table->delete_data_conditions(); if (delete_conditions.size() == 0) { @@ -285,7 +289,7 @@ int DeleteConditionHandler::_check_whether_condition_exist(SmartOLAPTable table, int index = 0; while (index != delete_conditions.size()) { - DeleteDataConditionMessage temp = delete_conditions.Get(index); + DeleteConditionMessage temp = delete_conditions.Get(index); if (temp.version() == cond_version) { return index; @@ -314,7 +318,7 @@ bool DeleteHandler::_parse_condition(const std::string& condition_str, TConditio matched = false; } } catch (regex_error& e) { - OLAP_LOG_DEBUG("fail to parse expr. [expr=%s; error=%s]", condition_str.c_str(), e.what()); + VLOG(3) << "fail to parse expr. 
[expr=" << condition_str << "; error=" << e.what() << "]"; matched = false; } @@ -329,7 +333,7 @@ bool DeleteHandler::_parse_condition(const std::string& condition_str, TConditio return true; } -OLAPStatus DeleteHandler::init(SmartOLAPTable olap_table, int32_t version) { +OLAPStatus DeleteHandler::init(OLAPTablePtr olap_table, int32_t version) { if (_is_inited) { OLAP_LOG_WARNING("reintialize delete handler."); return OLAP_ERR_INIT_FAILED; diff --git a/be/src/olap/delete_handler.h b/be/src/olap/delete_handler.h index d1cdd7e3a9..1d6e8ba822 100644 --- a/be/src/olap/delete_handler.h +++ b/be/src/olap/delete_handler.h @@ -46,11 +46,15 @@ namespace palo { // * 在调用log_conds()的时候,åªéœ€è¦åŠ è¯»é” class DeleteConditionHandler { public: - typedef google::protobuf::RepeatedPtrField del_cond_array; + typedef google::protobuf::RepeatedPtrField del_cond_array; DeleteConditionHandler() {} ~DeleteConditionHandler() {} + // 检查cond表示的删除æ¡ä»¶æ˜¯å¦ç¬¦åˆè¦æ±‚ï¼› + // 如果ä¸ç¬¦åˆè¦æ±‚,返回OLAP_ERR_DELETE_INVALID_CONDITION;符åˆè¦æ±‚返回OLAP_SUCCESS + OLAPStatus check_condition_valid(OLAPTablePtr table, const TCondition& cond); + // 存储指定版本å·çš„删除æ¡ä»¶åˆ°Header文件中。因此,调用之å‰éœ€è¦å¯¹Headeræ–‡ä»¶åŠ å†™é” // // è¾“å…¥å‚æ•°ï¼š @@ -62,10 +66,13 @@ public: // * OLAP_ERR_DELETE_INVALID_PARAMETERSï¼šå‡½æ•°å‚æ•°ä¸ç¬¦åˆè¦æ±‚ // * OLAP_ERR_DELETE_INVALID_CONDITION:del_conditionä¸ç¬¦åˆè¦æ±‚ OLAPStatus store_cond( - SmartOLAPTable table, + OLAPTablePtr table, const int32_t version, const std::vector& conditions); + // construct sub condition from TCondition + std::string construct_sub_conditions(const TCondition& condition); + // 从Header文件中移除特定版本å·çš„删除æ¡ä»¶ã€‚在调用之å‰éœ€è¦å¯¹Headeræ–‡ä»¶åŠ å†™é” // // è¾“å…¥å‚æ•°ï¼š @@ -81,7 +88,7 @@ public: // * 这个表没有指定版本å·çš„删除æ¡ä»¶ // * OLAP_ERR_DELETE_INVALID_PARAMETERSï¼šå‡½æ•°å‚æ•°ä¸ç¬¦åˆè¦æ±‚ OLAPStatus delete_cond( - SmartOLAPTable table, const int32_t version, bool delete_smaller_version_conditions); + OLAPTablePtr table, const int32_t version, bool delete_smaller_version_conditions); // 将一个olap engine的表上存有的所有删除æ¡ä»¶æ‰“å°åˆ°log中。调用å‰åªéœ€è¦ç»™Headeræ–‡ä»¶åŠ è¯»é” // @@ -89,18 +96,15 @@ public: // table: è¦æ‰“å°åˆ é™¤æ¡ä»¶çš„olap engine表 // 返回值: // OLAP_SUCCESS:调用æˆåŠŸ - OLAPStatus log_conds(SmartOLAPTable table); + OLAPStatus log_conds(OLAPTablePtr table); private: - // 检查cond表示的删除æ¡ä»¶æ˜¯å¦ç¬¦åˆè¦æ±‚ï¼› - // 如果ä¸ç¬¦åˆè¦æ±‚,返回OLAP_ERR_DELETE_INVALID_CONDITION;符åˆè¦æ±‚返回OLAP_SUCCESS - OLAPStatus _check_condition_valid(SmartOLAPTable table, const TCondition& cond); // 检查指定的删除æ¡ä»¶ç‰ˆæœ¬æ˜¯å¦ç¬¦åˆè¦æ±‚ï¼› // 如果ä¸ç¬¦åˆè¦æ±‚,返回OLAP_ERR_DELETE_INVALID_VERSION;符åˆè¦æ±‚返回OLAP_SUCCESS - OLAPStatus _check_version_valid(SmartOLAPTable table, const int32_t filter_version); + OLAPStatus _check_version_valid(OLAPTablePtr table, const int32_t filter_version); // 检查指定版本的删除æ¡ä»¶æ˜¯å¦å·²ç»å­˜åœ¨ã€‚如果存在,返回指定版本删除æ¡ä»¶çš„æ•°ç»„下标;ä¸å­˜åœ¨è¿”回-1 - int _check_whether_condition_exist(SmartOLAPTable, int cond_version); + int _check_whether_condition_exist(OLAPTablePtr, int cond_version); }; // 表示一个删除æ¡ä»¶ @@ -129,7 +133,7 @@ struct DeleteConditions { class DeleteHandler { public: typedef std::vector::size_type cond_num_t; - typedef google::protobuf::RepeatedPtrField del_cond_array; + typedef google::protobuf::RepeatedPtrField del_cond_array; DeleteHandler() : _is_inited(false) {} ~DeleteHandler() {} @@ -148,7 +152,7 @@ public: // * OLAP_SUCCESS: 调用æˆåŠŸ // * OLAP_ERR_DELETE_INVALID_PARAMETERS: 傿•°ä¸ç¬¦åˆè¦æ±‚ // * OLAP_ERR_MALLOC_ERROR: 在填充_del_conds时,分é…内存失败 - OLAPStatus init(SmartOLAPTable olap_table, int32_t 
version); + OLAPStatus init(OLAPTablePtr olap_table, int32_t version); // åˆ¤å®šä¸€æ¡æ•°æ®æ˜¯å¦ç¬¦åˆåˆ é™¤æ¡ä»¶ // diff --git a/be/src/olap/field.h b/be/src/olap/field.h index 8c7ae50ba5..708a7feb12 100644 --- a/be/src/olap/field.h +++ b/be/src/olap/field.h @@ -74,6 +74,7 @@ public: // 返回-1,0,1,分别代表当å‰fieldå°äºŽï¼Œç­‰äºŽï¼Œå¤§äºŽä¼ å…¥å‚数中的field inline int cmp(char* left, char* right) const; + inline int cmp(char* left, bool r_null, char* right) const; inline int index_cmp(char* left, char* right) const; inline bool equal(char* left, char* right); @@ -82,6 +83,7 @@ public: inline void copy_with_pool(char* dest, const char* src, MemPool* mem_pool); inline void copy_without_pool(char* dest, const char* src); + inline void copy_without_pool(char* dest, bool is_null, const char* src); inline void agg_init(char* dest, const char* src); // copy filed content from src to dest without nullbyte @@ -127,6 +129,15 @@ inline int Field::cmp(char* left, char* right) const { } } +inline int Field::cmp(char* left, bool r_null, char* right) const { + bool l_null = *reinterpret_cast(left); + if (l_null != r_null) { + return l_null ? -1 : 1; + } else { + return l_null ? 0 : (_type_info->cmp(left + 1, right)); + } +} + inline int Field::index_cmp(char* left, char* right) const { bool l_null = *reinterpret_cast(left); bool r_null = *reinterpret_cast(right); @@ -170,7 +181,7 @@ inline bool Field::equal(char* left, char* right) { } inline void Field::aggregate(char* dest, char* src) { - _aggregate_func(dest, src); + _aggregate_func(dest, src, nullptr); } inline void Field::finalize(char* data) { @@ -198,6 +209,14 @@ inline void Field::copy_without_pool(char* dest, const char* src) { return _type_info->copy_without_pool(dest + 1, src + 1); } +inline void Field::copy_without_pool(char* dest, bool is_null, const char* src) { + *reinterpret_cast(dest) = is_null; + if (is_null) { + return; + } + return _type_info->copy_without_pool(dest + 1, src); +} + inline void Field::agg_init(char* dest, const char* src) { if (OLAP_LIKELY(_type != OLAP_FIELD_TYPE_HLL)) { copy_without_pool(dest, src); diff --git a/be/src/olap/field_info.cpp b/be/src/olap/field_info.cpp index 1e39e552f2..e94ba68722 100644 --- a/be/src/olap/field_info.cpp +++ b/be/src/olap/field_info.cpp @@ -69,7 +69,7 @@ FieldType FieldInfo::get_field_type_by_string(const string& type_str) { } else if (0 == upper_type_str.compare("MAP")) { type = OLAP_FIELD_TYPE_MAP; } else { - OLAP_LOG_WARNING("invalid type string. [type='%s']", type_str.c_str()); + LOG(WARNING) << "invalid type string. [type='" << type_str << "']"; type = OLAP_FIELD_TYPE_UNKNOWN; } @@ -94,7 +94,7 @@ FieldAggregationMethod FieldInfo::get_aggregation_type_by_string(const string& s } else if (0 == upper_str.compare("HLL_UNION")) { aggregation_type = OLAP_FIELD_AGGREGATION_HLL_UNION; } else { - OLAP_LOG_WARNING("invalid aggregation type string. [aggregation='%s']", str.c_str()); + LOG(WARNING) << "invalid aggregation type string. 
[aggregation='" << str << "']"; aggregation_type = OLAP_FIELD_AGGREGATION_UNKNOWN; } diff --git a/be/src/olap/file_helper.cpp b/be/src/olap/file_helper.cpp index b3d0aa7eb1..7b1db57e2a 100644 --- a/be/src/olap/file_helper.cpp +++ b/be/src/olap/file_helper.cpp @@ -39,7 +39,6 @@ FileHandler::FileHandler() : _file_name(""), _is_using_cache(false), _cache_handle(NULL) { - _fd_cache = OLAPEngine::get_instance()->file_descriptor_lru_cache(); } FileHandler::~FileHandler() { @@ -58,13 +57,17 @@ OLAPStatus FileHandler::open(const string& file_name, int flag) { _fd = ::open(file_name.c_str(), flag); if (_fd < 0) { - OLAP_LOG_WARNING("failed to open file. [err=%m file_name='%s' flag=%d]", - file_name.c_str(), flag); + char errmsg[64]; + LOG(WARNING) << "failed to open file. [err=" << strerror_r(errno, errmsg, 64) + << ", file_name='" << file_name << "' flag=" << flag << "]"; + if (errno == EEXIST) { + return OLAP_ERR_FILE_ALREADY_EXIST; + } return OLAP_ERR_IO_ERROR; } - OLAP_LOG_DEBUG("success to open file. [file_name='%s' flag=%d fd=%d]", - file_name.c_str(), flag, _fd); + VLOG(3) << "success to open file. [file_name='" + << file_name << "' flag=" << flag << " fd=" << _fd << "]"; _is_using_cache = false; _file_name = file_name; return OLAP_SUCCESS; @@ -80,25 +83,30 @@ OLAPStatus FileHandler::open_with_cache(const string& file_name, int flag) { } CacheKey key(file_name.c_str(), file_name.size()); - _cache_handle = _fd_cache->lookup(key); + Cache* fd_cache = OLAPEngine::get_instance()->file_descriptor_lru_cache(); + _cache_handle = fd_cache->lookup(key); if (NULL != _cache_handle) { FileDescriptor* file_desc = - reinterpret_cast(_fd_cache->value(_cache_handle)); + reinterpret_cast(fd_cache->value(_cache_handle)); _fd = file_desc->fd; - OLAP_LOG_DEBUG("success to open file with cache. [file_name='%s' flag=%d fd=%d]", - file_name.c_str(), flag, _fd); + VLOG(3) << "success to open file with cache. [file_name='" << file_name + << "' flag=" << flag << " fd=" << _fd << "]"; } else { _fd = ::open(file_name.c_str(), flag); if (_fd < 0) { - OLAP_LOG_WARNING("failed to open file. [err=%m file_name='%s' flag=%d]", - file_name.c_str(), flag); + char errmsg[64]; + LOG(WARNING) << "failed to open file. [err=" << strerror_r(errno, errmsg, 64) + << " file_name='" << file_name << "' flag=" << flag << "]"; + if (errno == EEXIST) { + return OLAP_ERR_FILE_ALREADY_EXIST; + } return OLAP_ERR_IO_ERROR; } FileDescriptor* file_desc = new FileDescriptor(_fd); - _cache_handle = _fd_cache->insert( + _cache_handle = fd_cache->insert( key, file_desc, 1, &_delete_cache_file_descriptor); - OLAP_LOG_DEBUG("success to open file. [file_name='%s' flag=%d fd=%d]", + OLAP_LOG_DEBUG("success to open file with cache. [file_name='%s' flag=%d fd=%d]", file_name.c_str(), flag, _fd); } _is_using_cache = true; @@ -118,8 +126,14 @@ OLAPStatus FileHandler::open_with_mode(const string& file_name, int flag, int mo _fd = ::open(file_name.c_str(), flag, mode); if (_fd < 0) { - OLAP_LOG_WARNING("failed to open file. [err=%m file_name='%s' flag=%d mode=%d]", - file_name.c_str(), flag, mode); + char err_buf[64]; + LOG(WARNING) << "failed to open file. 
[err=" << strerror_r(errno, err_buf, 64) + << " file_name='" << file_name + << "' flag=" << flag + << " mode=" << mode << "]"; + if (errno == EEXIST) { + return OLAP_ERR_FILE_ALREADY_EXIST; + } return OLAP_ERR_IO_ERROR; } @@ -130,7 +144,8 @@ OLAPStatus FileHandler::open_with_mode(const string& file_name, int flag, int mo } OLAPStatus FileHandler::release() { - _fd_cache->release(_cache_handle); + Cache* fd_cache = OLAPEngine::get_instance()->file_descriptor_lru_cache(); + fd_cache->release(_cache_handle); _cache_handle = NULL; _is_using_cache = false; return OLAP_SUCCESS; @@ -154,8 +169,9 @@ OLAPStatus FileHandler::close() { // 在一些æžç«¯æƒ…况下(fdå¯ç”¨,但fsync失败)å¯èƒ½é€ æˆå¥æŸ„æ³„æ¼ if (::close(_fd) < 0) { - OLAP_LOG_WARNING("failed to close file. [err=%m file_name='%s' fd=%d]", - _file_name.c_str(), _fd); + char errmsg[64]; + LOG(WARNING) << "failed to close file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' fd=" << _fd << "]"; return OLAP_ERR_IO_ERROR; } } @@ -175,14 +191,16 @@ OLAPStatus FileHandler::pread(void* buf, size_t size, size_t offset) { ssize_t rd_size = ::pread(_fd, ptr, size, offset); if (rd_size < 0) { - OLAP_LOG_WARNING("failed to pread from file. " - "[err=%m file_name='%s' fd=%d size=%ld offset=%ld]", - _file_name.c_str(), _fd, size, offset); + char errmsg[64]; + LOG(WARNING) << "failed to pread from file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' fd=" << _fd << " size=" << size + << " offset=" << offset << "]"; return OLAP_ERR_IO_ERROR; } else if (0 == rd_size) { - OLAP_LOG_WARNING("read unenough from file. " - "[err=%m file_name='%s' fd=%d size=%ld offset=%ld]", - _file_name.c_str(), _fd, size, offset); + char errmsg[64]; + LOG(WARNING) << "read unenough from file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' fd=" << _fd << " size=" << size + << " offset=" << offset << "]"; return OLAP_ERR_READ_UNENOUGH; } @@ -202,13 +220,16 @@ OLAPStatus FileHandler::write(const void* buf, size_t buf_size) { ssize_t wr_size = ::write(_fd, ptr, buf_size); if (wr_size < 0) { - OLAP_LOG_WARNING("failed to write to file. [err=%m file_name='%s' fd=%d size=%ld]", - _file_name.c_str(), _fd, buf_size); + char errmsg[64]; + LOG(WARNING) << "failed to write to file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' fd=" << _fd + << " size=" << buf_size << "]"; return OLAP_ERR_IO_ERROR; } else if (0 == wr_size) { - OLAP_LOG_WARNING("write unenough to file. " - "[err=%m file_name='%s' fd=%d size=%ld]", - _file_name.c_str(), _fd, buf_size); + char errmsg[64]; + LOG(WARNING) << "write unenough to file. [err=" << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' fd=" << _fd + << " size=" << buf_size << "]"; return OLAP_ERR_IO_ERROR; } @@ -229,18 +250,21 @@ OLAPStatus FileHandler::write(const void* buf, size_t buf_size) { OLAPStatus FileHandler::pwrite(const void* buf, size_t buf_size, size_t offset) { const char* ptr = reinterpret_cast(buf); + size_t org_buf_size = buf_size; while (buf_size > 0) { ssize_t wr_size = ::pwrite(_fd, ptr, buf_size, offset); if (wr_size < 0) { - OLAP_LOG_WARNING("failed to pwrite to file. " - "[err=%m file_name='%s' fd=%d size=%ld offset=%ld]", - _file_name.c_str(), _fd, buf_size, offset); + char errmsg[64]; + LOG(WARNING) << "failed to pwrite to file. 
[err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' fd=" << _fd << " size=" << buf_size + << " offset=" << offset << "]"; return OLAP_ERR_IO_ERROR; } else if (0 == wr_size) { - OLAP_LOG_WARNING("pwrite unenough to file. " - "[err=%m file_name='%s' fd=%d size=%ld offset=%ld]", - _file_name.c_str(), _fd, buf_size, offset); + char errmsg[64]; + LOG(WARNING) << "pwrite unenough to file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' fd=" << _fd << " size=" << buf_size + << " offset=" << offset << "]"; return OLAP_ERR_IO_ERROR; } @@ -248,6 +272,7 @@ OLAPStatus FileHandler::pwrite(const void* buf, size_t buf_size, size_t offset) ptr += wr_size; offset += wr_size; } + _wr_length += org_buf_size; return OLAP_SUCCESS; } @@ -284,8 +309,12 @@ OLAPStatus FileHandlerWithBuf::open(const string& file_name, const char* mode) { _fp = ::fopen(file_name.c_str(), mode); if (NULL == _fp) { - OLAP_LOG_WARNING("failed to open file. [err=%m file_name='%s' flag='%s']", - file_name.c_str(), mode); + char errmsg[64]; + LOG(WARNING) << "failed to open file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << file_name << "' flag='" << mode << "']"; + if (errno == EEXIST) { + return OLAP_ERR_FILE_ALREADY_EXIST; + } return OLAP_ERR_IO_ERROR; } @@ -306,8 +335,9 @@ OLAPStatus FileHandlerWithBuf::close() { // 在一些æžç«¯æƒ…况下(fdå¯ç”¨,但fsync失败)å¯èƒ½é€ æˆå¥æŸ„æ³„æ¼ if (::fclose(_fp) != 0) { - OLAP_LOG_WARNING("failed to close file. [err=%m file_name='%s']", - _file_name.c_str()); + char errmsg[64]; + LOG(WARNING) << "failed to close file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "']"; return OLAP_ERR_IO_ERROR; } @@ -327,12 +357,15 @@ OLAPStatus FileHandlerWithBuf::read(void* buf, size_t size) { if (rd_size == size) { return OLAP_SUCCESS; } else if (::feof(_fp)) { - OLAP_LOG_WARNING("read unenough from file. [err=%m file_name='%s' size=%ld rd_size=%ld]", - _file_name.c_str(), size, rd_size); + char errmsg[64]; + LOG(WARNING) << "read unenough from file. [err=" << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' size=" << size + << " rd_size=" << rd_size << "]"; return OLAP_ERR_READ_UNENOUGH; } else { - OLAP_LOG_WARNING("failed to read from file. [err=%m file_name='%s' size=%ld]", - _file_name.c_str(), size); + char errmsg[64]; + LOG(WARNING) << "failed to read from file. [err=" << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' size=" << size << "]"; return OLAP_ERR_IO_ERROR; } } @@ -344,8 +377,10 @@ OLAPStatus FileHandlerWithBuf::pread(void* buf, size_t size, size_t offset) { } if (0 != ::fseek(_fp, offset, SEEK_SET)) { - OLAP_LOG_WARNING("failed to seek file. [err=%m file_name='%s' size=%ld offset=%ld]", - _file_name.c_str(), size, offset); + char errmsg[64]; + LOG(WARNING) << "failed to seek file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' size=" << size + << " offset=" << offset << "]"; return OLAP_ERR_IO_ERROR; } @@ -361,8 +396,9 @@ OLAPStatus FileHandlerWithBuf::write(const void* buf, size_t buf_size) { size_t wr_size = ::fwrite(buf, 1, buf_size, _fp); if (wr_size != buf_size) { - OLAP_LOG_WARNING("failed to write to file. [err=%m file_name='%s' size=%ld]", - _file_name.c_str(), buf_size); + char errmsg[64]; + LOG(WARNING) << "failed to write to file. 
[err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' size=" << buf_size << "]"; return OLAP_ERR_IO_ERROR; } @@ -376,8 +412,10 @@ OLAPStatus FileHandlerWithBuf::pwrite(const void* buf, size_t buf_size, size_t o } if (0 != ::fseek(_fp, offset, SEEK_SET)) { - OLAP_LOG_WARNING("failed to seek file. [err=%m file_name='%s' size=%ld offset=%ld]", - _file_name.c_str(), buf_size, offset); + char errmsg[64]; + LOG(WARNING) << "failed to seek file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' size=" << buf_size + << " offset=" << offset << "]"; return OLAP_ERR_IO_ERROR; } @@ -388,7 +426,7 @@ off_t FileHandlerWithBuf::length() const { struct stat stat_data; if (stat(_file_name.c_str(), &stat_data) < 0) { - OLAP_LOG_WARNING("fstat error. [file_name='%s']", _file_name.c_str()); + LOG(WARNING) << "fstat error. [file_name='" << _file_name << "']"; return -1; } diff --git a/be/src/olap/file_helper.h b/be/src/olap/file_helper.h index cfe17f45dc..b4bccdbd5c 100644 --- a/be/src/olap/file_helper.h +++ b/be/src/olap/file_helper.h @@ -27,6 +27,7 @@ #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/utils.h" +#include "util/debug_util.h" namespace palo { @@ -62,8 +63,9 @@ public: off_t res = -1; if (-1 == (res = lseek(_fd, 0, SEEK_CUR))) { - OLAP_LOG_WARNING("fail to tell file. [err=%m file_name='%s' fd=%d]", - _file_name.c_str(), _fd); + char errmsg[64]; + LOG(WARNING) << "fail to tell file. [err=" << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' fd=" << _fd << "]"; } return res; @@ -75,9 +77,10 @@ public: off_t res = -1; if (-1 == (res = lseek(_fd, offset, whence))) { - OLAP_LOG_WARNING("fail to seek file. [err=%m file_name='%s' fd=%d " - "offset=%ld whence=%d]", - _file_name.c_str(), _fd, offset, whence); + char errmsg[64]; + LOG(WARNING) << "fail to seek file. [err=" << strerror_r(errno, errmsg, 64) + << "file_name='" << _file_name + << "' fd=" << _fd << " offset=" << offset << " whence=" << whence << "]"; } return res; @@ -103,7 +106,6 @@ private: std::string _file_name; bool _is_using_cache; Cache::Handle* _cache_handle; - Cache* _fd_cache; }; class FileHandlerWithBuf { @@ -124,8 +126,9 @@ public: int32_t sync() { int32_t res = -1; if (0 != (res = ::fflush(_fp))) { - OLAP_LOG_WARNING("fail to fsync file. [err=%m file_name='%s']", - _file_name.c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to fsync file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "']"; } return res; } @@ -133,8 +136,9 @@ public: off_t tell() const { off_t res = -1; if (-1 == (res = ::ftell(_fp))) { - OLAP_LOG_WARNING("fail to tell file. [err=%m file_name='%s']", - _file_name.c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to tell file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "']"; } return res; } @@ -144,9 +148,10 @@ public: off_t seek(off_t offset, int whence) { off_t res = -1; if (-1 == (res = ::fseek(_fp, offset, whence))) { - OLAP_LOG_WARNING("fail to seek file. [err=%m file_name='%s' " - "offset=%ld whence=%d]", - _file_name.c_str(), offset, whence); + char errmsg[64]; + LOG(WARNING) << "fail to seek file. [err=" << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name + << "' offset=" << offset << " whence=" << whence << "]"; } return res; } @@ -262,13 +267,13 @@ OLAPStatus FileHeader::prepare( try { if (!_proto.SerializeToString(&_proto_string)) { - OLAP_LOG_WARNING("serialize file header to string error. 
[path='%s']", - file_handler->file_name().c_str()); + LOG(WARNING) << "serialize file header to string error. [path='" + << file_handler->file_name() << "']"; return OLAP_ERR_SERIALIZE_PROTOBUF_ERROR; } } catch (...) { - OLAP_LOG_WARNING("serialize file header to string error. [path='%s']", - file_handler->file_name().c_str()); + LOG(WARNING) << "serialize file header to string error. [path='" + << file_handler->file_name() << "']"; return OLAP_ERR_SERIALIZE_PROTOBUF_ERROR; } @@ -294,22 +299,28 @@ OLAPStatus FileHeader::serialize( // 写入文件 if (OLAP_SUCCESS != file_handler->pwrite(&_fixed_file_header, _fixed_file_header_size, 0)) { - OLAP_LOG_WARNING("fail to write fixed header to file. [file='%s' message='%m']", - file_handler->file_name().c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to write fixed header to file. [file='" + << file_handler->file_name() + << "' err=" << strerror_r(errno, errmsg, 64) << "]"; return OLAP_ERR_IO_ERROR; } if (OLAP_SUCCESS != file_handler->pwrite(&_extra_fixed_header, sizeof(_extra_fixed_header), _fixed_file_header_size)) { - OLAP_LOG_WARNING("fail to write extra fixed header to file. [file='%s' message='%m']", - file_handler->file_name().c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to write extra fixed header to file. [file='" + << file_handler->file_name() + << "' err=" << strerror_r(errno, errmsg, 64) << "]"; return OLAP_ERR_IO_ERROR; } if (OLAP_SUCCESS != file_handler->pwrite(_proto_string.c_str(), _proto_string.size(), _fixed_file_header_size + sizeof(_extra_fixed_header))) { - OLAP_LOG_WARNING("fail to write proto header to file. [file='%s' message='%m']", - file_handler->file_name().c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to write proto header to file. [file='" + << file_handler->file_name() + << "' err='" << strerror_r(errno, errmsg, 64) << "']"; return OLAP_ERR_IO_ERROR; } @@ -328,8 +339,10 @@ OLAPStatus FileHeader::unserialize( if (OLAP_SUCCESS != file_handler->pread(&_fixed_file_header, _fixed_file_header_size, 0)) { - OLAP_LOG_WARNING("fail to load header structure from file. [file='%s' message='%m']", - file_handler->file_name().c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to load header structure from file. [file='" + << file_handler->file_name() + << "' err='" << strerror_r(errno, errmsg, 64) << "']"; return OLAP_ERR_IO_ERROR; } @@ -341,8 +354,10 @@ OLAPStatus FileHeader::unserialize( if (OLAP_SUCCESS != file_handler->pread(&tmp_header, sizeof(tmp_header), 0)) { - OLAP_LOG_WARNING("fail to load header structure from file. [file='%s' message='%m']", - file_handler->file_name().c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to load header structure from file. [file='" + << file_handler->file_name() + << "' err='" << strerror_r(errno, errmsg, 64) << "']"; return OLAP_ERR_IO_ERROR; } @@ -365,32 +380,38 @@ OLAPStatus FileHeader::unserialize( if (OLAP_SUCCESS != file_handler->pread(&_extra_fixed_header, sizeof(_extra_fixed_header), _fixed_file_header_size)) { - OLAP_LOG_WARNING("fail to load extra fixed header from file. [file='%s' message='%m']", - file_handler->file_name().c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to load extra fixed header from file. [file='" + << file_handler->file_name() + << "' err='" << strerror_r(errno, errmsg, 64) << "']"; return OLAP_ERR_IO_ERROR; } std::unique_ptr buf(new(std::nothrow) char[_fixed_file_header.protobuf_length]); if (NULL == buf.get()) { - OLAP_LOG_WARNING("malloc protobuf buf error. 
[file='%s' message='%m']", - file_handler->file_name().c_str()); + char errmsg[64]; + LOG(WARNING) << "malloc protobuf buf error. [file='" + << file_handler->file_name() + << "' err='" << strerror_r(errno, errmsg, 64) << "']"; return OLAP_ERR_MALLOC_ERROR; } if (OLAP_SUCCESS != file_handler->pread(buf.get(), _fixed_file_header.protobuf_length, _fixed_file_header_size + sizeof(_extra_fixed_header))) { - OLAP_LOG_WARNING("fail to load protobuf from file. [file='%s' message='%m']", - file_handler->file_name().c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to load protobuf from file. [file='" + << file_handler->file_name() + << "' err='" << strerror_r(errno, errmsg, 64) << "']"; return OLAP_ERR_IO_ERROR; } real_file_length = file_handler->length(); if (file_length() != static_cast(real_file_length)) { - OLAP_LOG_WARNING("file length is not match. [file='%s' file_length=%lu" - " real_file_length=%ld]", - file_handler->file_name().c_str(), file_length(), real_file_length); + LOG(WARNING) << "file length is not match. [file='" << file_handler->file_name() + << "' file_length=" << file_length() + << " real_file_length=" << real_file_length << "]"; return OLAP_ERR_FILE_DATA_ERROR; } @@ -399,10 +420,9 @@ OLAPStatus FileHeader::unserialize( buf.get(), _fixed_file_header.protobuf_length); if (real_protobuf_checksum != _fixed_file_header.protobuf_checksum) { - OLAP_LOG_WARNING("checksum is not match. [file='%s' expect=%d actual=%d]", - file_handler->file_name().c_str(), - _fixed_file_header.protobuf_checksum, - real_protobuf_checksum); + LOG(WARNING) << "checksum is not match. [file='" << file_handler->file_name() + << "' expect=" << _fixed_file_header.protobuf_checksum + << " actual=" << real_protobuf_checksum << "]"; return OLAP_ERR_CHECKSUM_ERROR; } @@ -410,12 +430,12 @@ OLAPStatus FileHeader::unserialize( std::string protobuf_str(buf.get(), _fixed_file_header.protobuf_length); if (!_proto.ParseFromString(protobuf_str)) { - OLAP_LOG_WARNING("fail to parse file content to protobuf object. [file='%s']", - file_handler->file_name().c_str()); + LOG(WARNING) << "fail to parse file content to protobuf object. [file='" + << file_handler->file_name() << "']"; return OLAP_ERR_PARSE_PROTOBUF_ERROR; } } catch (...) { - OLAP_LOG_WARNING("fail to load protobuf. [file='%s']", file_handler->file_name().c_str()); + LOG(WARNING) << "fail to load protobuf. [file='" << file_handler->file_name() << "']"; return OLAP_ERR_PARSE_PROTOBUF_ERROR; } @@ -429,13 +449,14 @@ OLAPStatus FileHeader::validate( OLAPStatus res = OLAP_SUCCESS; if (OLAP_SUCCESS != file_handler.open(filename.c_str(), O_RDONLY)) { - OLAP_LOG_WARNING("fail to open file. [file='%s' message='%m']", - filename.c_str()); + char errmsg[64]; + LOG(WARNING) << "fail to open file. [file='" << filename + << "' err=" << strerror_r(errno, errmsg, 64) << "]"; return OLAP_ERR_IO_ERROR; } if (OLAP_SUCCESS != (res = unserialize(&file_handler))) { - OLAP_LOG_WARNING("unserialize file header error. [file='%s']", filename.c_str()); + LOG(WARNING) << "unserialize file header error. 
[file='" << filename << "']"; return res; } diff --git a/be/src/olap/hll.cpp b/be/src/olap/hll.cpp index 78795c696c..e45959488e 100644 --- a/be/src/olap/hll.cpp +++ b/be/src/olap/hll.cpp @@ -39,8 +39,8 @@ void HllSetResolver::parse() { // first byte : type // second~five byte : hash values's number // five byte later : hash value - _expliclit_num = (ExpliclitLengthValueType) (pdata[sizeof(SetTypeValueType)]); - _expliclit_value = (uint64_t*)(pdata + sizeof(SetTypeValueType) + _explicit_num = (ExpliclitLengthValueType) (pdata[sizeof(SetTypeValueType)]); + _explicit_value = (uint64_t*)(pdata + sizeof(SetTypeValueType) + sizeof(ExpliclitLengthValueType)); break; case HLL_DATA_SPRASE: @@ -70,8 +70,8 @@ void HllSetResolver::parse() { void HllSetResolver::fill_registers(char* registers, int len) { if (_set_type == HLL_DATA_EXPLICIT) { - for (int i = 0; i < get_expliclit_count(); ++i) { - uint64_t hash_value = get_expliclit_value(i); + for (int i = 0; i < get_explicit_count(); ++i) { + uint64_t hash_value = get_explicit_value(i); int idx = hash_value % len; uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_COLUMN_PRECISION) + 1; registers[idx] = std::max((uint8_t)registers[idx], first_one_bit); @@ -96,8 +96,8 @@ void HllSetResolver::fill_registers(char* registers, int len) { void HllSetResolver::fill_index_to_value_map(std::map* index_to_value, int len) { if (_set_type == HLL_DATA_EXPLICIT) { - for (int i = 0; i < get_expliclit_count(); ++i) { - uint64_t hash_value = get_expliclit_value(i); + for (int i = 0; i < get_explicit_count(); ++i) { + uint64_t hash_value = get_explicit_value(i); int idx = hash_value % len; uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_COLUMN_PRECISION) + 1; if (index_to_value->find(idx) != index_to_value->end()) { @@ -136,8 +136,8 @@ void HllSetResolver::fill_index_to_value_map(std::map* index_to_va void HllSetResolver::fill_hash64_set(std::set* hash_set) { if (_set_type == HLL_DATA_EXPLICIT) { - for (int i = 0; i < get_expliclit_count(); ++i) { - uint64_t hash_value = get_expliclit_value(i); + for (int i = 0; i < get_explicit_count(); ++i) { + uint64_t hash_value = get_explicit_value(i); hash_set->insert(hash_value); } } @@ -161,7 +161,7 @@ void HllSetHelper::set_sparse( *(int*)(result + 1) = registers_count; } -void HllSetHelper::set_expliclit(char* result, const std::set& hash_value_set, int& len) { +void HllSetHelper::set_explicit(char* result, const std::set& hash_value_set, int& len) { result[0] = HLL_DATA_EXPLICIT; result[1] = (HllSetResolver::ExpliclitLengthValueType)(hash_value_set.size()); len = sizeof(HllSetResolver::SetTypeValueType) diff --git a/be/src/olap/hll.h b/be/src/olap/hll.h index 133a9eef7e..306789436c 100644 --- a/be/src/olap/hll.h +++ b/be/src/olap/hll.h @@ -47,8 +47,8 @@ public: _buf_len(0), _set_type(HLL_DATA_EMPTY), _full_value_position(nullptr), - _expliclit_value(nullptr), - _expliclit_num(0) {} + _explicit_value(nullptr), + _explicit_num(0) {} ~HllSetResolver() {} @@ -69,22 +69,22 @@ public: return _set_type; }; - // expliclit value num - int get_expliclit_count() { - return (int)_expliclit_num; + // explicit value num + int get_explicit_count() { + return (int)_explicit_num; }; - // get expliclit index value 64bit - uint64_t get_expliclit_value(int index) { - if (index >= _expliclit_num) { + // get explicit index value 64bit + uint64_t get_explicit_value(int index) { + if (index >= _explicit_num) { return -1; } - return _expliclit_value[index]; + return _explicit_value[index]; }; - // get expliclit index value 64bit - char* 
get_expliclit_value() { - return (char*)_expliclit_value; + // get explicit index value 64bit + char* get_explicit_value() { + return (char*)_explicit_value; }; // get full register value @@ -119,8 +119,8 @@ private : int _buf_len; // set len HllDataType _set_type; //set type char* _full_value_position; - uint64_t* _expliclit_value; - ExpliclitLengthValueType _expliclit_num; + uint64_t* _explicit_value; + ExpliclitLengthValueType _explicit_num; std::map _sparse_map; SparseLengthValueType* _sparse_count; }; @@ -128,16 +128,16 @@ private : // the HLL set is implemented on top of varchar's variable-length encoding // handles the intermediate results of HLL column computation // empty: an empty set -// expliclit: a set of 64-bit hash values +// explicit: a set of 64-bit hash values // sparse: only the non-zero HLL registers // full: all HLL registers -// empty -> expliclit -> sparse -> full; transitions between the four types are irreversible -// the first byte stores the type of the HLL set, 0:empty 1:expliclit 2:sparse 3:full +// empty -> explicit -> sparse -> full; transitions between the four types are irreversible +// the first byte stores the type of the HLL set, 0:empty 1:explicit 2:sparse 3:full // which determines how the rest of the data is parsed class HllSetHelper { public: static void set_sparse(char *result, const std::map& index_to_value, int& len); - static void set_expliclit(char* result, const std::set& hash_value_set, int& len); + static void set_explicit(char* result, const std::set& hash_value_set, int& len); static void set_full(char* result, const char* registers, const int set_len, int& len); static void set_full(char* result, const std::map& index_to_value, const int set_len, int& len); diff --git a/be/src/olap/i_data.cpp b/be/src/olap/i_data.cpp index be11be0943..7ea142f513 100644 --- a/be/src/olap/i_data.cpp +++ b/be/src/olap/i_data.cpp @@ -17,10 +17,11 @@ #include "olap/column_file/column_data.h" #include "olap/olap_data.h" +#include "olap/rowset.h" namespace palo { -IData* IData::create(OLAPIndex* index) { +IData* IData::create(Rowset* index) { IData* data = NULL; DataFileType file_type = index->table()->data_file_type(); @@ -42,7 +43,7 @@ IData* IData::create(OLAPIndex* index) { } bool IData::delta_pruning_filter() { - if (empty()) { + if (empty() || zero_num_rows()) { return true; } @@ -54,8 +55,7 @@ bool IData::delta_pruning_filter() { } int IData::delete_pruning_filter() { - - if (empty()) { + if (empty() || zero_num_rows()) { // should return DEL_NOT_SATISFIED, because when creating a rollup table, // the delete version file should be preserved for filtering data.
return DEL_NOT_SATISFIED; diff --git a/be/src/olap/i_data.h b/be/src/olap/i_data.h index 58d31e4627..9a0db916f1 100644 --- a/be/src/olap/i_data.h +++ b/be/src/olap/i_data.h @@ -23,7 +23,7 @@ #include "olap/delete_handler.h" #include "olap/olap_common.h" #include "olap/olap_cond.h" -#include "olap/olap_index.h" +#include "olap/rowset.h" #include "util/runtime_profile.h" #include "olap/column_predicate.h" @@ -31,7 +31,7 @@ namespace palo { class OLAPTable; -class OLAPIndex; +class Rowset; class RowBlock; class RowCursor; class Conditions; @@ -42,7 +42,7 @@ class RuntimeState; class IData { public: // 工厂方法, 生æˆIData对象, 调用者获得新建的对象, å¹¶è´Ÿè´£delete释放 - static IData* create(OLAPIndex* olap_index); + static IData* create(Rowset* olap_index); virtual ~IData() {} // 为了与之å‰å…¼å®¹, 暴露部分indexçš„æŽ¥å£ @@ -58,9 +58,6 @@ public: uint32_t num_segments() const { return _olap_index->num_segments(); } - time_t max_timestamp() const { - return _olap_index->max_timestamp(); - } // æŸ¥è¯¢æ•°æ®æ–‡ä»¶ç±»åž‹ DataFileType data_file_type() { @@ -141,6 +138,10 @@ public: return _olap_index->empty(); } + bool zero_num_rows() const { + return _olap_index->zero_num_rows(); + } + bool delta_pruning_filter(); int delete_pruning_filter(); @@ -149,11 +150,11 @@ public: return 0; } - OLAPIndex* olap_index() const { + Rowset* olap_index() const { return _olap_index; } - void set_olap_index(OLAPIndex* olap_index) { + void set_olap_index(Rowset* olap_index) { _olap_index = olap_index; } @@ -166,8 +167,8 @@ public: virtual OLAPStatus unpickle() = 0; protected: - // 基类必须指定data_file_type, 也必须关è”一个OLAPIndex - IData(DataFileType data_file_type, OLAPIndex* olap_index): + // 基类必须指定data_file_type, 也必须关è”一个Rowset + IData(DataFileType data_file_type, Rowset* olap_index): _data_file_type(data_file_type), _olap_index(olap_index), _eof(false), @@ -179,7 +180,7 @@ protected: protected: DataFileType _data_file_type; - OLAPIndex* _olap_index; + Rowset* _olap_index; // 当到达文件末尾或者到达end key时设置此标志 bool _eof; const Conditions* _conditions; diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index 8bfc941efe..a56307c6f0 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -221,7 +221,7 @@ void LRUCache::_lru_append(LRUHandle* list, LRUHandle* e) { } Cache::Handle* LRUCache::lookup(const CacheKey& key, uint32_t hash) { - AutoMutexLock l(&_mutex); + MutexLock l(&_mutex); ++_lookup_count; LRUHandle* e = _table.lookup(key, hash); @@ -234,14 +234,14 @@ Cache::Handle* LRUCache::lookup(const CacheKey& key, uint32_t hash) { } void LRUCache::release(Cache::Handle* handle) { - AutoMutexLock l(&_mutex); + MutexLock l(&_mutex); _unref(reinterpret_cast(handle)); } Cache::Handle* LRUCache::insert( const CacheKey& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const CacheKey& key, void* value)) { - AutoMutexLock l(&_mutex); + MutexLock l(&_mutex); LRUHandle* e = reinterpret_cast( malloc(sizeof(LRUHandle)-1 + key.size())); @@ -288,12 +288,12 @@ bool LRUCache::_finish_erase(LRUHandle* e) { } void LRUCache::erase(const CacheKey& key, uint32_t hash) { - AutoMutexLock l(&_mutex); + MutexLock l(&_mutex); _finish_erase(_table.remove(key, hash)); } int LRUCache::prune() { - AutoMutexLock l(&_mutex); + MutexLock l(&_mutex); int num_prune = 0; while (_lru.next != &_lru) { LRUHandle* e = _lru.next; @@ -353,7 +353,7 @@ void* ShardedLRUCache::value(Handle* handle) { } uint64_t ShardedLRUCache::new_id() { - AutoMutexLock l(&_id_mutex); + MutexLock l(&_id_mutex); return ++(_last_id); } diff --git a/be/src/olap/lru_cache.h 
b/be/src/olap/lru_cache.h index fc1ae47eb9..9bdad564eb 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -330,7 +330,7 @@ namespace palo { size_t _capacity; // _mutex protects the following state. - MutexLock _mutex; + Mutex _mutex; size_t _usage; uint64_t _last_id; @@ -376,7 +376,7 @@ namespace palo { static uint32_t _shard(uint32_t hash); LRUCache _shards[kNumShards]; - MutexLock _id_mutex; + Mutex _id_mutex; uint64_t _last_id; }; diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index 74e7fca338..67ef5feecf 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -20,7 +20,7 @@ #include "olap/i_data.h" #include "olap/olap_define.h" -#include "olap/olap_index.h" +#include "olap/rowset.h" #include "olap/olap_table.h" #include "olap/reader.h" #include "olap/row_cursor.h" @@ -34,126 +34,14 @@ using std::vector; namespace palo { -Merger::Merger(SmartOLAPTable table, OLAPIndex* index, ReaderType type) : +Merger::Merger(OLAPTablePtr table, Rowset* index, ReaderType type) : _table(table), _index(index), _reader_type(type), - _row_count(0), - _uniq_keys(table->num_key_fields(), 1), - _selectivities(table->num_key_fields(), 1) {} + _row_count(0) {} -OLAPStatus Merger::merge( - const vector& olap_data_arr, - bool use_simple_merge, - uint64_t* merged_rows, - uint64_t* filted_rows) { - if (use_simple_merge && _check_simple_merge(olap_data_arr)) { - *merged_rows = 0; - *filted_rows = 0; - return _create_hard_link(); - } else { - return _merge(olap_data_arr, merged_rows, filted_rows); - } -} - -bool Merger::_check_simple_merge(const vector& olap_data_arr) { - bool res = false; - vector versions; - int32_t no_empty_file_num = 0; - - // set init value. When all the version is empty, the first version is the merge base version. - _simple_merge_version = olap_data_arr[0]->version(); - - _table->obtain_header_rdlock(); - - for (vector::const_iterator it = olap_data_arr.begin(); - it != olap_data_arr.end(); ++it) { - if (!(*it)->empty()) { - _simple_merge_version = (*it)->version(); - no_empty_file_num++; - - if (no_empty_file_num > 1) { - goto EXIT; - } - } else if (_table->is_delete_data_version((*it)->version())) { - goto EXIT; - } - } - - if (1 <= no_empty_file_num) { - res = true; - } - -EXIT: - _table->release_header_lock(); - return res; -} - -OLAPStatus Merger::_create_hard_link() { - OLAPStatus res = OLAP_SUCCESS; - _table->obtain_header_rdlock(); - VersionEntity version_entity = _table->get_version_entity_by_version(_simple_merge_version); - _table->release_header_lock(); - list new_files; - - for (uint32_t i = 0; i < version_entity.num_segments; i++) { - string new_index_path = _table->construct_index_file_path( - _index->version(), _index->version_hash(), i); - - string old_index_path = _table->construct_index_file_path( - version_entity.version, version_entity.version_hash, i); - - if (0 != link(old_index_path.c_str(), new_index_path.c_str())) { - OLAP_LOG_WARNING("fail to create hard link. [old_path=%s] to [new_path=%s] [%m]", - old_index_path.c_str(), - new_index_path.c_str()); - res = OLAP_ERR_OS_ERROR; - goto EXIT; - } - - new_files.push_back(new_index_path); - - string new_data_path = _table->construct_data_file_path( - _index->version(), _index->version_hash(), i); - - string old_data_path = _table->construct_data_file_path( - version_entity.version, version_entity.version_hash, i); - - if (0 != link(old_data_path.c_str(), new_data_path.c_str())) { - OLAP_LOG_WARNING("fail to create hard link. 
from [path=%s] to [path=%s] [%m]", - old_data_path.c_str(), - new_data_path.c_str()); - res = OLAP_ERR_OS_ERROR; - goto EXIT; - } - - new_files.push_back(new_data_path); - } - - _index->set_num_segments(version_entity.num_segments); - - if (version_entity.column_statistics.size() != 0) { - _index->set_column_statistics(version_entity.column_statistics); - } - -EXIT: - - if (res != OLAP_SUCCESS && new_files.size() != 0) { - for (list::iterator it = new_files.begin(); - it != new_files.end(); ++it) { - if (0 != remove(it->c_str())) { - OLAP_LOG_WARNING("fail to remove linked file.[file='%s']", it->c_str()); - } - } - } - - return res; -} - -OLAPStatus Merger::_merge( - const vector& olap_data_arr, - uint64_t* merged_rows, - uint64_t* filted_rows) { +OLAPStatus Merger::merge(const vector& olap_data_arr, + uint64_t* merged_rows, uint64_t* filted_rows) { // Create and initiate reader for scanning and multi-merging specified // OLAPDatas. Reader reader; @@ -180,15 +68,7 @@ OLAPStatus Merger::_merge( return OLAP_ERR_MALLOC_ERROR; } - if (OLAP_SUCCESS != writer->init()) { - OLAP_LOG_WARNING("fail to initiate writer. [table='%s']", - _table->full_name().c_str()); - return OLAP_ERR_INIT_FAILED; - } - bool has_error = false; - // We calculate selectivities only when base compactioning. - bool need_calculate_selectivities = (_index->version().first == 0); RowCursor row_cursor; if (OLAP_SUCCESS != row_cursor.init(_table->tablet_schema())) { @@ -196,16 +76,7 @@ OLAPStatus Merger::_merge( has_error = true; } - RowCursor last_row; - - if (OLAP_SUCCESS != last_row.init(_table->tablet_schema())) { - OLAP_LOG_WARNING("fail to init row cursor."); - has_error = true; - } - bool eof = false; - MemPool* mem_pool = writer->mem_pool(); - // The following procedure would last for long time, half of one day, etc. while (!has_error) { // Attach row cursor to the memory position of the row block being @@ -216,7 +87,7 @@ OLAPStatus Merger::_merge( has_error = true; break; } - row_cursor.allocate_memory_for_string_type(_table->tablet_schema(), mem_pool); + row_cursor.allocate_memory_for_string_type(_table->tablet_schema(), writer->mem_pool()); // Read one row into row_cursor OLAPStatus res = reader.next_row_with_aggregation(&row_cursor, &eof); @@ -231,32 +102,6 @@ OLAPStatus Merger::_merge( // Goto next row position in the row block being written writer->next(row_cursor); - - if (need_calculate_selectivities) { - // Calculate statistics while base compaction - if (0 != _row_count) { - size_t first_diff_id = 0; - - if (OLAP_SUCCESS != last_row.get_first_different_column_id( - row_cursor, &first_diff_id)) { - OLAP_LOG_WARNING("fail to get_first_different_column_id."); - has_error = true; - break; - } - - for (size_t i = first_diff_id; i < _uniq_keys.size(); ++i) { - ++_uniq_keys[i]; - } - } - - // set last row for next comapration. - if (OLAP_SUCCESS != last_row.copy(row_cursor, mem_pool)) { - OLAP_LOG_WARNING("fail to copy last row."); - has_error = true; - break; - } - } - ++_row_count; } @@ -266,13 +111,6 @@ OLAPStatus Merger::_merge( has_error = true; } - if (need_calculate_selectivities) { - for (size_t i = 0; i < _uniq_keys.size(); ++i) { - _selectivities[i] - = static_cast(_row_count / _uniq_keys[i]); - } - } - if (!has_error) { *merged_rows = reader.merged_rows(); *filted_rows = reader.filted_rows(); @@ -281,5 +119,4 @@ OLAPStatus Merger::_merge( return has_error ? 
OLAP_ERR_OTHER_ERROR : OLAP_SUCCESS; } - } // namespace palo diff --git a/be/src/olap/merger.h b/be/src/olap/merger.h index 786ae6ef36..e9fbf1a8f6 100644 --- a/be/src/olap/merger.h +++ b/be/src/olap/merger.h @@ -21,51 +21,31 @@ namespace palo { -class OLAPIndex; +class Rowset; class IData; class Merger { public: // parameter index is created by caller, and it is empty. - Merger(SmartOLAPTable table, OLAPIndex* index, ReaderType type); + Merger(OLAPTablePtr table, Rowset* index, ReaderType type); virtual ~Merger() {}; - // @brief read from multiple OLAPData and OLAPIndex, then write into single OLAPData and - // OLAPIndex. When use_simple_merge is true, check whether to create hard link. + // @brief read from multiple OLAPData and Rowset, then write into single OLAPData and Rowset // @return OLAPStatus: OLAP_SUCCESS or FAIL // @note it will take long time to finish. - OLAPStatus merge( - const std::vector& olap_data_arr, - bool use_simple_merge, - uint64_t* merged_rows, - uint64_t* filted_rows); + OLAPStatus merge(const std::vector& olap_data_arr, + uint64_t* merged_rows, uint64_t* filted_rows); // get the number of rows accumulated during the merge uint64_t row_count() { return _row_count; } - // get the selectivity of each key prefix combination - const std::vector& selectivities() { - return _selectivities; - } - private: - OLAPStatus _merge( - const std::vector& olap_data_arr, - uint64_t* merged_rows, - uint64_t* filted_rows); - - bool _check_simple_merge(const std::vector& olap_data_arr); - - OLAPStatus _create_hard_link(); - - SmartOLAPTable _table; - OLAPIndex* _index; + OLAPTablePtr _table; + Rowset* _index; ReaderType _reader_type; uint64_t _row_count; - std::vector _uniq_keys; // number of distinct values for each key prefix combination - std::vector _selectivities; // selectivity of each key prefix combination Version _simple_merge_version; DISALLOW_COPY_AND_ASSIGN(Merger); diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index ccf2c006fe..75ee6f4fd3 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -37,18 +37,18 @@ typedef int64_t VersionHash; typedef __int128 int128_t; typedef unsigned __int128 uint128_t; -struct TableInfo { - TableInfo( +struct TabletInfo { + TabletInfo( TTabletId in_tablet_id, TSchemaHash in_schema_hash) : tablet_id(in_tablet_id), schema_hash(in_schema_hash) {} - bool operator<(const TableInfo& other) const { - if (tablet_id < other.tablet_id) { - return true; + bool operator<(const TabletInfo& right) const { + if (tablet_id != right.tablet_id) { + return tablet_id < right.tablet_id; } else { - return false; + return schema_hash < right.schema_hash; } } @@ -142,7 +142,7 @@ enum AlterTabletType { enum AlterTableStatus { ALTER_TABLE_WAITING = 0, ALTER_TABLE_RUNNING = 1, - ALTER_TABLE_DONE = 2, + ALTER_TABLE_FINISHED = 2, ALTER_TABLE_FAILED = 3, }; @@ -153,7 +153,7 @@ enum PushType { }; enum ReaderType { - READER_FETCH = 0, + READER_QUERY = 0, READER_ALTER_TABLE = 1, READER_BASE_COMPACTION = 2, READER_CUMULATIVE_COMPACTION = 3, @@ -161,7 +161,8 @@ enum ReaderType { }; // , such as <100, 110> -typedef std::pair Version; +//using Version = std::pair; +typedef std::pair Version; typedef std::vector Versions; // It is used to represent Graph vertex.
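Editor's note: the rewritten TabletInfo::operator< above now gives a strict weak ordering over (tablet_id, schema_hash); the old TableInfo comparator only looked at tablet_id, so two tablets that shared a tablet_id but had different schema hashes compared as equivalent keys. A minimal standalone sketch of the same ordering written with std::tie (the type and member names here are illustrative, not taken from the patch):

#include <cstdint>
#include <tuple>

// Illustrative stand-in for the patched TabletInfo key.
struct TabletKeySketch {
    int64_t tablet_id;
    int32_t schema_hash;

    // Order by tablet_id first, then schema_hash, matching the comparator above.
    bool operator<(const TabletKeySketch& right) const {
        return std::tie(tablet_id, schema_hash)
                < std::tie(right.tablet_id, right.schema_hash);
    }
};

With this ordering, ordered containers keyed on the struct (such as the _transaction_tablet_map added later in this patch) can distinguish different schema hashes of the same tablet.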
@@ -172,54 +173,38 @@ struct Vertex { class Field; class WrapperField; -// 包å«Version,对应的version_hashå’Œnum_segments,一般指代OLAP中存在的实体Version -struct VersionEntity { - VersionEntity(Version v, - VersionHash hash, - uint32_t num_seg, - int32_t ref_count, - int64_t num_rows, - size_t data_size, - size_t index_size, - bool empty) : - version(v), - version_hash(hash), - num_segments(num_seg), - ref_count(ref_count), - num_rows(num_rows), - data_size(data_size), - index_size(index_size), - empty(empty), - column_statistics(0) {} +using KeyRange = std::pair; +struct RowSetEntity { + RowSetEntity(int32_t rowset_id, int32_t num_segments, + int64_t num_rows, size_t data_size, size_t index_size, + bool empty, const std::vector* column_statistics) + : rowset_id(rowset_id), num_segments(num_segments), num_rows(num_rows), + data_size(data_size), index_size(index_size), empty(empty) + { + if (column_statistics != nullptr) { + key_ranges = *column_statistics; + } + } - VersionEntity(Version v, - VersionHash hash, - uint32_t num_seg, - int32_t ref_count, - int64_t num_rows, - size_t data_size, - size_t index_size, - bool empty, - const std::vector>& column_statistics) : - version(v), - version_hash(hash), - num_segments(num_seg), - ref_count(ref_count), - num_rows(num_rows), - data_size(data_size), - index_size(index_size), - empty(empty), - column_statistics(column_statistics) {} - - Version version; - VersionHash version_hash; - uint32_t num_segments; - int32_t ref_count; + int32_t rowset_id; + int32_t num_segments; int64_t num_rows; size_t data_size; size_t index_size; bool empty; - std::vector> column_statistics; + std::vector key_ranges; +}; + +struct VersionEntity { + VersionEntity(Version v, VersionHash version_hash) + : version(v), version_hash(version_hash) { } + void add_rowset_entity(const RowSetEntity& rowset) { + rowset_vec.push_back(rowset); + } + + Version version; + VersionHash version_hash; + std::vector rowset_vec; }; // ReaderStatistics used to collect statistics when scan data from storage diff --git a/be/src/olap/olap_cond.cpp b/be/src/olap/olap_cond.cpp index 1244b2fd2e..1fbf8b089f 100644 --- a/be/src/olap/olap_cond.cpp +++ b/be/src/olap/olap_cond.cpp @@ -175,7 +175,7 @@ OLAPStatus Cond::init(const TCondition& tcond, const FieldInfo& fi) { } auto insert_reslut = operand_set.insert(f.get()); if (!insert_reslut.second) { - OLAP_LOG_WARNING("Duplicate operand in in-predicate.[condition=%s]", operand.c_str()); + LOG(WARNING) << "Duplicate operand in in-predicate.[condition=" << operand << "]"; // Duplicated, let unique_ptr delete field } else { // Normal case, release this unique_ptr diff --git a/be/src/olap/olap_cond.h b/be/src/olap/olap_cond.h index 8ecc6fb05c..37f451d112 100644 --- a/be/src/olap/olap_cond.h +++ b/be/src/olap/olap_cond.h @@ -87,7 +87,7 @@ public: // 所有归属于åŒä¸€åˆ—上的æ¡ä»¶äºŒå…ƒç»„,èšåˆåœ¨ä¸€ä¸ªCondColumn上 class CondColumn { public: - CondColumn(SmartOLAPTable table, int32_t index) : _col_index(index), _table(table) { + CondColumn(OLAPTablePtr table, int32_t index) : _col_index(index), _table(table) { _conds.clear(); _is_key = _table->tablet_schema()[_col_index].is_key; } @@ -118,7 +118,7 @@ private: bool _is_key; int32_t _col_index; std::vector _conds; - SmartOLAPTable _table; + OLAPTablePtr _table; }; // 一次请求所关è”çš„æ¡ä»¶ @@ -137,7 +137,7 @@ public: _columns.clear(); } - void set_table(SmartOLAPTable table) { + void set_table(OLAPTablePtr table) { long do_not_remove_me_until_you_want_a_heart_attacking = table.use_count(); 
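// Editor's note, not part of the patch: SmartOLAPTable is renamed to OLAPTablePtr
// throughout this change (a shared handle to OLAPTable, as the use_count() call above
// implies); the oddly named local only reads the shared reference count, and the
// OLAP_UNUSED_ARG below presumably keeps it from triggering an unused-variable warning.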
OLAP_UNUSED_ARG(do_not_remove_me_until_you_want_a_heart_attacking); @@ -162,7 +162,7 @@ public: } private: - SmartOLAPTable _table; // ref to OLAPTable to access schema + OLAPTablePtr _table; // ref to OLAPTable to access schema CondColumns _columns; // list of condition column }; diff --git a/be/src/olap/olap_data.cpp b/be/src/olap/olap_data.cpp index 56d23a7a02..add0a45fae 100644 --- a/be/src/olap/olap_data.cpp +++ b/be/src/olap/olap_data.cpp @@ -28,7 +28,7 @@ // #include #include "olap/olap_engine.h" -#include "olap/olap_index.h" +#include "olap/rowset.h" #include "olap/olap_table.h" #include "olap/row_block.h" #include "olap/row_cursor.h" @@ -46,7 +46,7 @@ using std::vector; namespace palo { -OLAPData::OLAPData(OLAPIndex* index) : +OLAPData::OLAPData(Rowset* index) : IData(OLAP_DATA_FILE, index), _olap_table(NULL), _is_pickled(true), @@ -96,7 +96,7 @@ OLAPStatus OLAPData::get_first_row_block(RowBlock** row_block, set_eof(true); return res; } else if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("Fail to find first row block with OLAPIndex."); + OLAP_LOG_WARNING("Fail to find first row block with Rowset."); return res; } @@ -135,7 +135,7 @@ OLAPStatus OLAPData::get_next_row_block(RowBlock** row_block, RowBlockPosition row_block_pos = _row_block_broker->position(); res = olap_index()->find_next_row_block(&row_block_pos, eof_ptr()); if (eof()) { - OLAP_LOG_DEBUG("Got EOF from OLAPIndex. [segment=%d, data_offset=%d]", + OLAP_LOG_DEBUG("Got EOF from Rowset. [segment=%d, data_offset=%d]", row_block_pos.segment, row_block_pos.data_offset); // 当到达eof的时候ä¸éœ€è¦æŠŠç»“æžœå¸¦å‡ºæ¥ @@ -241,11 +241,11 @@ const RowCursor* OLAPData::get_first_row() { set_eof(true); return NULL; } else if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("Fail to find first row block with OLAPIndex."); + OLAP_LOG_WARNING("Fail to find first row block with Rowset."); return NULL; } - OLAP_LOG_DEBUG("RowBlockPosition='%s'", row_block_pos.to_string().c_str()); + VLOG(3) << "RowBlockPosition='" << row_block_pos.to_string() << "'"; if ((res = _row_block_broker->change_to(row_block_pos)) != OLAP_SUCCESS) { OLAP_LOG_WARNING("Fail to get row block. " @@ -429,7 +429,7 @@ const RowCursor* OLAPData::find_row(const RowCursor& key, bool find_last_key, bo return row_cursor; } else if (eof || data_eof) { // 此处找ä¸åˆ°ï¼Œæ˜¯ç”±äºŽè®¾ç½®äº†end_key,超找超过了end_key对应行 - OLAP_LOG_TRACE("key can't be found, Search over end_key![key=%s]", key.to_string().c_str()); + VLOG(3) << "key can't be found, Search over end_key![key=" << key.to_string() << "]"; set_eof(true); return NULL; } else { @@ -551,13 +551,11 @@ OLAPStatus OLAPData::add_segment() { data_header->set_segment(_write_descriptor->segment); // file for new segment - file_name = _olap_table->construct_data_file_path(olap_index()->version(), - olap_index()->version_hash(), - _write_descriptor->segment); + file_name = olap_index()->construct_data_file_path(olap_index()->rowset_id(), _write_descriptor->segment); res = _write_descriptor->file_handle.open_with_mode( file_name, O_CREAT | O_EXCL | O_WRONLY , S_IRUSR | S_IWUSR); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("Fail to open file. [file_name=%s]", file_name.c_str()); + LOG(WARNING) << "Fail to open file. 
[file_name=" << file_name << "]"; goto ADD_SEGMENT_ERR; } @@ -796,7 +794,7 @@ void OLAPData::_check_io_error(OLAPStatus res) { } OLAPData::RowBlockBroker::RowBlockBroker( - OLAPTable* olap_table, OLAPIndex* olap_index, RuntimeState* runtime_state) : + OLAPTable* olap_table, Rowset* olap_index, RuntimeState* runtime_state) : _file_handler(), _row_block_pos(), _read_buffer(NULL), @@ -888,7 +886,7 @@ const RowCursor* OLAPData::RowBlockBroker::find_row(const RowCursor& key, bool find_last_key, bool* end_of_row_block) { if (_row_block->find_row(key, find_last_key, &_row_index) != OLAP_SUCCESS) { - OLAP_LOG_TRACE("fail to find row from row block. [key='%s']", key.to_string().c_str()); + VLOG(3) << "fail to find row from row block. [key='" << key.to_string() << "']"; return NULL; } @@ -1009,12 +1007,10 @@ OLAPStatus OLAPData::RowBlockBroker::_get_row_block(const RowBlockPosition& row_ } } - file_name = _olap_table->construct_data_file_path(_olap_index->version(), - _olap_index->version_hash(), - row_block_pos.segment); + file_name = _olap_index->construct_data_file_path(_olap_index->rowset_id(), row_block_pos.segment); if ((res = _file_handler.open_with_cache(file_name, O_RDONLY)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to open file. [file_name=%s]", file_name.c_str()); + LOG(WARNING) << "fail to open file. [file_name=" << file_name << "]"; goto GET_ROW_BLOCK_ERROR; } diff --git a/be/src/olap/olap_data.h b/be/src/olap/olap_data.h index 31a2d12c52..4e48790e21 100644 --- a/be/src/olap/olap_data.h +++ b/be/src/olap/olap_data.h @@ -29,6 +29,9 @@ namespace palo { +// Row Storage Table is deprecated. +// This file will be removed in succedent release. + class OLAPTable; class RowBlock; class RowCursor; @@ -48,7 +51,7 @@ class RowCursor; // the original RAW_COLUMN_MIXED. 
class OLAPData: public IData { public: - explicit OLAPData(OLAPIndex* index); + explicit OLAPData(Rowset* index); virtual ~OLAPData(); // åˆå§‹åŒ–, å’Œunpickle统一到åŒä¸€æµç¨‹ä¸Š @@ -143,15 +146,11 @@ public: return olap_index()->num_segments(); } - time_t max_timestamp() const { - return olap_index()->max_timestamp(); - } - private: // RowBlock代ç†,内部类,å¯ä»¥ä»£ç†æŒ‡å®špositionçš„RowBlock,为RowBlockæä¾›ç±»ä¼¼IteratoræœåŠ¡ã€‚ class RowBlockBroker { public: - RowBlockBroker(OLAPTable* olap_table, OLAPIndex* olap_index, RuntimeState* runtime_state); + RowBlockBroker(OLAPTable* olap_table, Rowset* olap_index, RuntimeState* runtime_state); ~RowBlockBroker(); OLAPStatus init(); @@ -244,7 +243,7 @@ private: bool _is_set_end_row; OLAPTable* _olap_table; - OLAPIndex* _olap_index; + Rowset* _olap_index; uint64_t _data_read_buf_size; bool _is_end_block; @@ -310,7 +309,7 @@ class OLAPDataComparator { public: OLAPDataComparator(RowBlockPosition position, OLAPData* olap_data, - const OLAPIndex* index, + const Rowset* index, RowCursor* helper_cursor) : _start_block_position(position), _olap_data(olap_data), @@ -361,7 +360,7 @@ private: const RowBlockPosition _start_block_position; OLAPData* _olap_data; - const OLAPIndex* _index; + const Rowset* _index; RowCursor* _helper_cursor; }; diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h index 3d80eb641b..56c4cc3a62 100644 --- a/be/src/olap/olap_define.h +++ b/be/src/olap/olap_define.h @@ -32,8 +32,8 @@ static const uint32_t OLAP_MAX_PATH_LEN = 512; static const uint32_t OLAP_DEFAULT_MAX_PACKED_ROW_BLOCK_SIZE = 1024 * 1024 * 20; // æ¯ä¸ªrow block压缩å‰çš„æœ€å¤§é•¿åº¦ï¼Œä¹Ÿå°±æ˜¯buf的最大长度 static const uint32_t OLAP_DEFAULT_MAX_UNPACKED_ROW_BLOCK_SIZE = 1024 * 1024 * 100; -// å—大å°ä½¿ç”¨uint32_tä¿å­˜, 为防止一些æ„外出现, 这里定义为4G-2MB, -static const uint32_t OLAP_MAX_SEGMENT_FILE_SIZE = 4292870144; +// å—大å°ä½¿ç”¨uint32_tä¿å­˜, 这里定义为2G, +static const uint32_t OLAP_MAX_SEGMENT_FILE_SIZE = 2147483648; // 列存储文件的å—大å°,由于å¯èƒ½ä¼šè¢«å…¨éƒ¨è½½å…¥å†…å­˜,所以需è¦ä¸¥æ ¼æŽ§åˆ¶å¤§å°, 这里定义为256MB static const uint32_t OLAP_MAX_COLUMN_SEGMENT_FILE_SIZE = 268435456; // 在列存储文件中, æ•°æ®åˆ†å—压缩, æ¯ä¸ªå—的默认压缩å‰çš„å¤§å° @@ -71,6 +71,9 @@ static const std::string SNAPSHOT_PREFIX = "/snapshot"; static const std::string TRASH_PREFIX = "/trash"; static const std::string UNUSED_PREFIX = "/unused"; static const std::string ERROR_LOG_PREFIX = "/error_log"; +static const std::string PENDING_DELTA_PREFIX = "/pending_delta"; +static const std::string INCREMENTAL_DELTA_PREFIX = "/incremental_delta"; +static const std::string CLONE_PREFIX = "/clone"; static const int32_t OLAP_DATA_VERSION_APPLIED = PALO_V1; @@ -125,6 +128,7 @@ enum OLAPStatus { OLAP_ERR_FILE_FORMAT_ERROR = -119, OLAP_ERR_EVAL_CONJUNCTS_ERROR = -120, OLAP_ERR_COPY_FILE_ERROR = -121, + OLAP_ERR_FILE_ALREADY_EXIST = -122, // common errors codes // [-200, -300) @@ -153,6 +157,8 @@ enum OLAPStatus { OLAP_ERR_NO_AVAILABLE_ROOT_PATH = -223, OLAP_ERR_CHECK_LINES_ERROR = -224, OLAP_ERR_INVALID_CLUSTER_INFO = -225, + OLAP_ERR_TRANSACTION_NOT_EXIST = -226, + OLAP_ERR_DISK_FAILURE = -227, // CommandExecutor // [-300, -400) @@ -170,6 +176,7 @@ enum OLAPStatus { OLAP_ERR_TABLE_VERSION_INDEX_MISMATCH_ERROR = -401, OLAP_ERR_TABLE_INDEX_VALIDATE_ERROR = -402, OLAP_ERR_TABLE_INDEX_FIND_ERROR = -403, + OLAP_ERR_TABLE_CREATE_FROM_HEADER_ERROR = -404, // OLAPEngine // [-500, -600) @@ -226,8 +233,9 @@ enum OLAPStatus { OLAP_ERR_PUSH_VERSION_ALREADY_EXIST = -908, OLAP_ERR_PUSH_TABLE_NOT_EXIST = -909, OLAP_ERR_PUSH_INPUT_DATA_ERROR = -910, + 
OLAP_ERR_PUSH_TRANSACTION_ALREADY_EXIST = -911, - // OLAPIndex + // Rowset // [-1000, -1100) OLAP_ERR_INDEX_LOAD_ERROR = -1000, OLAP_ERR_INDEX_EOF = -1001, @@ -257,6 +265,17 @@ enum OLAPStatus { // [-1400, -1500) OLAP_ERR_HEADER_ADD_VERSION = -1400, OLAP_ERR_HEADER_DELETE_VERSION = -1401, + OLAP_ERR_HEADER_ADD_PENDING_DELTA = -1402, + OLAP_ERR_HEADER_ADD_INCREMENTAL_VERSION = -1403, + OLAP_ERR_HEADER_INVALID_FLAG = -1404, + OLAP_ERR_HEADER_PUT = -1405, + OLAP_ERR_HEADER_DELETE = -1406, + OLAP_ERR_HEADER_GET = -1407, + OLAP_ERR_HEADER_LOAD_INVALID_KEY = -1408, + OLAP_ERR_HEADER_FLAG_PUT = -1409, + OLAP_ERR_HEADER_LOAD_JSON_HEADER = -1410, + OLAP_ERR_HEADER_INIT_FAILED = -1411, + OLAP_ERR_HEADER_PB_PARSE_FAILED = -1412, // OLAPTableSchema // [-1500, -1600) @@ -301,12 +320,36 @@ enum OLAPStatus { OLAP_ERR_CUMULATIVE_FAILED_ACQUIRE_DATA_SOURCE = -2003, OLAP_ERR_CUMULATIVE_INVALID_NEED_MERGED_VERSIONS = -2004, OLAP_ERR_CUMULATIVE_ERROR_DELETE_ACTION = -2005, + + // OLAPMeta + // [-3000, -3100) + OLAP_ERR_META_INVALID_ARGUMENT = -3000, + OLAP_ERR_META_OPEN_DB = -3001, + OLAP_ERR_META_KEY_NOT_FOUND = -3002, + OLAP_ERR_META_GET = -3003, + OLAP_ERR_META_PUT = -3004, + OLAP_ERR_META_ITERATOR = -3005, + OLAP_ERR_META_DELETE = -3006, +}; + +enum ColumnFamilyIndex { + DEFAULT_COLUMN_FAMILY_INDEX = 0, + DORIS_COLUMN_FAMILY_INDEX, + META_COLUMN_FAMILY_INDEX, }; static const char* const HINIS_KEY_SEPARATOR = ";"; static const char* const HINIS_KEY_PAIR_SEPARATOR = "|"; static const char* const HINIS_KEY_GROUP_SEPARATOR = "&"; +static const std::string DEFAULT_COLUMN_FAMILY = "default"; +static const std::string DORIS_COLUMN_FAMILY = "doris"; +static const std::string META_COLUMN_FAMILY = "meta"; +static const std::string IS_HEADER_CONVERTED = "is_header_converted"; +static const std::string CONVERTED_FLAG = "true"; +const std::string TABLET_ID_KEY = "tablet_id"; +const std::string TABLET_SCHEMA_HASH_KEY = "schema_hash"; + #define RETURN_NOT_OK(s) do { \ OLAPStatus _s = (s); \ if (_s != OLAP_SUCCESS) { \ diff --git a/be/src/olap/olap_engine.cpp b/be/src/olap/olap_engine.cpp index 2ec0625208..e145200f62 100644 --- a/be/src/olap/olap_engine.cpp +++ b/be/src/olap/olap_engine.cpp @@ -22,27 +22,36 @@ #include #include #include +#include #include +#include #include #include #include +#include +#include "agent/file_downloader.h" #include "olap/base_compaction.h" #include "olap/cumulative_compaction.h" #include "olap/lru_cache.h" #include "olap/olap_header.h" -#include "olap/olap_rootpath.h" -#include "olap/olap_snapshot.h" +#include "olap/olap_header_manager.h" #include "olap/push_handler.h" +#include "olap/reader.h" #include "olap/schema_change.h" +#include "olap/store.h" #include "olap/utils.h" #include "olap/writer.h" +#include "util/time.h" #include "util/palo_metrics.h" +#include "util/pretty_printer.h" +using apache::thrift::ThriftDebugString; using boost::filesystem::canonical; using boost::filesystem::directory_iterator; using boost::filesystem::path; +using boost::filesystem::recursive_directory_iterator; using std::back_inserter; using std::copy; using std::inserter; @@ -57,49 +66,114 @@ using std::string; using std::stringstream; using std::vector; - namespace palo { -// OLAPTable*对象的shared_ptrçš„æžæž„函数 -void OLAPTableDestruction(OLAPTable* olap_table) { - SAFE_DELETE(olap_table); -} -bool _sort_table_by_create_time(const SmartOLAPTable& a, const SmartOLAPTable& b) { +OLAPEngine* OLAPEngine::_s_instance = nullptr; +const std::string HTTP_REQUEST_PREFIX = "/api/_tablet/_download?"; +const std::string 
HTTP_REQUEST_TOKEN_PARAM = "token="; +const std::string HTTP_REQUEST_FILE_PARAM = "&file="; + +const uint32_t DOWNLOAD_FILE_MAX_RETRY = 3; +const uint32_t LIST_REMOTE_FILE_TIMEOUT = 15; + +bool _sort_table_by_create_time(const OLAPTablePtr& a, const OLAPTablePtr& b) { return a->creation_time() < b->creation_time(); } -OLAPEngine::OLAPEngine() : +static Status _validate_options(const EngineOptions& options) { + if (options.store_paths.empty()) { + return Status("sotre paths is empty");; + } + return Status::OK; +} + +Status OLAPEngine::open(const EngineOptions& options, OLAPEngine** engine_ptr) { + RETURN_IF_ERROR(_validate_options(options)); + std::unique_ptr engine(new OLAPEngine(options)); + auto st = engine->open(); + if (st != OLAP_SUCCESS) { + LOG(WARNING) << "engine open failed, res=" << st; + return Status("open engine failed"); + } + st = engine->_start_bg_worker(); + if (st != OLAP_SUCCESS) { + LOG(WARNING) << "engine start background failed, res=" << st; + return Status("open engine failed"); + } + *engine_ptr = engine.release(); + return Status::OK; +} + +OLAPEngine::OLAPEngine(const EngineOptions& options) + : is_report_disk_state_already(false), + is_report_olap_table_already(false), + _options(options), + _available_storage_medium_type_count(0), + _effective_cluster_id(-1), + _is_all_cluster_id_exist(true), + _is_drop_tables(false), _global_table_id(0), _file_descriptor_lru_cache(NULL), - _index_stream_lru_cache(NULL) {} + _index_stream_lru_cache(NULL), + _tablet_stat_cache_update_time_ms(0), + _snapshot_base_id(0) { + if (_s_instance == nullptr) { + _s_instance = this; + } +} OLAPEngine::~OLAPEngine() { clear(); } -OLAPStatus OLAPEngine::_load_tables(const string& tablet_root_path) { - // é历跟目录寻找所有的shard +OLAPStatus OLAPEngine::_load_store(OlapStore* store) { + std::string store_path = store->path(); + LOG(INFO) <<"start to load tablets from store_path:" << store_path; + + bool is_header_converted = false; + OLAPStatus res = OlapHeaderManager::get_header_converted(store, is_header_converted); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "get convert flag from meta failed"; + return res; + } + if (is_header_converted) { + LOG(INFO) << "load header from meta"; + OLAPStatus s = store->load_tables(this); + LOG(INFO) << "load header from meta finished"; + if (s != OLAP_SUCCESS) { + LOG(WARNING) << "there is failure when loading table headers, path:" << store_path; + return s; + } else { + return OLAP_SUCCESS; + } + } + + // compatible for old header load method + // walk all directory to load header file + LOG(INFO) << "load headers from header files"; + + // get all shards set shards; - if (dir_walk(tablet_root_path + DATA_PREFIX, &shards, NULL) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to walk dir. [root=%s]", tablet_root_path.c_str()); + if (dir_walk(store_path + DATA_PREFIX, &shards, NULL) != OLAP_SUCCESS) { + LOG(WARNING) << "fail to walk dir. [root=" << store_path << "]"; return OLAP_ERR_INIT_FAILED; } for (const auto& shard : shards) { - // é历shard目录寻找此shard的所有tablet + // get all tablets set tablets; - string one_shard_path = tablet_root_path + DATA_PREFIX + '/' + shard; + string one_shard_path = store_path + DATA_PREFIX + '/' + shard; if (dir_walk(one_shard_path, &tablets, NULL) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to walk dir. [root=%s]", one_shard_path.c_str()); + LOG(WARNING) << "fail to walk dir. 
[root=" << one_shard_path << "]"; continue; } for (const auto& tablet : tablets) { - // é历table目录寻找此table的所有indexedRollupTable,注æ„䏿˜¯OLAPIndex,而是OLAPTable + // é历table目录寻找此table的所有indexedRollupTable,注æ„䏿˜¯Rowset,而是OLAPTable set schema_hashes; string one_tablet_path = one_shard_path + '/' + tablet; if (dir_walk(one_tablet_path, &schema_hashes, NULL) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to walk dir. [root=%s]", one_tablet_path.c_str()); + LOG(WARNING) << "fail to walk dir. [root=" << one_tablet_path << "]"; continue; } @@ -110,6 +184,7 @@ OLAPStatus OLAPEngine::_load_tables(const string& tablet_root_path) { // é历schema_hash目录寻找此index的所有schema // 加载失败ä¾ç„¶åŠ è½½ä¸‹ä¸€ä¸ªTable if (load_one_tablet( + store, tablet_id, tablet_schema_hash, one_tablet_path + '/' + schema_hash) != OLAP_SUCCESS) { @@ -119,37 +194,38 @@ OLAPStatus OLAPEngine::_load_tables(const string& tablet_root_path) { } } } - - return OLAP_SUCCESS; + res = OlapHeaderManager::set_converted_flag(store); + LOG(INFO) << "load header from header files finished"; + return res; } OLAPStatus OLAPEngine::load_one_tablet( - TTabletId tablet_id, SchemaHash schema_hash, const string& schema_hash_path, - bool force) { + OlapStore* store, TTabletId tablet_id, SchemaHash schema_hash, + const string& schema_hash_path, bool force) { stringstream header_name_stream; header_name_stream << schema_hash_path << "/" << tablet_id << ".hdr"; string header_path = header_name_stream.str(); path boost_schema_hash_path(schema_hash_path); if (access(header_path.c_str(), F_OK) != 0) { - OLAP_LOG_WARNING("fail to find header file. [header_path=%s]", header_path.c_str()); + LOG(WARNING) << "fail to find header file. [header_path=" << header_path << "]"; move_to_trash(boost_schema_hash_path, boost_schema_hash_path); return OLAP_ERR_FILE_NOT_EXIST; } - OLAPTable* olap_table = OLAPTable::create_from_header_file( - tablet_id, schema_hash, header_path); + auto olap_table = OLAPTable::create_from_header_file( + tablet_id, schema_hash, header_path, store); if (olap_table == NULL) { - OLAP_LOG_WARNING("fail to load table. [header_path=%s]", header_path.c_str()); + LOG(WARNING) << "fail to load table. [header_path=" << header_path << "]"; move_to_trash(boost_schema_hash_path, boost_schema_hash_path); return OLAP_ERR_ENGINE_LOAD_INDEX_TABLE_ERROR; } - if (olap_table->latest_version() == NULL && !olap_table->is_schema_changing()) { + + if (olap_table->lastest_version() == NULL && !olap_table->is_schema_changing()) { OLAP_LOG_WARNING("tablet not in schema change state without delta is invalid. " "[header_path=%s]", header_path.c_str()); move_to_trash(boost_schema_hash_path, boost_schema_hash_path); - SAFE_DELETE(olap_table); return OLAP_ERR_ENGINE_LOAD_INDEX_TABLE_ERROR; } @@ -163,11 +239,11 @@ OLAPStatus OLAPEngine::load_one_tablet( return OLAP_SUCCESS; } - OLAP_LOG_WARNING("failed to add table. [table=%s]", table_name.c_str()); + LOG(WARNING) << "failed to add table. [table=" << table_name << "]"; return OLAP_ERR_ENGINE_LOAD_INDEX_TABLE_ERROR; } - if (OLAPRootPath::get_instance()->register_table_into_root_path(olap_table) != OLAP_SUCCESS) { + if (register_table_into_root_path(olap_table.get()) != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to register table into root path. 
[root_path=%s]", schema_hash_path.c_str()); @@ -180,69 +256,50 @@ OLAPStatus OLAPEngine::load_one_tablet( return OLAP_ERR_ENGINE_LOAD_INDEX_TABLE_ERROR; } + // load pending data (for realtime push), will add transaction relationship into engine + olap_table->load_pending_data(); + OLAP_LOG_DEBUG("succeed to add table. [table=%s, path=%s]", olap_table->full_name().c_str(), schema_hash_path.c_str()); return OLAP_SUCCESS; } -void* load_root_path_thread_callback(void* arg) { - OLAPStatus res = OLAP_SUCCESS; - string root_path = (char*)arg; - - if ((res = OLAPEngine::get_instance()->_load_tables(root_path.c_str())) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("io error when init load tables. [res=%d root='%s']", - res, - root_path.c_str()); +void OLAPEngine::load_stores(const std::vector& stores) { + std::vector threads; + for (auto store : stores) { + threads.emplace_back([this, store] { + auto res = _load_store(store); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "io error when init load tables. res=" << res + << ", store=" << store->path(); + } + }); + } + for (auto& thread : threads) { + thread.join(); } - - return NULL; } -// @static -OLAPStatus OLAPEngine::_spawn_load_root_path_thread(pthread_t* thread, const string& root_path) { - sigset_t mask; - sigset_t omask; - - sigemptyset(&mask); - sigaddset(&mask, SIGCHLD); - sigaddset(&mask, SIGHUP); - sigaddset(&mask, SIGPIPE); - pthread_sigmask(SIG_SETMASK, &mask, &omask); - - OLAP_LOG_TRACE("spawn a schema_change thread."); - - int err = 0; - while ((err = pthread_create(thread, - NULL, - load_root_path_thread_callback, - reinterpret_cast(const_cast( - root_path.c_str())))) != 0) { - OLAP_LOG_WARNING("failed to spawn load root path thread."); - // Sleep 1s before next try - usleep(1000000); +OLAPStatus OLAPEngine::open() { + // init store_map + for (auto& path : _options.store_paths) { + OlapStore* store = new OlapStore(path.path, path.capacity_bytes); + auto st = store->load(); + if (!st.ok()) { + LOG(WARNING) << "Store load failed, path=" << path.path; + return OLAP_ERR_INVALID_ROOT_PATH; + } + _store_map.emplace(path.path, store); + } + _effective_cluster_id = config::cluster_id; + auto res = check_all_root_path_cluster_id(); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to check cluster info. 
res=" << res; + return res; } - pthread_sigmask(SIG_SETMASK, &omask, NULL); - return OLAP_SUCCESS; -} - -void OLAPEngine::load_root_paths(const OLAPRootPath::RootPathVec& root_paths) { - pthread_t* load_root_path_thread = new pthread_t [root_paths.size()]; - - for (uint32_t i = 0; i < root_paths.size(); i++) { - _spawn_load_root_path_thread(&load_root_path_thread[i], root_paths[i]); - } - - for (uint32_t i = 0; i < root_paths.size(); i++) { - pthread_join(load_root_path_thread[i], NULL); - } - - delete [] load_root_path_thread; -} - -OLAPStatus OLAPEngine::init() { - OLAPRootPath::RootPathVec all_available_root_path; + _update_storage_medium_type_count(); _file_descriptor_lru_cache = new_lru_cache(config::file_descriptor_cache_capacity); if (_file_descriptor_lru_cache == NULL) { @@ -261,23 +318,14 @@ OLAPStatus OLAPEngine::init() { } // åˆå§‹åŒ–CE调度器 - vector all_root_paths_info; - OLAPRootPath::get_instance()->get_all_root_path_info(&all_root_paths_info, false); - _cumulative_compaction_disk_stat.reserve(all_root_paths_info.size()); - for (uint32_t i = 0; i < all_root_paths_info.size(); i++) { - const RootPathInfo& info = all_root_paths_info[i]; - _cumulative_compaction_disk_stat.emplace_back(info.path, i, info.is_used); - _disk_id_map[info.path] = i; - } int32_t cumulative_compaction_num_threads = config::cumulative_compaction_num_threads; int32_t base_compaction_num_threads = config::base_compaction_num_threads; - uint32_t file_system_num = OLAPRootPath::get_instance()->get_file_system_count(); + uint32_t file_system_num = get_file_system_count(); _max_cumulative_compaction_task_per_disk = (cumulative_compaction_num_threads + file_system_num - 1) / file_system_num; _max_base_compaction_task_per_disk = (base_compaction_num_threads + file_system_num - 1) / file_system_num; - // 加载所有table - OLAPRootPath::get_instance()->get_all_available_root_path(&all_available_root_path); - load_root_paths(all_available_root_path); + auto stores = get_stores(); + load_stores(stores); // å–æ¶ˆæœªå®Œæˆçš„SchemaChange任务 _cancel_unfinished_schema_change(); @@ -285,23 +333,292 @@ OLAPStatus OLAPEngine::init() { return OLAP_SUCCESS; } +void OLAPEngine::_update_storage_medium_type_count() { + set available_storage_medium_types; + + std::lock_guard l(_store_lock); + for (auto& it : _store_map) { + if (it.second->is_used()) { + available_storage_medium_types.insert(it.second->storage_medium()); + } + } + + _available_storage_medium_type_count = available_storage_medium_types.size(); +} + + +OLAPStatus OLAPEngine::_judge_and_update_effective_cluster_id(int32_t cluster_id) { + OLAPStatus res = OLAP_SUCCESS; + + if (cluster_id == -1 && _effective_cluster_id == -1) { + // maybe this is a new cluster, cluster id will get from heartbeate + return res; + } else if (cluster_id != -1 && _effective_cluster_id == -1) { + _effective_cluster_id = cluster_id; + } else if (cluster_id == -1 && _effective_cluster_id != -1) { + // _effective_cluster_id is the right effective cluster id + return res; + } else { + if (cluster_id != _effective_cluster_id) { + OLAP_LOG_WARNING("multiple cluster ids is not equal. 
[id1=%d id2=%d]", + _effective_cluster_id, cluster_id); + return OLAP_ERR_INVALID_CLUSTER_INFO; + } + } + + return res; +} + +void OLAPEngine::set_store_used_flag(const string& path, bool is_used) { + std::lock_guard l(_store_lock); + auto it = _store_map.find(path); + if (it == _store_map.end()) { + LOG(WARNING) << "store not exist, path=" << path; + } + + it->second->set_is_used(is_used); + _update_storage_medium_type_count(); +} + +void OLAPEngine::get_all_available_root_path(std::vector* available_paths) { + available_paths->clear(); + std::lock_guard l(_store_lock); + for (auto& it : _store_map) { + if (it.second->is_used()) { + available_paths->push_back(it.first); + } + } +} + +template +std::vector OLAPEngine::get_stores() { + std::vector stores; + stores.reserve(_store_map.size()); + + std::lock_guard l(_store_lock); + if (include_unused) { + for (auto& it : _store_map) { + stores.push_back(it.second); + } + } else { + for (auto& it : _store_map) { + if (it.second->is_used()) { + stores.push_back(it.second); + } + } + } + return stores; +} + +template std::vector OLAPEngine::get_stores(); +template std::vector OLAPEngine::get_stores(); + +OLAPStatus OLAPEngine::get_all_root_path_info(vector* root_paths_info) { + OLAPStatus res = OLAP_SUCCESS; + root_paths_info->clear(); + + std::lock_guard l(_store_lock); + for (auto& it : _store_map) { + root_paths_info->emplace_back(it.second->to_root_path_info()); + } + + for (auto& info: *root_paths_info) { + if (info.is_used) { + _get_root_path_capacity(info.path, &info.data_used_capacity, &info.available); + } else { + info.capacity = 1; + info.data_used_capacity = 0; + info.available = 0; + } + } + + return res; +} + +OLAPStatus OLAPEngine::register_table_into_root_path(OLAPTable* olap_table) { + return olap_table->store()->register_table(olap_table); +} + +void OLAPEngine::start_disk_stat_monitor() { + for (auto& it : _store_map) { + it.second->health_check(); + } + _update_storage_medium_type_count(); + _delete_tables_on_unused_root_path(); + + // if drop tables + // notify disk_state_worker_thread and olap_table_worker_thread until they received + if (_is_drop_tables) { + disk_broken_cv.notify_all(); + + bool is_report_disk_state_expected = true; + bool is_report_olap_table_expected = true; + bool is_report_disk_state_exchanged = + is_report_disk_state_already.compare_exchange_strong(is_report_disk_state_expected, false); + bool is_report_olap_table_exchanged = + is_report_olap_table_already.compare_exchange_strong(is_report_olap_table_expected, false); + if (is_report_disk_state_exchanged && is_report_olap_table_exchanged) { + _is_drop_tables = false; + } + } +} + +bool OLAPEngine::_used_disk_not_enough(uint32_t unused_num, uint32_t total_num) { + return ((total_num == 0) || (unused_num * 100 / total_num > _min_percentage_of_error_disk)); +} + +OLAPStatus OLAPEngine::check_all_root_path_cluster_id() { + int32_t cluster_id = -1; + for (auto& it : _store_map) { + int32_t tmp_cluster_id = it.second->cluster_id(); + if (tmp_cluster_id == -1) { + _is_all_cluster_id_exist = false; + } else if (tmp_cluster_id == cluster_id) { + // both hava right cluster id, do nothing + } else if (cluster_id == -1) { + cluster_id = tmp_cluster_id; + } else { + LOG(WARNING) << "multiple cluster ids is not equal. 
one=" << cluster_id + << ", other=" << tmp_cluster_id; + return OLAP_ERR_INVALID_CLUSTER_INFO; + } + } + + // judge and get effective cluster id + OLAPStatus res = OLAP_SUCCESS; + res = _judge_and_update_effective_cluster_id(cluster_id); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to judge and update effective cluster id. [res=%d]", res); + return res; + } + + // write cluster id into cluster_id_path if get effective cluster id success + if (_effective_cluster_id != -1 && !_is_all_cluster_id_exist) { + set_cluster_id(_effective_cluster_id); + } + + return res; +} + +Status OLAPEngine::set_cluster_id(int32_t cluster_id) { + std::lock_guard l(_store_lock); + for (auto& it : _store_map) { + RETURN_IF_ERROR(it.second->set_cluster_id(cluster_id)); + } + _effective_cluster_id = cluster_id; + _is_all_cluster_id_exist = true; + return Status::OK; +} + +std::vector OLAPEngine::get_stores_for_create_table( + TStorageMedium::type storage_medium) { + std::vector stores; + { + std::lock_guard l(_store_lock); + for (auto& it : _store_map) { + if (it.second->is_used()) { + if (_available_storage_medium_type_count == 1 + || it.second->storage_medium() == storage_medium) { + stores.push_back(it.second); + } + } + } + } + + std::random_device rd; + srand(rd()); + std::random_shuffle(stores.begin(), stores.end()); + return stores; +} + +OlapStore* OLAPEngine::get_store(const std::string& path) { + std::lock_guard l(_store_lock); + auto it = _store_map.find(path); + if (it == std::end(_store_map)) { + return nullptr; + } + return it->second; +} + +void OLAPEngine::_delete_tables_on_unused_root_path() { + vector tablet_info_vec; + uint32_t unused_root_path_num = 0; + uint32_t total_root_path_num = 0; + + std::lock_guard l(_store_lock); + + for (auto& it : _store_map) { + total_root_path_num++; + if (it.second->is_used()) { + continue; + } + for (auto& tablet : it.second->_tablet_set) { + tablet_info_vec.push_back(tablet); + } + it.second->_tablet_set.clear(); + } + + if (_used_disk_not_enough(unused_root_path_num, total_root_path_num)) { + OLAP_LOG_FATAL("engine stop running, because more than %d disks error." + "[total_disks=%d error_disks=%d]", + _min_percentage_of_error_disk, + total_root_path_num, + unused_root_path_num); + exit(0); + } + + if (!tablet_info_vec.empty()) { + _is_drop_tables = true; + } + + OLAPEngine::get_instance()->drop_tables_on_error_root_path(tablet_info_vec); +} + +OLAPStatus OLAPEngine::_get_root_path_capacity( + const string& root_path, + int64_t* data_used, + int64_t* disk_available) { + OLAPStatus res = OLAP_SUCCESS; + int64_t used = 0; + + try { + path boost_root_path(root_path + DATA_PREFIX); + for (recursive_directory_iterator it(boost_root_path); + it != recursive_directory_iterator(); ++it) { + if (!is_directory(*it)) { + used += file_size(*it); + } + } + *data_used = used; + boost::filesystem::path path_name(root_path); + boost::filesystem::space_info path_info = boost::filesystem::space(path_name); + *disk_available = path_info.available; + } catch (boost::filesystem::filesystem_error& e) { + LOG(WARNING) << "get space info failed. 
path: " << root_path << " erro:" << e.what(); + return OLAP_ERR_STL_ERROR; + } + + return res; +} + OLAPStatus OLAPEngine::clear() { // 删除lru中所有内容,å…¶å®žè¿›ç¨‹é€€å‡ºè¿™ä¹ˆåšæœ¬èº«æ„义ä¸å¤§,但坹啿µ‹å’Œæ›´å®¹æ˜“å‘现问题还是有很大æ„义的 SAFE_DELETE(_file_descriptor_lru_cache); SAFE_DELETE(_index_stream_lru_cache); _tablet_map.clear(); + _transaction_tablet_map.clear(); _global_table_id = 0; return OLAP_SUCCESS; } -SmartOLAPTable OLAPEngine::_get_table_with_no_lock(TTabletId tablet_id, SchemaHash schema_hash) { +OLAPTablePtr OLAPEngine::_get_table_with_no_lock(TTabletId tablet_id, SchemaHash schema_hash) { OLAP_LOG_DEBUG("begin to get olap table. [table=%ld]", tablet_id); tablet_map_t::iterator it = _tablet_map.find(tablet_id); if (it != _tablet_map.end()) { - for (SmartOLAPTable table : it->second.table_arr) { + for (OLAPTablePtr table : it->second.table_arr) { if (table->equal(tablet_id, schema_hash)) { OLAP_LOG_DEBUG("get olap table success. [table=%ld]", tablet_id); return table; @@ -311,13 +628,13 @@ SmartOLAPTable OLAPEngine::_get_table_with_no_lock(TTabletId tablet_id, SchemaHa OLAP_LOG_DEBUG("fail to get olap table. [table=%ld]", tablet_id); // Return empty olap_table if fail - SmartOLAPTable olap_table; + OLAPTablePtr olap_table; return olap_table; } -SmartOLAPTable OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash) { +OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table) { _tablet_map_lock.rdlock(); - SmartOLAPTable olap_table; + OLAPTablePtr olap_table; olap_table = _get_table_with_no_lock(tablet_id, schema_hash); _tablet_map_lock.unlock(); @@ -325,7 +642,7 @@ SmartOLAPTable OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash if (!olap_table->is_used()) { OLAP_LOG_WARNING("olap table cannot be used. [table=%ld]", tablet_id); olap_table.reset(); - } else if (!olap_table->is_loaded()) { + } else if (load_table && !olap_table->is_loaded()) { if (olap_table->load() != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to load olap table. [table=%ld]", tablet_id); olap_table.reset(); @@ -338,14 +655,14 @@ SmartOLAPTable OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash OLAPStatus OLAPEngine::get_tables_by_id( TTabletId tablet_id, - list* table_list) { + list* table_list) { OLAPStatus res = OLAP_SUCCESS; OLAP_LOG_DEBUG("begin to get tables by id. [table=%ld]", tablet_id); _tablet_map_lock.rdlock(); tablet_map_t::iterator it = _tablet_map.find(tablet_id); if (it != _tablet_map.end()) { - for (SmartOLAPTable olap_table : it->second.table_arr) { + for (OLAPTablePtr olap_table : it->second.table_arr) { table_list->push_back(olap_table); } } @@ -356,7 +673,7 @@ OLAPStatus OLAPEngine::get_tables_by_id( return OLAP_ERR_TABLE_NOT_FOUND; } - for (std::list::iterator it = table_list->begin(); + for (std::list::iterator it = table_list->begin(); it != table_list->end();) { if (!(*it)->is_loaded()) { if ((*it)->load() != OLAP_SUCCESS) { @@ -387,17 +704,16 @@ bool OLAPEngine::check_tablet_id_exist(TTabletId tablet_id) { } OLAPStatus OLAPEngine::add_table(TTabletId tablet_id, SchemaHash schema_hash, - OLAPTable* table, bool force) { + const OLAPTablePtr& table, bool force) { OLAPStatus res = OLAP_SUCCESS; OLAP_LOG_DEBUG("begin to add olap table to OLAPEngine. 
[tablet_id=%ld schema_hash=%d], force: %d", tablet_id, schema_hash, force); _tablet_map_lock.wrlock(); - SmartOLAPTable smart_table(table, OLAPTableDestruction); - smart_table->set_id(_global_table_id++); + table->set_id(_global_table_id++); - SmartOLAPTable table_item; - for (SmartOLAPTable item : _tablet_map[tablet_id].table_arr) { + OLAPTablePtr table_item; + for (OLAPTablePtr item : _tablet_map[tablet_id].table_arr) { if (item->equal(tablet_id, schema_hash)) { table_item = item; break; @@ -405,7 +721,7 @@ OLAPStatus OLAPEngine::add_table(TTabletId tablet_id, SchemaHash schema_hash, } if (table_item.get() == NULL) { - _tablet_map[tablet_id].table_arr.push_back(smart_table); + _tablet_map[tablet_id].table_arr.push_back(table); _tablet_map[tablet_id].table_arr.sort(_sort_table_by_create_time); _tablet_map_lock.unlock(); @@ -414,18 +730,18 @@ OLAPStatus OLAPEngine::add_table(TTabletId tablet_id, SchemaHash schema_hash, _tablet_map_lock.unlock(); if (!force) { - if (table_item->header_file_name() == smart_table->header_file_name()) { - OLAP_LOG_WARNING("add the same tablet twice! [tablet_id=%ld schema_hash=%d]", - tablet_id, schema_hash); + if (table_item->tablet_path() == table->tablet_path()) { + LOG(WARNING) << "add the same tablet twice! tablet_id=" + << tablet_id << " schema_hash=" << tablet_id; return OLAP_ERR_ENGINE_INSERT_EXISTS_TABLE; } } table_item->obtain_header_rdlock(); - int64_t old_time = table_item->latest_version()->creation_time(); - int64_t new_time = smart_table->latest_version()->creation_time(); - int32_t old_version = table_item->latest_version()->end_version(); - int32_t new_version = smart_table->latest_version()->end_version(); + int64_t old_time = table_item->lastest_version()->creation_time(); + int64_t new_time = table->lastest_version()->creation_time(); + int32_t old_version = table_item->lastest_version()->end_version(); + int32_t new_version = table->lastest_version()->end_version(); table_item->release_header_lock(); /* @@ -442,21 +758,390 @@ OLAPStatus OLAPEngine::add_table(TTabletId tablet_id, SchemaHash schema_hash, || (new_version == old_version && new_time > old_time))) { drop_table(tablet_id, schema_hash, keep_files); _tablet_map_lock.wrlock(); - _tablet_map[tablet_id].table_arr.push_back(smart_table); + _tablet_map[tablet_id].table_arr.push_back(table); _tablet_map[tablet_id].table_arr.sort(_sort_table_by_create_time); _tablet_map_lock.unlock(); } else { - smart_table->mark_dropped(); + table->mark_dropped(); res = OLAP_ERR_ENGINE_INSERT_EXISTS_TABLE; } - OLAP_LOG_WARNING("add duplicated table. force: %d, [res=%d tablet_id=%ld schema_hash=%d " - "old_version=%d new_version=%d old_time=%ld new_time=%ld]", - force, res, tablet_id, schema_hash, - old_version, new_version, old_time, new_time); + LOG(WARNING) << "add duplicated table. 
force=" << force << ", res=" << res + << ", tablet_id=" << tablet_id << ", schema_hash=" << schema_hash + << ", old_version=" << old_version << ", new_version=" << new_version + << ", old_time=" << old_time << ", new_time=" << new_time + << ", old_tablet_path=" << table_item->tablet_path() + << ", new_tablet_path=" << table->tablet_path(); return res; } +OLAPStatus OLAPEngine::add_transaction( + TPartitionId partition_id, TTransactionId transaction_id, + TTabletId tablet_id, SchemaHash schema_hash, const PUniqueId& load_id) { + + pair key(partition_id, transaction_id); + TabletInfo tablet_info(tablet_id, schema_hash); + WriteLock wrlock(&_transaction_tablet_map_lock); + auto it = _transaction_tablet_map.find(key); + if (it != _transaction_tablet_map.end()) { + auto load_info = it->second.find(tablet_info); + if (load_info != it->second.end()) { + for (PUniqueId& pid : load_info->second) { + if (pid.hi() == load_id.hi() && pid.lo() == load_id.lo()) { + LOG(WARNING) << "find transaction exists when add to engine." + << "partition_id: " << key.first << ", " + << "transaction_id: " << key.second << ", " + << "table: " << tablet_info.to_string(); + return OLAP_ERR_PUSH_TRANSACTION_ALREADY_EXIST; + } + } + } + } + + _transaction_tablet_map[key][tablet_info].push_back(load_id); + VLOG(3) << "add transaction to engine successfully." + << "partition_id: " << key.first << ", " + << "transaction_id: " << key.second << ", " + << "table: " << tablet_info.to_string(); + return OLAP_SUCCESS; +} + +void OLAPEngine::delete_transaction( + TPartitionId partition_id, TTransactionId transaction_id, + TTabletId tablet_id, SchemaHash schema_hash, bool delete_from_tablet) { + + pair key(partition_id, transaction_id); + TabletInfo tablet_info(tablet_id, schema_hash); + WriteLock wrlock(&_transaction_tablet_map_lock); + + auto it = _transaction_tablet_map.find(key); + if (it != _transaction_tablet_map.end()) { + VLOG(3) << "delete transaction to engine successfully." + << "partition_id: " << key.first << ", " + << "transaction_id: " << key.second << ", " + << "table: " << tablet_info.to_string(); + it->second.erase(tablet_info); + if (it->second.empty()) { + _transaction_tablet_map.erase(it); + } + + // delete transaction from tablet + if (delete_from_tablet) { + OLAPTablePtr tablet = get_table(tablet_info.tablet_id, tablet_info.schema_hash); + if (tablet.get() != nullptr) { + tablet->delete_pending_data(transaction_id); + } + } + } +} + +void OLAPEngine::get_transactions_by_tablet(OLAPTablePtr tablet, int64_t* partition_id, + set* transaction_ids) { + if (tablet.get() == nullptr || partition_id == nullptr || transaction_ids == nullptr) { + OLAP_LOG_WARNING("parameter is null when get transactions by tablet"); + return; + } + + TabletInfo tablet_info(tablet->tablet_id(), tablet->schema_hash()); + ReadLock rdlock(&_transaction_tablet_map_lock); + for (auto& it : _transaction_tablet_map) { + if (it.second.find(tablet_info) != it.second.end()) { + *partition_id = it.first.first; + transaction_ids->insert(it.first.second); + VLOG(3) << "find transaction on tablet." 
+ << "partition_id: " << it.first.first << ", " + << "transaction_id: " << it.first.second << ", " + << "table: " << tablet_info.to_string(); + } + } +} + +bool OLAPEngine::has_transaction(TPartitionId partition_id, TTransactionId transaction_id, + TTabletId tablet_id, SchemaHash schema_hash) { + pair key(partition_id, transaction_id); + TabletInfo tablet_info(tablet_id, schema_hash); + + _transaction_tablet_map_lock.rdlock(); + auto it = _transaction_tablet_map.find(key); + bool found = it != _transaction_tablet_map.end() + && it->second.find(tablet_info) != it->second.end(); + _transaction_tablet_map_lock.unlock(); + + return found; +} + +OLAPStatus OLAPEngine::publish_version(const TPublishVersionRequest& publish_version_req, + vector* error_tablet_ids) { + LOG(INFO) << "begin to process publish version. transaction_id=" + << publish_version_req.transaction_id; + + int64_t transaction_id = publish_version_req.transaction_id; + OLAPStatus res = OLAP_SUCCESS; + + // each partition + for (const TPartitionVersionInfo& partitionVersionInfo + : publish_version_req.partition_version_infos) { + + int64_t partition_id = partitionVersionInfo.partition_id; + pair key(partition_id, transaction_id); + + _transaction_tablet_map_lock.rdlock(); + auto it = _transaction_tablet_map.find(key); + if (it == _transaction_tablet_map.end()) { + OLAP_LOG_WARNING("no tablet to publish version. [partition_id=%ld transaction_id=%ld]", + partition_id, transaction_id); + _transaction_tablet_map_lock.unlock(); + continue; + } + std::map> load_info_map = it->second; + _transaction_tablet_map_lock.unlock(); + + Version version(partitionVersionInfo.version, partitionVersionInfo.version); + VersionHash version_hash = partitionVersionInfo.version_hash; + + // each tablet + for (auto& load_info : load_info_map) { + const TabletInfo& tablet_info = load_info.first; + OLAP_LOG_DEBUG("begin to publish version on tablet. " + "[tablet_id=%ld schema_hash=%d version=%d version_hash=%ld transaction_id=%ld]", + tablet_info.tablet_id, tablet_info.schema_hash, + version.first, version_hash, transaction_id); + + OLAPTablePtr tablet = get_table(tablet_info.tablet_id, tablet_info.schema_hash); + + if (tablet.get() == NULL) { + OLAP_LOG_WARNING("can't get table when publish version. [tablet_id=%ld schema_hash=%d]", + tablet_info.tablet_id, tablet_info.schema_hash); + error_tablet_ids->push_back(tablet_info.tablet_id); + res = OLAP_ERR_PUSH_TABLE_NOT_EXIST; + continue; + } + + + // publish version + OLAPStatus publish_status = tablet->publish_version( + transaction_id, version, version_hash); + + // if data existed, delete transaction from engine and tablet + if (publish_status == OLAP_ERR_PUSH_VERSION_ALREADY_EXIST) { + OLAP_LOG_WARNING("can't publish version on tablet since data existed. " + "[table=%s transaction_id=%ld version=%d]", + tablet->full_name().c_str(), transaction_id, version.first); + delete_transaction(partition_id, transaction_id, + tablet->tablet_id(), tablet->schema_hash()); + + // if publish successfully, delete transaction from engine + } else if (publish_status == OLAP_SUCCESS) { + LOG(INFO) << "publish version successfully on tablet. [table=" << tablet->full_name() + << " transaction_id=" << transaction_id << " version=" << version.first << "]"; + _transaction_tablet_map_lock.wrlock(); + auto it2 = _transaction_tablet_map.find(key); + if (it2 != _transaction_tablet_map.end()) { + VLOG(3) << "delete transaction from engine. 
table=" << tablet->full_name() << ", " + << "transaction_id: " << transaction_id; + it2->second.erase(tablet_info); + if (it2->second.empty()) { + _transaction_tablet_map.erase(it2); + } + } + _transaction_tablet_map_lock.unlock(); + + } else { + OLAP_LOG_WARNING("fail to publish version on tablet. " + "[table=%s transaction_id=%ld version=%d res=%d]", + tablet->full_name().c_str(), transaction_id, + version.first, publish_status); + error_tablet_ids->push_back(tablet->tablet_id()); + res = publish_status; + } + } + } + + OLAP_LOG_INFO("finish to publish version on transaction. " + "[transaction_id=%ld, error_tablet_size=%d]", + transaction_id, error_tablet_ids->size()); + return res; +} + +void OLAPEngine::clear_transaction_task(const TTransactionId transaction_id, + const vector partition_ids) { + OLAP_LOG_INFO("begin to clear transaction task. [transaction_id=%ld]", transaction_id); + + // each partition + for (const TPartitionId& partition_id : partition_ids) { + + // get tablets in this transaction + pair key(partition_id, transaction_id); + + _transaction_tablet_map_lock.rdlock(); + auto it = _transaction_tablet_map.find(key); + if (it == _transaction_tablet_map.end()) { + OLAP_LOG_WARNING("no tablet to clear transaction. [partition_id=%ld transaction_id=%ld]", + partition_id, transaction_id); + _transaction_tablet_map_lock.unlock(); + continue; + } + std::map> load_info_map = it->second; + _transaction_tablet_map_lock.unlock(); + + // each tablet + for (auto& load_info : load_info_map) { + const TabletInfo& tablet_info = load_info.first; + delete_transaction(partition_id, transaction_id, + tablet_info.tablet_id, tablet_info.schema_hash); + } + } + + OLAP_LOG_INFO("finish to clear transaction task. [transaction_id=%ld]", transaction_id); +} + +OLAPStatus OLAPEngine::clone_incremental_data(OLAPTablePtr tablet, OLAPHeader& clone_header, + int64_t committed_version) { + OLAP_LOG_INFO("begin to incremental clone. [table=%s committed_version=%ld]", + tablet->full_name().c_str(), committed_version); + + // calculate missing version again + vector missing_versions; + tablet->get_missing_versions_with_header_locked(committed_version, &missing_versions); + + // add least complete version + // prevent lastest version not replaced (if need to rewrite) when restart + const PDelta* least_complete_version = tablet->least_complete_version(missing_versions); + + vector versions_to_delete; + vector versions_to_clone; + + // it's not a merged version in principle + if (least_complete_version != NULL && + least_complete_version->start_version() == least_complete_version->end_version()) { + + Version version(least_complete_version->start_version(), least_complete_version->end_version()); + const PDelta* clone_src_version = clone_header.get_incremental_version(version); + + // if least complete version not found in clone src, return error + if (clone_src_version == nullptr) { + OLAP_LOG_WARNING("failed to find least complete version in clone header. 
" + "[clone_header_file=%s least_complete_version=%d-%d]", + clone_header.file_name().c_str(), + least_complete_version->start_version(), least_complete_version->end_version()); + return OLAP_ERR_VERSION_NOT_EXIST; + + // if least complete version_hash in clone src is different, clone it + } else if (clone_src_version->version_hash() != least_complete_version->version_hash()) { + versions_to_clone.push_back(clone_src_version); + versions_to_delete.push_back(Version( + least_complete_version->start_version(), + least_complete_version->end_version())); + + OLAP_LOG_DEBUG("least complete version_hash in clone src is different, replace it. " + "[tablet=%s least_complete_version=%d-%d local_hash=%ld clone_hash=%ld]", + tablet->full_name().c_str(), + least_complete_version->start_version(), least_complete_version->end_version(), + least_complete_version->version_hash(), clone_src_version->version_hash()); + } + } + + OLAP_LOG_DEBUG("get missing versions again when incremental clone. " + "[table=%s committed_version=%ld missing_versions_size=%d]", + tablet->full_name().c_str(), committed_version, missing_versions.size()); + + // check missing versions exist in clone src + for (Version version : missing_versions) { + const PDelta* clone_src_version = clone_header.get_incremental_version(version); + if (clone_src_version == NULL) { + LOG(WARNING) << "missing version not found in clone src." + << "clone_header_file=" << clone_header.file_name() << ", " + << "missing_version=" << version.first << "-" << version.second; + return OLAP_ERR_VERSION_NOT_EXIST; + } + + versions_to_clone.push_back(clone_src_version); + } + + // clone_data to tablet + OLAPStatus clone_res = tablet->clone_data(clone_header, versions_to_clone, versions_to_delete); + LOG(INFO) << "finish to incremental clone. [table=" << tablet->full_name() << " res=" << clone_res << "]"; + return clone_res; +} + +OLAPStatus OLAPEngine::clone_full_data(OLAPTablePtr tablet, OLAPHeader& clone_header) { + Version clone_latest_version = clone_header.get_latest_version(); + LOG(INFO) << "begin to full clone. table=" << tablet->full_name() << "," + << "clone_latest_version=" << clone_latest_version.first << "-" << clone_latest_version.second; + vector versions_to_delete; + + // check local versions + for (int i = 0; i < tablet->file_delta_size(); i++) { + Version local_version(tablet->get_delta(i)->start_version(), + tablet->get_delta(i)->end_version()); + VersionHash local_version_hash = tablet->get_delta(i)->version_hash(); + LOG(INFO) << "check local delta when full clone." + << "table=" << tablet->full_name() << ", " + << "local_version=" << local_version.first << "-" << local_version.second; + + // if local version cross src latest, clone failed + if (local_version.first <= clone_latest_version.second + && local_version.second > clone_latest_version.second) { + LOG(WARNING) << "stop to full clone, version cross src latest." + << "table=" << tablet->full_name() << ", " + << "local_version=" << local_version.first << "-" << local_version.second; + return OLAP_ERR_TABLE_VERSION_DUPLICATE_ERROR; + + } else if (local_version.second <= clone_latest_version.second) { + // if local version smaller than src, check if existed in src, will not clone it + bool existed_in_src = false; + + // if delta labeled with local_version is same with the specified version in clone header, + // there is no necessity to clone it. 
+ for (int j = 0; j < clone_header.file_delta_size(); ++j) { + if (clone_header.get_delta(j)->start_version() == local_version.first + && clone_header.get_delta(j)->end_version() == local_version.second + && clone_header.get_delta(j)->version_hash() == local_version_hash) { + existed_in_src = true; + LOG(INFO) << "Delta has already existed in local header, no need to clone." + << "table=" << tablet->full_name() << ", " + << "version='" << local_version.first<< "-" << local_version.second << ", " + << "version_hash=" << local_version_hash; + + OLAPStatus delete_res = clone_header.delete_version(local_version); + if (delete_res != OLAP_SUCCESS) { + LOG(WARNING) << "failed to delete existed version from clone src when full clone. " + << "clone_header_file=" << clone_header.file_name() << ", " + << "version=" << local_version.first << "-" << local_version.second; + return delete_res; + } + break; + } + } + + // Delta labeled in local_version is not existed in clone header, + // some overlapping delta will be cloned to replace it. + // And also, the specified delta should deleted from local header. + if (!existed_in_src) { + versions_to_delete.push_back(local_version); + LOG(INFO) << "Delete delta not included by the clone header, should delete it from local header." + << "table=" << tablet->full_name() << "," + << "version=" << local_version.first<< "-" << local_version.second << ", " + << "version_hash=" << local_version_hash; + } + } + } + vector clone_deltas; + for (int i = 0; i < clone_header.file_delta_size(); ++i) { + clone_deltas.push_back(clone_header.get_delta(i)); + LOG(INFO) << "Delta to clone." + << "table=" << tablet->full_name() << "," + << "version=" << clone_header.get_delta(i)->start_version() << "-" + << clone_header.get_delta(i)->end_version() << ", " + << "version_hash=" << clone_header.get_delta(i)->version_hash(); + } + + // clone_data to tablet + OLAPStatus clone_res = tablet->clone_data(clone_header, clone_deltas, versions_to_delete); + LOG(INFO) << "finish to full clone. [table=" << tablet->full_name() << ", res=" << clone_res << "]"; + return clone_res; +} + // Drop table specified, the main logical is as follows: // 1. table not in schema change: // drop specified table directly; @@ -467,12 +1152,15 @@ OLAPStatus OLAPEngine::add_table(TTabletId tablet_id, SchemaHash schema_hash, // drop specified table and clear schema change info. OLAPStatus OLAPEngine::drop_table( TTabletId tablet_id, SchemaHash schema_hash, bool keep_files) { - OLAP_LOG_INFO("begin to drop olap table. [tablet_id=%ld]", tablet_id); + LOG(INFO) << "begin to process drop table." + << "table=" << tablet_id << ", schema_hash=" << schema_hash; + PaloMetrics::drop_tablet_requests_total.increment(1); + OLAPStatus res = OLAP_SUCCESS; // Get table which need to be droped _tablet_map_lock.rdlock(); - SmartOLAPTable dropped_table = _get_table_with_no_lock(tablet_id, schema_hash); + OLAPTablePtr dropped_table = _get_table_with_no_lock(tablet_id, schema_hash); _tablet_map_lock.unlock(); if (dropped_table.get() == NULL) { OLAP_LOG_WARNING("fail to drop not existed table. 
[tablet_id=%ld schema_hash=%d]", @@ -503,7 +1191,7 @@ OLAPStatus OLAPEngine::drop_table( bool is_drop_base_table = false; _tablet_map_lock.rdlock(); - SmartOLAPTable related_table = _get_table_with_no_lock( + OLAPTablePtr related_table = _get_table_with_no_lock( related_tablet_id, related_schema_hash); _tablet_map_lock.unlock(); if (related_table.get() == NULL) { @@ -524,16 +1212,18 @@ OLAPStatus OLAPEngine::drop_table( } // Drop specified table and clear schema change info + _tablet_map_lock.wrlock(); related_table->obtain_header_wrlock(); related_table->clear_schema_change_request(); res = related_table->save_header(); - related_table->release_header_lock(); if (res != OLAP_SUCCESS) { OLAP_LOG_FATAL("fail to save table header. [res=%d table=%s]", res, related_table->full_name().c_str()); } - res = _drop_table_directly(tablet_id, schema_hash, keep_files); + res = _drop_table_directly_unlocked(tablet_id, schema_hash, keep_files); + related_table->release_header_lock(); + _tablet_map_lock.unlock(); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to drop table which in schema change. [table=%s]", dropped_table->full_name().c_str()); @@ -546,18 +1236,24 @@ OLAPStatus OLAPEngine::drop_table( OLAPStatus OLAPEngine::_drop_table_directly( TTabletId tablet_id, SchemaHash schema_hash, bool keep_files) { - OLAPStatus res = OLAP_SUCCESS; _tablet_map_lock.wrlock(); + OLAPStatus res = _drop_table_directly_unlocked(tablet_id, schema_hash, keep_files); + _tablet_map_lock.unlock(); + return res; +} - SmartOLAPTable dropped_table = _get_table_with_no_lock(tablet_id, schema_hash); +OLAPStatus OLAPEngine::_drop_table_directly_unlocked( + TTabletId tablet_id, SchemaHash schema_hash, bool keep_files) { + OLAPStatus res = OLAP_SUCCESS; + + OLAPTablePtr dropped_table = _get_table_with_no_lock(tablet_id, schema_hash); if (dropped_table.get() == NULL) { OLAP_LOG_WARNING("fail to drop not existed table. [tablet_id=%ld schema_hash=%d]", tablet_id, schema_hash); - _tablet_map_lock.unlock(); return OLAP_ERR_TABLE_NOT_FOUND; } - for (list::iterator it = _tablet_map[tablet_id].table_arr.begin(); + for (list::iterator it = _tablet_map[tablet_id].table_arr.begin(); it != _tablet_map[tablet_id].table_arr.end();) { if ((*it)->equal(tablet_id, schema_hash)) { if (!keep_files) { @@ -573,8 +1269,7 @@ OLAPStatus OLAPEngine::_drop_table_directly( _tablet_map.erase(tablet_id); } - _tablet_map_lock.unlock(); - res = OLAPRootPath::get_instance()->unregister_table_from_root_path(dropped_table.get()); + res = dropped_table->store()->deregister_table(dropped_table.get()); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to unregister from root path. [res=%d table=%ld]", res, tablet_id); @@ -584,23 +1279,23 @@ OLAPStatus OLAPEngine::_drop_table_directly( } OLAPStatus OLAPEngine::drop_tables_on_error_root_path( - const vector& table_info_vec) { + const vector& tablet_info_vec) { OLAPStatus res = OLAP_SUCCESS; _tablet_map_lock.wrlock(); - for (const TableInfo& table_info : table_info_vec) { - TTabletId tablet_id = table_info.tablet_id; - TSchemaHash schema_hash = table_info.schema_hash; + for (const TabletInfo& tablet_info : tablet_info_vec) { + TTabletId tablet_id = tablet_info.tablet_id; + TSchemaHash schema_hash = tablet_info.schema_hash; OLAP_LOG_DEBUG("drop_table begin. 
[table=%ld schema_hash=%d]", tablet_id, schema_hash); - SmartOLAPTable dropped_table = _get_table_with_no_lock(tablet_id, schema_hash); + OLAPTablePtr dropped_table = _get_table_with_no_lock(tablet_id, schema_hash); if (dropped_table.get() == NULL) { OLAP_LOG_WARNING("dropping table not exist. [table=%ld schema_hash=%d]", tablet_id, schema_hash); continue; } else { - for (list::iterator it = _tablet_map[tablet_id].table_arr.begin(); + for (list::iterator it = _tablet_map[tablet_id].table_arr.begin(); it != _tablet_map[tablet_id].table_arr.end();) { if ((*it)->equal(tablet_id, schema_hash)) { it = _tablet_map[tablet_id].table_arr.erase(it); @@ -620,54 +1315,43 @@ OLAPStatus OLAPEngine::drop_tables_on_error_root_path( return res; } -OLAPTable* OLAPEngine::create_table( +OLAPTablePtr OLAPEngine::create_table( const TCreateTabletReq& request, const string* ref_root_path, - const bool is_schema_change_table, const SmartOLAPTable ref_olap_table) { - OLAPTable* olap_table = NULL; - // Get all available root paths, use ref_root_path if the caller specified - OLAPRootPath::RootPathVec all_available_root_path; - if (ref_root_path == NULL) { - OLAPRootPath* olap_root_path = OLAPRootPath::get_instance(); - olap_root_path->get_root_path_for_create_table( - request.storage_medium, &all_available_root_path); - if (all_available_root_path.size() == 0) { - OLAP_LOG_WARNING("there is no available disk that can be used to create table."); - return olap_table; + const bool is_schema_change_table, const OLAPTablePtr ref_olap_table) { + // Get all available stores, use ref_root_path if the caller specified + std::vector stores; + if (ref_root_path == nullptr) { + stores = get_stores_for_create_table(request.storage_medium); + if (stores.empty()) { + LOG(WARNING) << "there is no available disk that can be used to create table."; + return nullptr; } } else { - all_available_root_path.push_back(*ref_root_path); + stores.push_back(ref_olap_table->store()); } + OLAPTablePtr olap_table; // Try to create table on each of all_available_root_path, util success - string header_path; - for (string root_path : all_available_root_path) { - OLAPStatus res = _create_new_table_header_file(request, root_path, &header_path, is_schema_change_table, ref_olap_table); + for (auto& store : stores) { + OLAPHeader* header = new OLAPHeader(); + OLAPStatus res = _create_new_table_header(request, store, is_schema_change_table, ref_olap_table, header); if (res != OLAP_SUCCESS) { - if (is_io_error(res)) { - OLAP_LOG_WARNING("io error when creating table header. [res=%d root=%s]", - res, root_path.c_str()); - continue; - } else { - OLAP_LOG_WARNING("fail to create table header. [res=%d root=%s]", - res, root_path.c_str()); - break; - } + LOG(WARNING) << "fail to create table header. [res=" << res << " root=" << store->path(); + break; } - olap_table = OLAPTable::create_from_header_file( - request.tablet_id, request.tablet_schema.schema_hash, header_path); - if (olap_table == NULL) { - OLAP_LOG_WARNING("fail to load olap table from header. [header_path=%s]", - header_path.c_str()); - if (remove_parent_dir(header_path) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to remove header path. [header_path=%s]", - header_path.c_str()); - } - continue; + olap_table = OLAPTable::create_from_header(header, store); + if (olap_table == nullptr) { + LOG(WARNING) << "fail to load olap table from header. root_path:%s" << store->path(); + break; } - OLAP_LOG_DEBUG("success to create table from header. 
[header_path=%s]", - header_path.c_str()); + // commit header finally + res = OlapHeaderManager::save(store, request.tablet_id, request.tablet_schema.schema_hash, header); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to save header. [res=" << res << " root=" << store->path(); + break; + } break; } @@ -679,10 +1363,11 @@ OLAPStatus OLAPEngine::create_init_version(TTabletId tablet_id, SchemaHash schem OLAP_LOG_DEBUG("begin to create init version. [begin=%d end=%d]", version.first, version.second); - SmartOLAPTable table; + OLAPTablePtr table; IWriter* writer = NULL; - OLAPIndex* new_index = NULL; + Rowset* new_rowset = NULL; OLAPStatus res = OLAP_SUCCESS; + std::vector index_vec; do { if (version.first > version.second) { @@ -700,42 +1385,37 @@ OLAPStatus OLAPEngine::create_init_version(TTabletId tablet_id, SchemaHash schem break; } - new_index = new(nothrow) OLAPIndex(table.get(), version, version_hash, false, 0, 0); - if (new_index == NULL) { - OLAP_LOG_WARNING("fail to malloc index. [table=%s]", table->full_name().c_str()); + new_rowset = new(nothrow) Rowset(table.get(), version, version_hash, false, 0, 0); + if (new_rowset == NULL) { + LOG(WARNING) << "fail to malloc index. [table=" << table->full_name() << "]"; res = OLAP_ERR_MALLOC_ERROR; break; } // Create writer, which write nothing to table, to generate empty data file - writer = IWriter::create(table, new_index, false); + writer = IWriter::create(table, new_rowset, false); if (writer == NULL) { - OLAP_LOG_WARNING("fail to create writer. [table=%s]", table->full_name().c_str()); + LOG(WARNING) << "fail to create writer. [table=" << table->full_name() << "]"; res = OLAP_ERR_MALLOC_ERROR; break; } - res = writer->init(); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to init writer. [table=%s]", table->full_name().c_str()); - break; - } - res = writer->finalize(); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to finalize writer. [table=%s]", table->full_name().c_str()); + LOG(WARNING) << "fail to finalize writer. [table=" << table->full_name() << "]"; break; } // Load new index and add to table - res = new_index->load(); + res = new_rowset->load(); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to load new index. [table=%s]", table->full_name().c_str()); + LOG(WARNING) << "fail to load new index. [table=" << table->full_name() << "]"; break; } - AutoRWLock auto_lock(table->get_header_lock_ptr(), false); - res = table->register_data_source(new_index); + WriteLock wrlock(table->get_header_lock_ptr()); + index_vec.push_back(new_rowset); + res = table->register_data_source(index_vec); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to register index to data sources. [table=%s]", table->full_name().c_str()); @@ -744,21 +1424,21 @@ OLAPStatus OLAPEngine::create_init_version(TTabletId tablet_id, SchemaHash schem res = table->save_header(); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to save header. [table=%s]", table->full_name().c_str()); + LOG(WARNING) << "fail to save header. 
[table=" << table->full_name() << "]"; break; } } while (0); // Unregister index and delete files(index and data) if failed if (res != OLAP_SUCCESS && table.get() != NULL) { - OLAPIndex* unused_index = NULL; + std::vector unused_index; table->obtain_header_wrlock(); table->unregister_data_source(version, &unused_index); table->release_header_lock(); - if (new_index != NULL) { - new_index->delete_all_files(); - SAFE_DELETE(new_index); + for (Rowset* index : index_vec) { + index->delete_all_files(); + SAFE_DELETE(index); } } @@ -774,7 +1454,7 @@ bool OLAPEngine::try_schema_change_lock(TTabletId tablet_id) { tablet_map_t::iterator it = _tablet_map.find(tablet_id); if (it == _tablet_map.end()) { - OLAP_LOG_WARNING("table does not exists. [table=%ld]", tablet_id); + OLAP_LOG_WARNING("tablet does not exists. [table=%ld]", tablet_id); } else { res = (it->second.schema_change_lock.trylock() == OLAP_SUCCESS); } @@ -790,7 +1470,7 @@ void OLAPEngine::release_schema_change_lock(TTabletId tablet_id) { tablet_map_t::iterator it = _tablet_map.find(tablet_id); if (it == _tablet_map.end()) { - OLAP_LOG_WARNING("table does not exists. [table=%ld]", tablet_id); + OLAP_LOG_WARNING("tablet does not exists. [table=%ld]", tablet_id); } else { it->second.schema_change_lock.unlock(); } @@ -799,10 +1479,44 @@ void OLAPEngine::release_schema_change_lock(TTabletId tablet_id) { OLAP_LOG_DEBUG("release_schema_change_lock end. [table=%ld]", tablet_id); } +void OLAPEngine::_build_tablet_info(OLAPTablePtr olap_table, TTabletInfo* tablet_info) { + tablet_info->tablet_id = olap_table->tablet_id(); + tablet_info->schema_hash = olap_table->schema_hash(); + + olap_table->obtain_header_rdlock(); + tablet_info->row_count = olap_table->get_num_rows(); + tablet_info->data_size = olap_table->get_data_size(); + const PDelta* last_file_version = olap_table->lastest_version(); + if (last_file_version == NULL) { + tablet_info->version = -1; + tablet_info->version_hash = 0; + } else { + // report the version before first missing version + vector missing_versions; + olap_table->get_missing_versions_with_header_locked( + last_file_version->end_version(), &missing_versions); + const PDelta* least_complete_version = + olap_table->least_complete_version(missing_versions); + if (least_complete_version == NULL) { + tablet_info->version = -1; + tablet_info->version_hash = 0; + } else { + tablet_info->version = least_complete_version->end_version(); + tablet_info->version_hash = least_complete_version->version_hash(); + } + } + olap_table->release_header_lock(); +} + OLAPStatus OLAPEngine::report_tablet_info(TTabletInfo* tablet_info) { + PaloMetrics::report_tablet_requests_total.increment(1); + OLAP_LOG_INFO("begin to process report tablet info. " + "[table=%ld schema_hash=%d]", + tablet_info->tablet_id, tablet_info->schema_hash); + OLAPStatus res = OLAP_SUCCESS; - SmartOLAPTable olap_table = get_table( + OLAPTablePtr olap_table = get_table( tablet_info->tablet_id, tablet_info->schema_hash); if (olap_table.get() == NULL) { OLAP_LOG_WARNING("can't find table. 
[table=%ld schema_hash=%d]", @@ -810,36 +1524,19 @@ OLAPStatus OLAPEngine::report_tablet_info(TTabletInfo* tablet_info) { return OLAP_ERR_TABLE_NOT_FOUND; } - tablet_info->tablet_id = olap_table->tablet_id(); - tablet_info->schema_hash = olap_table->schema_hash(); - - olap_table->obtain_header_rdlock(); - tablet_info->row_count = olap_table->get_num_rows(); - tablet_info->data_size = olap_table->get_data_size(); - const FileVersionMessage* last_file_version = olap_table->latest_version(); - if (last_file_version == NULL) { - tablet_info->version = -1; - tablet_info->version_hash = 0; - } else { - tablet_info->version = last_file_version->end_version(); - tablet_info->version_hash = last_file_version->version_hash(); - } - olap_table->release_header_lock(); - + _build_tablet_info(olap_table, tablet_info); + OLAP_LOG_INFO("success to process report tablet info."); return res; } -OLAPStatus OLAPEngine::report_all_tablets_info( - map* tablets_info) { - OLAP_LOG_DEBUG("begin to get all tablet info."); +OLAPStatus OLAPEngine::report_all_tablets_info(std::map* tablets_info) { + OLAP_LOG_INFO("begin to process report all tablets info."); + PaloMetrics::report_all_tablets_requests_total.increment(1); if (tablets_info == NULL) { return OLAP_ERR_INPUT_PARAMETER_ERROR; } - uint32_t available_storage_medium_type_count = - OLAPRootPath::get_instance()->available_storage_medium_type_count(); - _tablet_map_lock.rdlock(); for (const auto& item : _tablet_map) { if (item.second.table_arr.size() == 0) { @@ -847,36 +1544,24 @@ OLAPStatus OLAPEngine::report_all_tablets_info( } TTablet tablet; - for (SmartOLAPTable olap_table : item.second.table_arr) { + for (OLAPTablePtr olap_table : item.second.table_arr) { if (olap_table.get() == NULL) { continue; } TTabletInfo tablet_info; - tablet_info.tablet_id = olap_table->tablet_id(); - tablet_info.schema_hash = olap_table->schema_hash(); + _build_tablet_info(olap_table, &tablet_info); - olap_table->obtain_header_rdlock(); - tablet_info.row_count = olap_table->get_num_rows(); - tablet_info.data_size = olap_table->get_data_size(); - const FileVersionMessage* last_file_version = olap_table->latest_version(); - if (last_file_version == NULL) { - tablet_info.version = -1; - tablet_info.version_hash = 0; - } else { - tablet_info.version = last_file_version->end_version(); - tablet_info.version_hash = last_file_version->version_hash(); - } - olap_table->release_header_lock(); + // report expire transaction + vector transaction_ids; + olap_table->get_expire_pending_data(&transaction_ids); + tablet_info.__set_transaction_ids(transaction_ids); - if (available_storage_medium_type_count > 1) { - tablet_info.__set_storage_medium(TStorageMedium::HDD); - if (OLAPRootPath::is_ssd_disk(olap_table->storage_root_path_name())) { - tablet_info.__set_storage_medium(TStorageMedium::SSD); - } + if (_available_storage_medium_type_count > 1) { + tablet_info.__set_storage_medium(olap_table->store()->storage_medium()); } - tablet_info.__set_version_count(olap_table->file_version_size()); + tablet_info.__set_version_count(olap_table->file_delta_size()); tablet.tablet_infos.push_back(tablet_info); } @@ -886,298 +1571,149 @@ OLAPStatus OLAPEngine::report_all_tablets_info( } _tablet_map_lock.unlock(); - OLAP_LOG_DEBUG("success to get all tablets info. [tablet_num=%u]", - tablets_info->size()); + LOG(INFO) << "success to process report all tablets info. 
tablet_num=" << tablets_info->size(); return OLAP_SUCCESS; } +void OLAPEngine::get_tablet_stat(TTabletStatResult& result) { + OLAP_LOG_DEBUG("begin to get all tablet stat."); + + // get current time + int64_t current_time = UnixMillis(); + + _tablet_map_lock.wrlock(); + // update cache if too old + if (current_time - _tablet_stat_cache_update_time_ms > + config::tablet_stat_cache_update_interval_second * 1000) { + OLAP_LOG_DEBUG("update tablet stat."); + _build_tablet_stat(); + } + + result.__set_tablets_stats(_tablet_stat_cache); + + _tablet_map_lock.unlock(); +} + +void OLAPEngine::_build_tablet_stat() { + _tablet_stat_cache.clear(); + for (const auto& item : _tablet_map) { + if (item.second.table_arr.size() == 0) { + continue; + } + + TTabletStat stat; + stat.tablet_id = item.first; + for (OLAPTablePtr olap_table : item.second.table_arr) { + if (olap_table.get() == NULL) { + continue; + } + + // we only get base tablet's stat + stat.__set_data_size(olap_table->get_data_size()); + stat.__set_row_num(olap_table->get_num_rows()); + OLAP_LOG_DEBUG("tablet %d get data size: %d, row num %d", + item.first, olap_table->get_data_size(), + olap_table->get_num_rows()); + break; + } + + _tablet_stat_cache.emplace(item.first, stat); + } + + _tablet_stat_cache_update_time_ms = UnixMillis(); +} + +bool OLAPEngine::_can_do_compaction(OLAPTablePtr table) { + // 如果table正在åšschema changeï¼Œåˆ™é€šè¿‡é€‰è·¯åˆ¤æ–­æ•°æ®æ˜¯å¦è½¬æ¢å®Œæˆ + // 如果选路æˆåŠŸï¼Œåˆ™è½¬æ¢å®Œæˆï¼Œå¯ä»¥è¿›è¡ŒBE + // å¦‚æžœé€‰è·¯å¤±è´¥ï¼Œåˆ™è½¬æ¢æœªå®Œæˆï¼Œä¸èƒ½è¿›è¡ŒBE + table->obtain_header_rdlock(); + const PDelta* lastest_version = table->lastest_version(); + if (lastest_version == NULL) { + table->release_header_lock(); + return false; + } + + if (table->is_schema_changing()) { + Version test_version = Version(0, lastest_version->end_version()); + vector path_versions; + if (OLAP_SUCCESS != table->select_versions_to_span(test_version, &path_versions)) { + table->release_header_lock(); + return false; + } + } + table->release_header_lock(); + + return true; +} + void OLAPEngine::start_clean_fd_cache() { OLAP_LOG_TRACE("start clean file descritpor cache"); _file_descriptor_lru_cache->prune(); OLAP_LOG_TRACE("end clean file descritpor cache"); } -void OLAPEngine::start_base_compaction(string* last_base_compaction_fs, TTabletId* last_base_compaction_tablet_id) { +void OLAPEngine::perform_cumulative_compaction() { + OLAPTablePtr best_table = _find_best_tablet_to_compaction(CompactionType::CUMULATIVE_COMPACTION); + if (best_table == nullptr) { return; } - { - std::lock_guard l(_base_compaction_queue_lock); - if (!_base_compaction_tablet_queue.empty()) { - TableInfo& tablet_info = _base_compaction_tablet_queue.front(); - _tablet_map_lock.rdlock(); - SmartOLAPTable table = OLAPEngine::get_instance()->get_table( - tablet_info.tablet_id, tablet_info.schema_hash); - _tablet_map_lock.unlock(); - _base_compaction_tablet_queue.pop(); - if (table == nullptr) { - return; - } - if (!table->is_loaded() || !table->can_do_compaction()) { - return; - } - BaseCompaction base_compaction; - OLAPStatus res = base_compaction.init(table, true); - if (res != OLAP_SUCCESS) { - LOG(WARNING) << "failed to init base compaction. table=" << table->full_name(); - return; - } - - res = base_compaction.run(); - if (res != OLAP_SUCCESS) { - LOG(WARNING) << "failed to do base compaction. 
table=" << table->full_name(); - return; - } - return; - } - } - - uint64_t base_compaction_start_hour = config::base_compaction_start_hour; - uint64_t base_compaction_end_hour = config::base_compaction_end_hour; - time_t current_time = time(NULL); - uint64_t current_hour = localtime(¤t_time)->tm_hour; - // 如果执行BE的时间区间设置为类似以下的形å¼ï¼š[1:00, 8:00) - if (base_compaction_start_hour <= base_compaction_end_hour) { - if (current_hour < base_compaction_start_hour - || current_hour >= base_compaction_end_hour) { - OLAP_LOG_TRACE("don't allow to excute base compaction in this time interval. " - "[now_hour=%d; allow_start_time=%d; allow_end_time=%d]", - current_hour, - base_compaction_start_hour, - base_compaction_end_hour); - return; - } - } else { // 如果执行BE的时间区间设置为类似以下的形å¼ï¼š[22:00, 8:00) - if (current_hour < base_compaction_start_hour - && current_hour >= base_compaction_end_hour) { - OLAP_LOG_TRACE("don't allow to excute base compaction in this time interval. " - "[now_hour=%d; allow_start_time=%d; allow_end_time=%d]", - current_hour, - base_compaction_start_hour, - base_compaction_end_hour); - return; - } - } - - SmartOLAPTable tablet; - BaseCompaction base_compaction; - - bool do_base_compaction = false; - OLAP_LOG_TRACE("start_base_compaction begin."); - _tablet_map_lock.rdlock(); - _fs_task_mutex.lock(); - - if (*last_base_compaction_fs != "") { - _fs_base_compaction_task_num_map[*last_base_compaction_fs] -= 1; - last_base_compaction_fs->clear(); - } - - for (const auto& i : _tablet_map) { - for (SmartOLAPTable j : i.second.table_arr) { - // ä¿è¯ä»Žä¸Šä¸€æ¬¡è¢«é€‰ä¸­è¿›è¡ŒBE的表开始轮询 - if (i.first <= *last_base_compaction_tablet_id) { - continue; - } - - if (_fs_base_compaction_task_num_map[j->storage_root_path_name()] >= _max_base_compaction_task_per_disk) { - continue; - } - - // 跳过正在åšschema changeçš„tablet - if (!j->can_do_compaction()) { - OLAP_LOG_DEBUG("skip tablet, it is schema changing. [tablet=%s]", - j->full_name().c_str()); - continue; - } - - if (base_compaction.init(j, false) == OLAP_SUCCESS) { - tablet = j; - do_base_compaction = true; - _fs_base_compaction_task_num_map[tablet->storage_root_path_name()] += 1; - *last_base_compaction_fs = tablet->storage_root_path_name(); - *last_base_compaction_tablet_id = i.first; - goto TRY_START_BE_OK; - } - } - } - - // when the loop comes the end, restart from begin - *last_base_compaction_tablet_id = -1; - -TRY_START_BE_OK: - _fs_task_mutex.unlock(); - _tablet_map_lock.unlock(); - OLAP_LOG_TRACE("start_base_compaction end."); - - if (do_base_compaction) { - OLAP_LOG_NOTICE_PUSH("request", "START_BASE_COMPACTION"); - PaloMetrics::base_compaction_request_total.increment(1); - OLAPStatus cmd_res = base_compaction.run(); - if (cmd_res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to do base compaction. 
[tablet='%s']", - tablet->full_name().c_str()); - PaloMetrics::base_compaction_request_failed.increment(1); - } - } -} - -void OLAPEngine::_select_candidate() { - // è¿™æ˜¯ä¸€ä¸ªå°æ ¹å †ï¼Œç”¨äºŽè®°å½•nice最大的top k个candidate tablet - SmartOLAPTable tablet; - typedef priority_queue, - CompactionCandidateComparator> candidate_heap_t; - vector candidate_heap_vec(_cumulative_compaction_disk_stat.size()); - for (const auto& i : _tablet_map) { - uint32_t nice = 0; - // calc nice - for (SmartOLAPTable j : i.second.table_arr) { - if (!j->is_loaded()) { - continue; - } - - j->obtain_header_rdlock(); - // const uint32_t curr_nice = j->get_compaction_nice_estimate(); - const uint32_t curr_nice = j->file_version_size(); - j->release_header_lock(); - nice = curr_nice > nice ? curr_nice : nice; - tablet = j; - } - - // save - if (nice > 0) { - uint32_t disk_id = _disk_id_map[tablet->storage_root_path_name()]; - candidate_heap_vec[disk_id].emplace(nice, i.first, disk_id); - if (candidate_heap_vec[disk_id].size() > OLAP_COMPACTION_DEFAULT_CANDIDATE_SIZE) { - candidate_heap_vec[disk_id].pop(); - } - } - } - - _cumulative_compaction_candidate.clear(); - for (auto& stat : _cumulative_compaction_disk_stat) { - stat.task_remaining = 0; - } - - for (auto& candidate_heap : candidate_heap_vec) { - while (!candidate_heap.empty()) { - _cumulative_compaction_candidate.push_back(candidate_heap.top()); - ++_cumulative_compaction_disk_stat[candidate_heap.top().disk_index].task_remaining; - candidate_heap.pop(); - } - } - - // sort small to big - sort(_cumulative_compaction_candidate.rbegin(), _cumulative_compaction_candidate.rend(), CompactionCandidateComparator()); -} - -void OLAPEngine::start_cumulative_priority() { - { - std::lock_guard l(_cumulative_compaction_queue_lock); - if (!_cumulative_compaction_tablet_queue.empty()) { - TableInfo& tablet_info = _cumulative_compaction_tablet_queue.front(); - _tablet_map_lock.rdlock(); - SmartOLAPTable table = OLAPEngine::get_instance()->get_table( - tablet_info.tablet_id, tablet_info.schema_hash); - _tablet_map_lock.unlock(); - _cumulative_compaction_tablet_queue.pop(); - if (table == nullptr) { - return; - } - if (!table->is_loaded() || !table->can_do_compaction()) { - return; - } - CumulativeCompaction cumulative_compaction; - OLAPStatus res = cumulative_compaction.init(table); - if (res != OLAP_SUCCESS) { - LOG(WARNING) << "failed to init cumulative compaction. table=" << table->full_name(); - return; - } - - res = cumulative_compaction.run(); - if (res != OLAP_SUCCESS) { - LOG(WARNING) << "failed to do cumulative compaction. table=" << table->full_name(); - return; - } - return; - } - } - - _tablet_map_lock.rdlock(); - _fs_task_mutex.lock(); - - // determine whether to select candidate or not - bool is_select = false; - vector all_root_paths_info; - OLAPRootPath::get_instance()->get_all_root_path_info(&all_root_paths_info, false); - for (uint32_t i = 0; i < all_root_paths_info.size(); i++) { - uint32_t disk_id = _disk_id_map[all_root_paths_info[i].path]; - _cumulative_compaction_disk_stat[disk_id].is_used = all_root_paths_info[i].is_used; - } - - for (auto& disk : _cumulative_compaction_disk_stat) { - if (!disk.task_remaining && disk.is_used) { - is_select = true; - } - } - - if (is_select) { - _select_candidate(); - } - - // traverse _cumulative_compaction_candidate to start cumulative compaction - OLAP_LOG_INFO("begin to traverse cumulative_compaction_candidate. 
size: %d", _cumulative_compaction_candidate.size()); CumulativeCompaction cumulative_compaction; - for (auto it_cand = _cumulative_compaction_candidate.rbegin(); it_cand != _cumulative_compaction_candidate.rend(); ++it_cand) { - CompactionCandidate candidate = *it_cand; - const auto i = _tablet_map.find(candidate.tablet_id); - if (i == _tablet_map.end()) { - // tabletå·²ç»ä¸å­˜åœ¨ - _cumulative_compaction_candidate.erase(it_cand.base() - 1); - --_cumulative_compaction_disk_stat[candidate.disk_index].task_remaining; - continue; - } + OLAPStatus res = cumulative_compaction.init(best_table); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "failed to init cumulative compaction." + << "table=" << best_table->full_name(); + } - if (_cumulative_compaction_disk_stat[candidate.disk_index].task_running >= _max_cumulative_compaction_task_per_disk) { - OLAP_LOG_DEBUG("skip tablet, too much ce task on disk %s", - _cumulative_compaction_disk_stat[candidate.disk_index].storage_path.c_str()); - // æŸä¸ªdisk上任务数太多,跳过,candidate中ä¿ç•™è¿™ä¸ªä»»åŠ¡ - continue; - } + res = cumulative_compaction.run(); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "failed to do cumulative compaction." + << "table=" << best_table->full_name(); + } +} - for (SmartOLAPTable j : i->second.table_arr) { - if (!j->can_do_compaction()) { - OLAP_LOG_DEBUG("skip tablet, it is schema changing. [tablet=%s]", - j->full_name().c_str()); +void OLAPEngine::perform_base_compaction() { + OLAPTablePtr best_table = _find_best_tablet_to_compaction(CompactionType::BASE_COMPACTION); + if (best_table == nullptr) { return; } + + BaseCompaction base_compaction; + OLAPStatus res = base_compaction.init(best_table); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "failed to init base compaction." + << "table=" << best_table->full_name(); + return; + } + + res = base_compaction.run(); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "failed to init base compaction." + << "table=" << best_table->full_name(); + } +} + +OLAPTablePtr OLAPEngine::_find_best_tablet_to_compaction(CompactionType compaction_type) { + ReadLock tablet_map_rdlock(&_tablet_map_lock); + uint32_t highest_score = 0; + OLAPTablePtr best_table; + for (tablet_map_t::value_type& table_ins : _tablet_map){ + for (OLAPTablePtr& table_ptr : table_ins.second.table_arr) { + if (!table_ptr->is_loaded() || !_can_do_compaction(table_ptr)) { continue; } - if (cumulative_compaction.init(j) == OLAP_SUCCESS) { - OLAP_LOG_INFO("begin to do cumulative for tablet: %s, version num: %d", j->full_name().c_str(), j->file_version_size()); - _cumulative_compaction_candidate.erase(it_cand.base() - 1); - --_cumulative_compaction_disk_stat[candidate.disk_index].task_remaining; - ++_cumulative_compaction_disk_stat[candidate.disk_index].task_running; - _fs_task_mutex.unlock(); - _tablet_map_lock.unlock(); - - // start cumulative - PaloMetrics::cumulative_compaction_request_total.increment(1); - if (cumulative_compaction.run() != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to do cumulative. 
[tablet='%s']", - j->full_name().c_str()); - PaloMetrics::cumulative_compaction_request_failed.increment(1); - } - - _fs_task_mutex.lock(); - --_cumulative_compaction_disk_stat[candidate.disk_index].task_running; - _fs_task_mutex.unlock(); - return; + ReadLock rdlock(table_ptr->get_header_lock_ptr()); + uint32_t table_score = 0; + if (compaction_type == CompactionType::BASE_COMPACTION) { + table_score = table_ptr->get_base_compaction_score(); + } else if (compaction_type == CompactionType::CUMULATIVE_COMPACTION) { + table_score = table_ptr->get_cumulative_compaction_score(); + } + if (table_score > highest_score) { + highest_score = table_score; + best_table = table_ptr; } } - // 这个tabletä¸é€‚åˆåšce - _cumulative_compaction_candidate.erase(it_cand.base() - 1); - --_cumulative_compaction_disk_stat[candidate.disk_index].task_remaining; } - _fs_task_mutex.unlock(); - _tablet_map_lock.unlock(); - OLAP_LOG_TRACE("no tablet selected to do cumulative compaction this loop."); + return best_table; } void OLAPEngine::get_cache_status(rapidjson::Document* document) const { @@ -1192,7 +1728,7 @@ OLAPStatus OLAPEngine::start_trash_sweep(double* usage) { const uint32_t trash_expire = config::trash_file_expire_time_sec; const double guard_space = config::disk_capacity_insufficient_percentage / 100.0; std::vector root_paths_info; - res = OLAPRootPath::get_instance()->get_all_root_path_info(&root_paths_info); + res = get_all_root_path_info(&root_paths_info); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("failed to get root path stat info when sweep trash."); return res; @@ -1234,6 +1770,18 @@ OLAPStatus OLAPEngine::start_trash_sweep(double* usage) { } } + // clear expire incremental rowset + _tablet_map_lock.rdlock(); + for (const auto& item : _tablet_map) { + for (OLAPTablePtr olap_table : item.second.table_arr) { + if (olap_table.get() == NULL) { + continue; + } + olap_table->delete_expire_incremental_data(); + } + } + _tablet_map_lock.unlock(); + return res; } @@ -1255,7 +1803,7 @@ OLAPStatus OLAPEngine::_do_sweep( string str_time = dir_name.substr(0, dir_name.find('.')); tm local_tm_create; if (strptime(str_time.c_str(), "%Y%m%d%H%M%S", &local_tm_create) == nullptr) { - OLAP_LOG_WARNING("fail to strptime time. [time=%lu]", str_time.c_str()); + LOG(WARNING) << "fail to strptime time. [time=" << str_time << "]"; res = OLAP_ERR_OS_ERROR; continue; } @@ -1277,63 +1825,54 @@ OLAPStatus OLAPEngine::_do_sweep( return res; } -OLAPStatus OLAPEngine::_create_new_table_header_file( - const TCreateTabletReq& request, const string& root_path, string* header_path, - const bool is_schema_change_table, const SmartOLAPTable ref_olap_table) { - OLAPStatus res = OLAP_SUCCESS; - +OLAPStatus OLAPEngine::_create_new_table_header( + const TCreateTabletReq& request, + OlapStore* store, + const bool is_schema_change_table, + const OLAPTablePtr ref_olap_table, + OLAPHeader* header) { uint64_t shard = 0; - res = OLAPRootPath::get_instance()->get_root_path_shard(root_path, &shard); + OLAPStatus res = store->get_shard(&shard); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get root path shard. [res=%d]", res); + LOG(WARNING) << "fail to get root path shard. 
res=" << res; return res; } - - // Generate header path info: header_path = header_dir + "/" + header_file - stringstream header_dir_stream; - header_dir_stream << root_path + stringstream schema_hash_dir_stream; + schema_hash_dir_stream << store->path() << DATA_PREFIX << "/" << shard << "/" << request.tablet_id << "/" << request.tablet_schema.schema_hash; - string header_dir = header_dir_stream.str(); - - stringstream header_file_stream; - header_file_stream << request.tablet_id << ".hdr"; - string header_file = header_file_stream.str(); - - res = _check_existed_or_else_create_dir(header_dir); + string schema_hash_dir = schema_hash_dir_stream.str(); + if (check_dir_existed(schema_hash_dir)) { + LOG(WARNING) << "failed to create the dir that existed. path=" << schema_hash_dir; + return OLAP_ERR_CANNOT_CREATE_DIR; + } + res = create_dirs(schema_hash_dir); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("dir existed or create dir fail. [res=%d]", res); + LOG(WARNING) << "create dir fail. [res=" << res << " path:" << schema_hash_dir; return res; } - - // Generate and Initialize OLAPHeader - string header_path_tmp = header_dir + "/" + header_file; - OLAPHeader header(header_path_tmp); // set basic information - header.set_num_short_key_fields(request.tablet_schema.short_key_column_count); - header.set_compress_kind(COMPRESS_LZ4); - + header->set_num_short_key_fields(request.tablet_schema.short_key_column_count); + header->set_compress_kind(COMPRESS_LZ4); if (request.tablet_schema.keys_type == TKeysType::DUP_KEYS) { - header.set_keys_type(KeysType::DUP_KEYS); + header->set_keys_type(KeysType::DUP_KEYS); } else if (request.tablet_schema.keys_type == TKeysType::UNIQUE_KEYS) { - header.set_keys_type(KeysType::UNIQUE_KEYS); + header->set_keys_type(KeysType::UNIQUE_KEYS); } else { - header.set_keys_type(KeysType::AGG_KEYS); + header->set_keys_type(KeysType::AGG_KEYS); } - if (request.tablet_schema.storage_type == TStorageType::COLUMN) { - header.set_data_file_type(COLUMN_ORIENTED_FILE); - header.set_segment_size(OLAP_MAX_COLUMN_SEGMENT_FILE_SIZE); - header.set_num_rows_per_data_block(config::default_num_rows_per_column_file_block); + header->set_data_file_type(COLUMN_ORIENTED_FILE); + header->set_segment_size(OLAP_MAX_COLUMN_SEGMENT_FILE_SIZE); + header->set_num_rows_per_data_block(config::default_num_rows_per_column_file_block); } else { - header.set_data_file_type(OLAP_DATA_FILE); - header.set_segment_size(OLAP_MAX_SEGMENT_FILE_SIZE); - header.set_num_rows_per_data_block(config::default_num_rows_per_data_block); + header->set_data_file_type(OLAP_DATA_FILE); + header->set_segment_size(OLAP_MAX_SEGMENT_FILE_SIZE); + header->set_num_rows_per_data_block(config::default_num_rows_per_data_block); } - // set column information uint32_t i = 0; uint32_t key_count = 0; @@ -1345,145 +1884,120 @@ OLAPStatus OLAPEngine::_create_new_table_header_file( for (TColumn column : request.tablet_schema.columns) { if (column.column_type.type == TPrimitiveType::VARCHAR && i < request.tablet_schema.short_key_column_count - 1) { - OLAP_LOG_WARNING("varchar type column should be the last short key."); - remove_dir(header_dir); + LOG(WARNING) << "varchar type column should be the last short key."; return OLAP_ERR_SCHEMA_SCHEMA_INVALID; } - - header.add_column(); + header->add_column(); if (true == is_schema_change_table) { /* - * schema changeçš„old_olap_tableå’Œnew_olap_tableçš„schema进行比较 - * 1. 新表的列å在旧表中存在,则新表相应列的unique_idå¤ç”¨æ—§è¡¨åˆ—çš„unique_id - * 2. 
新表的列å在旧表中ä¸å­˜åœ¨ï¼Œåˆ™æ–°è¡¨ç›¸åº”列的unique_id设为旧表列的next_unique_id - * + * for schema change, compare old_olap_table and new_olap_table + * 1. if column in both new_olap_table and old_olap_table, + * assign unique_id of old_olap_table to the column of new_olap_table + * 2. if column exists only in new_olap_table, assign next_unique_id of old_olap_table + * to the new column + * */ size_t field_num = ref_olap_table->tablet_schema().size(); size_t field_off = 0; for (field_off = 0; field_off < field_num; ++field_off) { if (ref_olap_table->tablet_schema()[field_off].name == column.column_name) { uint32_t unique_id = ref_olap_table->tablet_schema()[field_off].unique_id; - header.mutable_column(i)->set_unique_id(unique_id); + header->mutable_column(i)->set_unique_id(unique_id); break; } } if (field_off == field_num) { - header.mutable_column(i)->set_unique_id(next_unique_id++); + header->mutable_column(i)->set_unique_id(next_unique_id++); } } else { - header.mutable_column(i)->set_unique_id(i); + header->mutable_column(i)->set_unique_id(i); } - header.mutable_column(i)->set_name(column.column_name); - header.mutable_column(i)->set_is_root_column(true); + header->mutable_column(i)->set_name(column.column_name); + header->mutable_column(i)->set_is_root_column(true); string data_type; EnumToString(TPrimitiveType, column.column_type.type, data_type); - header.mutable_column(i)->set_type(data_type); - + header->mutable_column(i)->set_type(data_type); if (column.column_type.type == TPrimitiveType::DECIMAL) { if (column.column_type.__isset.precision && column.column_type.__isset.scale) { - header.mutable_column(i)->set_precision(column.column_type.precision); - header.mutable_column(i)->set_frac(column.column_type.scale); + header->mutable_column(i)->set_precision(column.column_type.precision); + header->mutable_column(i)->set_frac(column.column_type.scale); } else { - OLAP_LOG_WARNING("decimal type column should set precision and frac."); - remove_dir(header_dir); + LOG(WARNING) << "decimal type column should set precision and frac."; return OLAP_ERR_SCHEMA_SCHEMA_INVALID; } - } - + } if (column.column_type.type == TPrimitiveType::CHAR || column.column_type.type == TPrimitiveType::VARCHAR || column.column_type.type == TPrimitiveType::HLL) { if (!column.column_type.__isset.len) { - remove_dir(header_dir); - OLAP_LOG_WARNING("CHAR or VARCHAR should specify length. [type=%d]", - column.column_type.type); + LOG(WARNING) << "CHAR or VARCHAR should specify length. 
type=" << column.column_type.type; return OLAP_ERR_INPUT_PARAMETER_ERROR; } } uint32_t length = FieldInfo::get_field_length_by_type( column.column_type.type, column.column_type.len); - header.mutable_column(i)->set_length(length); - - header.mutable_column(i)->set_index_length(length); + header->mutable_column(i)->set_length(length); + header->mutable_column(i)->set_index_length(length); if (column.column_type.type == TPrimitiveType::VARCHAR || column.column_type.type == TPrimitiveType::HLL) { if (!column.column_type.__isset.index_len) { - header.mutable_column(i)->set_index_length(10); + header->mutable_column(i)->set_index_length(10); } else { - header.mutable_column(i)->set_index_length(column.column_type.index_len); + header->mutable_column(i)->set_index_length(column.column_type.index_len); } } - if (!column.is_key) { - header.mutable_column(i)->set_is_key(false); + header->mutable_column(i)->set_is_key(false); string aggregation_type; EnumToString(TAggregationType, column.aggregation_type, aggregation_type); - header.mutable_column(i)->set_aggregation(aggregation_type); + header->mutable_column(i)->set_aggregation(aggregation_type); } else { ++key_count; - header.add_selectivity(1); - header.mutable_column(i)->set_is_key(true); - header.mutable_column(i)->set_aggregation("NONE"); + header->add_selectivity(1); + header->mutable_column(i)->set_is_key(true); + header->mutable_column(i)->set_aggregation("NONE"); } - if (column.__isset.default_value) { - header.mutable_column(i)->set_default_value(column.default_value); + header->mutable_column(i)->set_default_value(column.default_value); } - if (column.__isset.is_allow_null) { - header.mutable_column(i)->set_is_allow_null(column.is_allow_null); + header->mutable_column(i)->set_is_allow_null(column.is_allow_null); } else { - header.mutable_column(i)->set_is_allow_null(false); + header->mutable_column(i)->set_is_allow_null(false); } - if (column.__isset.is_bloom_filter_column) { - header.mutable_column(i)->set_is_bf_column(column.is_bloom_filter_column); + header->mutable_column(i)->set_is_bf_column(column.is_bloom_filter_column); has_bf_columns = true; } - ++i; } if (true == is_schema_change_table){ - /* - * schema change时,新表的next_unique_id应ä¿è¯å¤§äºŽç­‰äºŽæ—§è¡¨çš„next_unique_id, - * 以防止出现先删除列,åŽåŠ ç›¸åŒåˆ—的两次linked schema change的列unique_id出现混淆 + /* + * for schema change, next_unique_id of new olap table should be greater than + * next_unique_id of old olap table * */ - header.set_next_column_unique_id(next_unique_id); + header->set_next_column_unique_id(next_unique_id); } else { - header.set_next_column_unique_id(i); + header->set_next_column_unique_id(i); } - if (has_bf_columns && request.tablet_schema.__isset.bloom_filter_fpp) { - header.set_bf_fpp(request.tablet_schema.bloom_filter_fpp); + header->set_bf_fpp(request.tablet_schema.bloom_filter_fpp); } - if (key_count < request.tablet_schema.short_key_column_count) { - OLAP_LOG_WARNING("short key num should not large than key num. " - "[key_num=%d short_key_num=%d]", - key_count, request.tablet_schema.short_key_column_count); - remove_dir(header_dir); + LOG(WARNING) << "short key num should not large than key num. 
" + << "key_num=" << key_count << " short_key_num=" << request.tablet_schema.short_key_column_count; return OLAP_ERR_INPUT_PARAMETER_ERROR; } - // set restore mode - if (request.__isset.in_restore_mode && request.in_restore_mode) { - header.set_in_restore_mode(true); - } - - // save header file - header.set_creation_time(time(NULL)); - header.set_cumulative_layer_point(-1); - res = header.save(); - if (res != OLAP_SUCCESS) { - remove_dir(header_dir); - return res; - } - - *header_path = header_path_tmp; - return res; + header->set_creation_time(time(NULL)); + header->set_cumulative_layer_point(-1); + header->set_tablet_id(request.tablet_id); + header->set_schema_hash(request.tablet_schema.schema_hash); + header->set_shard(shard); + return OLAP_SUCCESS; } OLAPStatus OLAPEngine::_check_existed_or_else_create_dir(const string& path) { if (check_dir_existed(path)) { - OLAP_LOG_WARNING("failed to create the dir that existed. [path='%s']", path.c_str()); + LOG(WARNING) << "failed to create the dir that existed. [path='" << path << "']"; return OLAP_ERR_CANNOT_CREATE_DIR; } @@ -1503,9 +2017,9 @@ void OLAPEngine::_cancel_unfinished_schema_change() { AlterTabletType type; for (const auto& tablet_instance : _tablet_map) { - for (SmartOLAPTable olap_table : tablet_instance.second.table_arr) { + for (OLAPTablePtr olap_table : tablet_instance.second.table_arr) { if (olap_table.get() == NULL) { - OLAP_LOG_WARNING("get empty SmartOLAPTable. [tablet_id=%ld]", tablet_instance.first); + OLAP_LOG_WARNING("get empty OLAPTablePtr. [tablet_id=%ld]", tablet_instance.first); continue; } @@ -1515,7 +2029,7 @@ void OLAPEngine::_cancel_unfinished_schema_change() { continue; } - SmartOLAPTable new_olap_table = get_table(tablet_id, schema_hash); + OLAPTablePtr new_olap_table = get_table(tablet_id, schema_hash, false); if (new_olap_table.get() == NULL) { OLAP_LOG_WARNING("the table referenced by schema change cannot be found. " "schema change cancelled. [tablet='%s']", @@ -1523,10 +2037,11 @@ void OLAPEngine::_cancel_unfinished_schema_change() { continue; } + // PALO-3741. Upon restart, it should not clear schema change request. new_olap_table->set_schema_change_status( - ALTER_TABLE_FAILED, olap_table->schema_hash(), -1); - olap_table->set_schema_change_status( ALTER_TABLE_FAILED, new_olap_table->schema_hash(), -1); + olap_table->set_schema_change_status( + ALTER_TABLE_FAILED, olap_table->schema_hash(), -1); OLAP_LOG_DEBUG("cancel unfinished schema change. [tablet='%s']", olap_table->full_name().c_str()); ++canceled_num; @@ -1536,4 +2051,784 @@ void OLAPEngine::_cancel_unfinished_schema_change() { OLAP_LOG_INFO("finish to cancel unfinished schema change! 
[canceled_num=%lu]", canceled_num); } +void OLAPEngine::start_delete_unused_index() { + _gc_mutex.lock(); + + for (auto it = _gc_files.begin(); it != _gc_files.end();) { + if (it->first->is_in_use()) { + ++it; + } else { + delete it->first; + vector files = it->second; + remove_files(files); + it = _gc_files.erase(it); + } + } + + _gc_mutex.unlock(); +} + +void OLAPEngine::add_unused_index(Rowset* olap_index) { + _gc_mutex.lock(); + + auto it = _gc_files.find(olap_index); + if (it == _gc_files.end()) { + vector files; + int32_t rowset_id = olap_index->rowset_id(); + for (size_t seg_id = 0; seg_id < olap_index->num_segments(); ++seg_id) { + string index_file = olap_index->construct_index_file_path(rowset_id, seg_id); + files.push_back(index_file); + + string data_file = olap_index->construct_data_file_path(rowset_id, seg_id); + files.push_back(data_file); + } + _gc_files[olap_index] = files; + } + + _gc_mutex.unlock(); +} + +OLAPStatus OLAPEngine::_create_init_version( + OLAPTablePtr olap_table, const TCreateTabletReq& request) { + OLAPStatus res = OLAP_SUCCESS; + + if (request.version < 1) { + OLAP_LOG_WARNING("init version of tablet should at least 1."); + return OLAP_ERR_CE_CMD_PARAMS_ERROR; + } else { + Version init_base_version(0, request.version); + res = create_init_version( + request.tablet_id, request.tablet_schema.schema_hash, + init_base_version, request.version_hash); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to create init base version. [res=%d version=%ld]", + res, request.version); + return res; + } + + Version init_delta_version(request.version + 1, request.version + 1); + res = create_init_version( + request.tablet_id, request.tablet_schema.schema_hash, + init_delta_version, 0); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to create init delta version. [res=%d version=%ld]", + res, request.version + 1); + return res; + } + } + + olap_table->obtain_header_wrlock(); + olap_table->set_cumulative_layer_point(request.version + 1); + res = olap_table->save_header(); + olap_table->release_header_lock(); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to save header. [table=" << olap_table->full_name() << "]"; + } + + return res; +} + +// TODO(zc): refactor this funciton +OLAPStatus OLAPEngine::create_table(const TCreateTabletReq& request) { + OLAPStatus res = OLAP_SUCCESS; + bool is_table_added = false; + + OLAP_LOG_INFO("begin to process create table. [tablet=%ld, schema_hash=%d]", + request.tablet_id, request.tablet_schema.schema_hash); + + PaloMetrics::create_tablet_requests_total.increment(1); + + // 1. Make sure create_table operation is idempotent: + // return success if table with same tablet_id and schema_hash exist, + // false if table with same tablet_id but different schema_hash exist + if (check_tablet_id_exist(request.tablet_id)) { + OLAPTablePtr table = get_table( + request.tablet_id, request.tablet_schema.schema_hash); + if (table.get() != NULL) { + OLAP_LOG_INFO("create table success for table already exist."); + return OLAP_SUCCESS; + } else { + OLAP_LOG_WARNING("table with different schema hash already exists."); + return OLAP_ERR_CE_TABLET_ID_EXIST; + } + } + + // 2. Lock to ensure that all create_table operation execute in serial + static Mutex create_table_lock; + MutexLock auto_lock(&create_table_lock); + + OLAPTablePtr olap_table; + do { + // 3. 
Create table with only header, no deltas + olap_table = create_table(request, NULL, false, NULL); + if (olap_table == NULL) { + res = OLAP_ERR_CE_CMD_PARAMS_ERROR; + OLAP_LOG_WARNING("fail to create olap table. [res=%d]", res); + break; + } + + // 4. Add table to OlapEngine will make it visiable to user + res = add_table( + request.tablet_id, request.tablet_schema.schema_hash, olap_table); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to add table to OLAPEngine. [res=%d]", res); + break; + } + is_table_added = true; + + OLAPTablePtr olap_table_ptr = get_table( + request.tablet_id, request.tablet_schema.schema_hash); + if (olap_table_ptr.get() == NULL) { + res = OLAP_ERR_TABLE_NOT_FOUND; + OLAP_LOG_WARNING("fail to get table. [res=%d]", res); + break; + } + + // 5. Register table into OLAPEngine, so that we can manage table from + // the perspective of root path. + // Example: unregister all tables when a bad disk found. + res = register_table_into_root_path(olap_table_ptr.get()); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to register table into OLAPEngine. [res=%d, root_path=%s]", + res, olap_table_ptr->storage_root_path_name().c_str()); + break; + } + + // 6. Create init version if this is not a restore mode replica and request.version is set + // bool in_restore_mode = request.__isset.in_restore_mode && request.in_restore_mode; + // if (!in_restore_mode && request.__isset.version) { + res = _create_init_version(olap_table_ptr, request); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to create initial version for table. [res=%d]", res); + } + // } + } while (0); + + // 7. clear environment + if (res != OLAP_SUCCESS) { + PaloMetrics::create_tablet_requests_failed.increment(1); + if (is_table_added) { + OLAPStatus status = drop_table( + request.tablet_id, request.tablet_schema.schema_hash); + if (status != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to drop table when create table failed. [res=%d]", res); + } + } else if (NULL != olap_table) { + olap_table->delete_all_files(); + } + } + + OLAP_LOG_INFO("finish to process create table. [res=%d]", res); + return res; +} + +OLAPStatus OLAPEngine::schema_change(const TAlterTabletReq& request) { + OLAP_LOG_INFO("begin to schema change. [base_table=%ld new_table=%ld]", + request.base_tablet_id, request.new_tablet_req.tablet_id); + + PaloMetrics::schema_change_requests_total.increment(1); + + OLAPStatus res = OLAP_SUCCESS; + + SchemaChangeHandler handler; + res = handler.process_alter_table(ALTER_TABLET_SCHEMA_CHANGE, request); + + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("failed to do schema change. " + "[base_table=%ld new_table=%ld] [res=%d]", + request.base_tablet_id, request.new_tablet_req.tablet_id, res); + PaloMetrics::schema_change_requests_failed.increment(1); + return res; + } + + OLAP_LOG_INFO("success to submit schema change. " + "[base_table=%ld new_table=%ld]", + request.base_tablet_id, request.new_tablet_req.tablet_id); + return res; +} + +OLAPStatus OLAPEngine::create_rollup_table(const TAlterTabletReq& request) { + OLAP_LOG_INFO("begin to create rollup table. " + "[base_table=%ld new_table=%ld]", + request.base_tablet_id, request.new_tablet_req.tablet_id); + + PaloMetrics::create_rollup_requests_total.increment(1); + + OLAPStatus res = OLAP_SUCCESS; + + SchemaChangeHandler handler; + res = handler.process_alter_table(ALTER_TABLET_CREATE_ROLLUP_TABLE, request); + + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("failed to do rollup. 
" + "[base_table=%ld new_table=%ld] [res=%d]", + request.base_tablet_id, request.new_tablet_req.tablet_id, res); + PaloMetrics::create_rollup_requests_failed.increment(1); + return res; + } + + OLAP_LOG_INFO("success to create rollup table. " + "[base_table=%ld new_table=%ld] [res=%d]", + request.base_tablet_id, request.new_tablet_req.tablet_id, res); + return res; +} + +AlterTableStatus OLAPEngine::show_alter_table_status( + TTabletId tablet_id, + TSchemaHash schema_hash) { + OLAP_LOG_INFO("begin to process show alter table status. " + "[table=%ld schema_hash=%d]", + tablet_id, schema_hash); + + AlterTableStatus status = ALTER_TABLE_FINISHED; + + OLAPTablePtr table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); + if (table.get() == NULL) { + OLAP_LOG_WARNING("fail to get table. [table=%ld schema_hash=%d]", + tablet_id, schema_hash); + status = ALTER_TABLE_FAILED; + } else { + status = table->schema_change_status().status; + } + + return status; +} + +OLAPStatus OLAPEngine::compute_checksum( + TTabletId tablet_id, + TSchemaHash schema_hash, + TVersion version, + TVersionHash version_hash, + uint32_t* checksum) { + OLAP_LOG_INFO("begin to process compute checksum. " + "[tablet_id=%ld schema_hash=%d version=%ld]", + tablet_id, schema_hash, version); + OLAPStatus res = OLAP_SUCCESS; + + if (checksum == NULL) { + OLAP_LOG_WARNING("invalid output parameter which is null pointer."); + return OLAP_ERR_CE_CMD_PARAMS_ERROR; + } + + OLAPTablePtr tablet = get_table(tablet_id, schema_hash); + if (NULL == tablet.get()) { + OLAP_LOG_WARNING("can't find tablet. [tablet_id=%ld schema_hash=%d]", + tablet_id, schema_hash); + return OLAP_ERR_TABLE_NOT_FOUND; + } + + { + ReadLock rdlock(tablet->get_header_lock_ptr()); + const PDelta* message = tablet->lastest_version(); + if (message == NULL) { + OLAP_LOG_FATAL("fail to get latest version. [tablet_id=%ld]", tablet_id); + return OLAP_ERR_VERSION_NOT_EXIST; + } + + if (message->end_version() == version + && message->version_hash() != version_hash) { + OLAP_LOG_WARNING("fail to check latest version hash. " + "[res=%d tablet_id=%ld version_hash=%ld request_version_hash=%ld]", + res, tablet_id, message->version_hash(), version_hash); + return OLAP_ERR_CE_CMD_PARAMS_ERROR; + } + } + + Reader reader; + ReaderParams reader_params; + reader_params.olap_table = tablet; + reader_params.reader_type = READER_CHECKSUM; + reader_params.version = Version(0, version); + + // ignore float and double type considering to precision lose + for (size_t i = 0; i < tablet->tablet_schema().size(); ++i) { + FieldType type = tablet->get_field_type_by_index(i); + if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) { + continue; + } + + reader_params.return_columns.push_back(i); + } + + res = reader.init(reader_params); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("initiate reader fail. [res=%d]", res); + return res; + } + + RowCursor row; + res = row.init(tablet->tablet_schema(), reader_params.return_columns); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("failed to init row cursor. [res=%d]", res); + return res; + } + row.allocate_memory_for_string_type(tablet->tablet_schema()); + + bool eof = false; + uint32_t row_checksum = 0; + while (true) { + OLAPStatus res = reader.next_row_with_aggregation(&row, &eof); + if (res == OLAP_SUCCESS && eof) { + OLAP_LOG_DEBUG("reader reads to the end."); + break; + } else if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to read in reader. 
[res=%d]", res); + return res; + } + + row_checksum = row.hash_code(row_checksum); + } + + OLAP_LOG_INFO("success to finish compute checksum. [checksum=%u]", row_checksum); + *checksum = row_checksum; + return OLAP_SUCCESS; +} + +OLAPStatus OLAPEngine::cancel_delete(const TCancelDeleteDataReq& request) { + OLAP_LOG_INFO("begin to process cancel delete. [table=%ld version=%ld]", + request.tablet_id, request.version); + + PaloMetrics::cancel_delete_requests_total.increment(1); + + OLAPStatus res = OLAP_SUCCESS; + + // 1. Get all tablets with same tablet_id + list table_list; + res = get_tables_by_id(request.tablet_id, &table_list); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("can't find table. [table=%ld]", request.tablet_id); + return OLAP_ERR_TABLE_NOT_FOUND; + } + + // 2. Remove delete conditions from each tablet. + DeleteConditionHandler cond_handler; + for (OLAPTablePtr temp_table : table_list) { + temp_table->obtain_header_wrlock(); + res = cond_handler.delete_cond(temp_table, request.version, false); + if (res != OLAP_SUCCESS) { + temp_table->release_header_lock(); + OLAP_LOG_WARNING("cancel delete failed. [res=%d table=%s]", + res, temp_table->full_name().c_str()); + break; + } + + res = temp_table->save_header(); + if (res != OLAP_SUCCESS) { + temp_table->release_header_lock(); + OLAP_LOG_WARNING("fail to save header. [res=%d table=%s]", + res, temp_table->full_name().c_str()); + break; + } + temp_table->release_header_lock(); + } + + // Show delete conditions in tablet header. + for (OLAPTablePtr table : table_list) { + cond_handler.log_conds(table); + } + + OLAP_LOG_INFO("finish to process cancel delete. [res=%d]", res); + return res; +} + +OLAPStatus OLAPEngine::delete_data( + const TPushReq& request, + vector* tablet_info_vec) { + OLAP_LOG_INFO("begin to process delete data. [request='%s']", + ThriftDebugString(request).c_str()); + PaloMetrics::delete_requests_total.increment(1); + + OLAPStatus res = OLAP_SUCCESS; + + if (tablet_info_vec == NULL) { + OLAP_LOG_WARNING("invalid output parameter which is null pointer."); + return OLAP_ERR_CE_CMD_PARAMS_ERROR; + } + + // 1. Get all tablets with same tablet_id + OLAPTablePtr table = get_table(request.tablet_id, request.schema_hash); + if (table.get() == NULL) { + OLAP_LOG_WARNING("can't find table. [table=%ld schema_hash=%d]", + request.tablet_id, request.schema_hash); + return OLAP_ERR_TABLE_NOT_FOUND; + } + + // 2. Process delete data by push interface + PushHandler push_handler; + if (request.__isset.transaction_id) { + res = push_handler.process_realtime_push(table, request, PUSH_FOR_DELETE, tablet_info_vec); + } else { + res = push_handler.process(table, request, PUSH_FOR_DELETE, tablet_info_vec); + } + + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to push empty version for delete data. " + "[res=%d table='%s']", + res, table->full_name().c_str()); + PaloMetrics::delete_requests_failed.increment(1); + return res; + } + + OLAP_LOG_INFO("finish to process delete data. 
[res=%d]", res); + return res; +} + +OLAPStatus OLAPEngine::recover_tablet_until_specfic_version( + const TRecoverTabletReq& recover_tablet_req) { + OLAPTablePtr table = get_table(recover_tablet_req.tablet_id, + recover_tablet_req.schema_hash); + if (table == nullptr) { return OLAP_ERR_TABLE_NOT_FOUND; } + RETURN_NOT_OK(table->recover_tablet_until_specfic_version(recover_tablet_req.version, + recover_tablet_req.version_hash)); + return OLAP_SUCCESS; +} + +string OLAPEngine::get_info_before_incremental_clone(OLAPTablePtr tablet, + int64_t committed_version, vector* missing_versions) { + + // get missing versions + tablet->obtain_header_rdlock(); + tablet->get_missing_versions_with_header_locked(committed_version, missing_versions); + + // get least complete version + // prevent lastest version not replaced (if need to rewrite) after node restart + const PDelta* least_complete_version = tablet->least_complete_version(*missing_versions); + if (least_complete_version != NULL) { + // TODO: Used in upgraded. If old Palo version, version can be converted. + Version version(least_complete_version->start_version(), least_complete_version->end_version()); + missing_versions->push_back(version); + LOG(INFO) << "least complete version for incremental clone. table=" << tablet->full_name() << ", " + << "least_complete_version=" << least_complete_version->end_version(); + } + + tablet->release_header_lock(); + LOG(INFO) << "finish to calculate missing versions when clone. [table=" << tablet->full_name() + << " committed_version=" << committed_version << " missing_versions_size=" << missing_versions->size() << "]"; + + // get download path + return tablet->tablet_path() + CLONE_PREFIX; +} + +OLAPStatus OLAPEngine::finish_clone(OLAPTablePtr tablet, const string& clone_dir, + int64_t committed_version, bool is_incremental_clone) { + OLAPStatus res = OLAP_SUCCESS; + vector linked_success_files; + + // clone and compaction operation should be performed sequentially + tablet->obtain_base_compaction_lock(); + tablet->obtain_cumulative_lock(); + + tablet->obtain_push_lock(); + tablet->obtain_header_wrlock(); + do { + // check clone dir existed + if (!check_dir_existed(clone_dir)) { + res = OLAP_ERR_DIR_NOT_EXIST; + OLAP_LOG_WARNING("clone dir not existed when clone. [clone_dir=%s]", + clone_dir.c_str()); + break; + } + + // load src header + string clone_header_file = clone_dir + "/" + std::to_string(tablet->tablet_id()) + ".hdr"; + OLAPHeader clone_header(clone_header_file); + if ((res = clone_header.load_and_init()) != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to load src header when clone. [clone_header_file=%s]", + clone_header_file.c_str()); + break; + } + + // check all files in /clone and /tablet + set clone_files; + if ((res = dir_walk(clone_dir, NULL, &clone_files)) != OLAP_SUCCESS) { + LOG(WARNING) << "failed to dir walk when clone. [clone_dir=" << clone_dir << "]"; + break; + } + + set local_files; + string tablet_dir = tablet->tablet_path(); + if ((res = dir_walk(tablet_dir, NULL, &local_files)) != OLAP_SUCCESS) { + LOG(WARNING) << "failed to dir walk when clone. [tablet_dir=" << tablet_dir << "]"; + break; + } + + // link files from clone dir, if file exists, skip it + for (const string& clone_file : clone_files) { + if (local_files.find(clone_file) != local_files.end()) { + OLAP_LOG_DEBUG("find same file when clone, skip it. 
[table=%s clone_file=%s]", + tablet->full_name().c_str(), clone_file.c_str()); + continue; + } + + string from = clone_dir + "/" + clone_file; + string to = tablet_dir + "/" + clone_file; + LOG(INFO) << "src file:" << from << ", " << "dest file:" << to; + if (link(from.c_str(), to.c_str()) != 0) { + OLAP_LOG_WARNING("fail to create hard link when clone. [from=%s to=%s]", + from.c_str(), to.c_str()); + res = OLAP_ERR_OS_ERROR; + break; + } + linked_success_files.emplace_back(std::move(to)); + } + + if (res != OLAP_SUCCESS) { + break; + } + + if (is_incremental_clone) { + res = OLAPEngine::get_instance()->clone_incremental_data( + tablet, clone_header, committed_version); + } else { + res = OLAPEngine::get_instance()->clone_full_data(tablet, clone_header); + } + + // if full clone success, need to update cumulative layer point + if (!is_incremental_clone && res == OLAP_SUCCESS) { + tablet->set_cumulative_layer_point(clone_header.cumulative_layer_point()); + } + + } while (0); + + // clear linked files if errors happen + if (res != OLAP_SUCCESS) { + remove_files(linked_success_files); + } + tablet->release_header_lock(); + tablet->release_push_lock(); + + tablet->release_cumulative_lock(); + tablet->release_base_compaction_lock(); + + // clear clone dir + boost::filesystem::path clone_dir_path(clone_dir); + boost::filesystem::remove_all(clone_dir_path); + OLAP_LOG_INFO("finish to clone data, clear downloaded data. " + "[table=%s clone_dir=%s clone_res=%d]", + tablet->full_name().c_str(), clone_dir.c_str(), res); + return res; +} + +OLAPStatus OLAPEngine::obtain_shard_path( + TStorageMedium::type storage_medium, std::string* shard_path, OlapStore** store) { + OLAP_LOG_INFO("begin to process obtain root path. [storage_medium=%d]", storage_medium); + OLAPStatus res = OLAP_SUCCESS; + + if (shard_path == NULL) { + OLAP_LOG_WARNING("invalid output parameter which is null pointer."); + return OLAP_ERR_CE_CMD_PARAMS_ERROR; + } + + auto stores = OLAPEngine::get_instance()->get_stores_for_create_table(storage_medium); + if (stores.empty()) { + OLAP_LOG_WARNING("no available disk can be used to create table."); + return OLAP_ERR_NO_AVAILABLE_ROOT_PATH; + } + + uint64_t shard = 0; + res = stores[0]->get_shard(&shard); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to get root path shard. [res=%d]", res); + return res; + } + + stringstream root_path_stream; + root_path_stream << stores[0]->path() << DATA_PREFIX << "/" << shard; + *shard_path = root_path_stream.str(); + *store = stores[0]; + + OLAP_LOG_INFO("success to process obtain root path. [path='%s']", + shard_path->c_str()); + return res; +} + +OLAPStatus OLAPEngine::load_header( + const string& shard_path, + const TCloneReq& request) { + OLAP_LOG_INFO("begin to process load headers. " + "[tablet_id=%ld schema_hash=%d]", + request.tablet_id, request.schema_hash); + OLAPStatus res = OLAP_SUCCESS; + + OlapStore* store = nullptr; + { + // TODO(zc) + try { + auto store_path = + boost::filesystem::path(shard_path).parent_path().parent_path().string(); + store = OLAPEngine::get_instance()->get_store(store_path); + if (store == nullptr) { + LOG(WARNING) << "invalid shard path, path=" << shard_path; + return OLAP_ERR_INVALID_ROOT_PATH; + } + } catch (...) 
{ + LOG(WARNING) << "invalid shard path, path=" << shard_path; + return OLAP_ERR_INVALID_ROOT_PATH; + } + } + + stringstream schema_hash_path_stream; + schema_hash_path_stream << shard_path + << "/" << request.tablet_id + << "/" << request.schema_hash; + res = OLAPEngine::get_instance()->load_one_tablet( + store, + request.tablet_id, request.schema_hash, + schema_hash_path_stream.str()); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to process load headers. [res=%d]", res); + return res; + } + + OLAP_LOG_INFO("success to process load headers."); + return res; +} + +OLAPStatus OLAPEngine::load_header( + OlapStore* store, + const string& shard_path, + TTabletId tablet_id, + TSchemaHash schema_hash) { + OLAP_LOG_INFO("begin to process load headers. [tablet_id=%ld schema_hash=%d]", + tablet_id, schema_hash); + OLAPStatus res = OLAP_SUCCESS; + + stringstream schema_hash_path_stream; + schema_hash_path_stream << shard_path + << "/" << tablet_id + << "/" << schema_hash; + res = OLAPEngine::get_instance()->load_one_tablet( + store, + tablet_id, schema_hash, + schema_hash_path_stream.str()); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to process load headers. [res=%d]", res); + return res; + } + + OLAP_LOG_INFO("success to process load headers."); + return res; +} + +OLAPStatus OLAPEngine::clear_alter_task(const TTabletId tablet_id, + const TSchemaHash schema_hash) { + OLAP_LOG_INFO("begin to process clear alter task. [tablet_id=%ld schema_hash=%d]", + tablet_id, schema_hash); + OLAPTablePtr tablet = get_table(tablet_id, schema_hash); + if (tablet.get() == NULL) { + OLAP_LOG_WARNING("can't find tablet when process clear alter task. ", + "[tablet_id=%ld, schema_hash=%d]", tablet_id, schema_hash); + return OLAP_SUCCESS; + } + + // get schema change info + AlterTabletType type; + TTabletId related_tablet_id; + TSchemaHash related_schema_hash; + vector schema_change_versions; + tablet->obtain_header_rdlock(); + bool ret = tablet->get_schema_change_request( + &related_tablet_id, &related_schema_hash, &schema_change_versions, &type); + tablet->release_header_lock(); + if (!ret) { + return OLAP_SUCCESS; + } else if (!schema_change_versions.empty()) { + OLAP_LOG_WARNING("find alter task unfinished when process clear alter task. ", + "[tablet=%s versions_to_change_size=%d]", + tablet->full_name().c_str(), schema_change_versions.size()); + return OLAP_ERR_PREVIOUS_SCHEMA_CHANGE_NOT_FINISHED; + } + + // clear schema change info + tablet->obtain_header_wrlock(); + tablet->clear_schema_change_request(); + OLAPStatus res = tablet->save_header(); + if (res != OLAP_SUCCESS) { + LOG(FATAL) << "fail to save header. [res=" << res << " tablet='" << tablet->full_name() << "']"; + } else { + LOG(INFO) << "clear alter task on tablet. [tablet='" << tablet->full_name() << "']"; + } + tablet->release_header_lock(); + + // clear related tablet's schema change info + OLAPTablePtr related_table = get_table(related_tablet_id, related_schema_hash); + if (related_table.get() == NULL) { + OLAP_LOG_WARNING("related table not found when process clear alter task. " + "[tablet_id=%ld schema_hash=%d " + "related_tablet_id=%ld related_schema_hash=%d]", + tablet_id, schema_hash, related_tablet_id, related_schema_hash); + } else { + related_table->obtain_header_wrlock(); + related_table->clear_schema_change_request(); + res = related_table->save_header(); + if (res != OLAP_SUCCESS) { + LOG(FATAL) << "fail to save header. 
[res=" << res << " tablet='" + << related_table->full_name() << "']"; + } else { + LOG(INFO) << "clear alter task on tablet. [tablet='" << related_table->full_name() << "']"; + } + related_table->release_header_lock(); + } + + OLAP_LOG_INFO("finish to process clear alter task. [tablet_id=%ld schema_hash=%d]", + related_tablet_id, related_schema_hash); + return OLAP_SUCCESS; +} + +OLAPStatus OLAPEngine::push( + const TPushReq& request, + vector* tablet_info_vec) { + OLAPStatus res = OLAP_SUCCESS; + OLAP_LOG_INFO("begin to process push. [tablet_id=%ld version=%ld]", + request.tablet_id, request.version); + + if (tablet_info_vec == NULL) { + OLAP_LOG_WARNING("invalid output parameter which is null pointer."); + PaloMetrics::push_requests_fail_total.increment(1); + return OLAP_ERR_CE_CMD_PARAMS_ERROR; + } + + OLAPTablePtr olap_table = OLAPEngine::get_instance()->get_table( + request.tablet_id, request.schema_hash); + if (NULL == olap_table.get()) { + OLAP_LOG_WARNING("false to find table. [table=%ld schema_hash=%d]", + request.tablet_id, request.schema_hash); + PaloMetrics::push_requests_fail_total.increment(1); + return OLAP_ERR_TABLE_NOT_FOUND; + } + + PushType type = PUSH_NORMAL; + if (request.push_type == TPushType::LOAD_DELETE) { + type = PUSH_FOR_LOAD_DELETE; + } + + int64_t duration_ns = 0; + PushHandler push_handler; + if (request.__isset.transaction_id) { + { + SCOPED_RAW_TIMER(&duration_ns); + res = push_handler.process_realtime_push(olap_table, request, type, tablet_info_vec); + } + } else { + { + SCOPED_RAW_TIMER(&duration_ns); + res = push_handler.process(olap_table, request, type, tablet_info_vec); + } + } + + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to push delta, table=" << olap_table->full_name().c_str() + << ",cost=" << PrettyPrinter::print(duration_ns, TUnit::TIME_NS); + PaloMetrics::push_requests_fail_total.increment(1); + } else { + LOG(INFO) << "success to push delta, table=" << olap_table->full_name().c_str() + << ",cost=" << PrettyPrinter::print(duration_ns, TUnit::TIME_NS); + PaloMetrics::push_requests_success_total.increment(1); + PaloMetrics::push_request_duration_us.increment(duration_ns / 1000); + PaloMetrics::push_request_write_bytes.increment(push_handler.write_bytes()); + PaloMetrics::push_request_write_rows.increment(push_handler.write_rows()); + } + return res; +} + } // namespace palo diff --git a/be/src/olap/olap_engine.h b/be/src/olap/olap_engine.h index 5c08c5b0f7..e3eaa17432 100644 --- a/be/src/olap/olap_engine.h +++ b/be/src/olap/olap_engine.h @@ -20,51 +20,84 @@ #include #include #include +#include #include #include #include +#include #include #include +#include "agent/status.h" +#include "common/status.h" #include "gen_cpp/AgentService_types.h" +#include "gen_cpp/BackendService_types.h" #include "gen_cpp/MasterService_types.h" +#include "olap/atomic.h" #include "olap/lru_cache.h" #include "olap/olap_common.h" #include "olap/olap_define.h" -#include "olap/olap_rootpath.h" -#include "olap/olap_snapshot.h" #include "olap/olap_table.h" +#include "olap/olap_meta.h" +#include "olap/options.h" namespace palo { -void* load_root_path_thread_callback(void* arg); - class OLAPTable; +class OlapStore; + +struct RootPathInfo { + RootPathInfo(): + capacity(1), + available(0), + data_used_capacity(0), + is_used(false) { } + + std::string path; + int64_t capacity; // 总空间,å•ä½å­—节 + int64_t available; // å¯ç”¨ç©ºé—´ï¼Œå•ä½å­—节 + int64_t data_used_capacity; + bool is_used; // 是å¦å¯ç”¨æ ‡è¯† + TStorageMedium::type storage_medium; // 存储介质类型:SSD|HDD +}; // 
OLAPEngine singleton to manage all Table pointers. // Providing add/drop/get operations. // OLAPEngine instance doesn't own the Table resources, just hold the pointer, // allocation/deallocation must be done outside. class OLAPEngine { - friend void* load_root_path_thread_callback(void* arg); - - DECLARE_SINGLETON(OLAPEngine) public: - // Get table pointer - SmartOLAPTable get_table(TTabletId tablet_id, SchemaHash schema_hash); + OLAPEngine() { } + OLAPEngine(const EngineOptions& options); + ~OLAPEngine(); - OLAPStatus get_tables_by_id(TTabletId tablet_id, std::list* table_list); + static Status open(const EngineOptions& options, OLAPEngine** engine_ptr); + + static void set_instance(OLAPEngine* engine) { + _s_instance = engine; + } + + static OLAPEngine *get_instance() { + return _s_instance; + } + + // Get table pointer + OLAPTablePtr get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table = true); + + OLAPStatus get_tables_by_id(TTabletId tablet_id, std::list* table_list); bool check_tablet_id_exist(TTabletId tablet_id); + OLAPStatus create_table(const TCreateTabletReq& request); + // Create new table for OLAPEngine // // Return OLAPTable * succeeded; Otherwise, return NULL if failed - OLAPTable* create_table(const TCreateTabletReq& request, - const std::string* ref_root_path, - const bool is_schema_change_table, - const SmartOLAPTable ref_olap_table); + OLAPTablePtr create_table(const TCreateTabletReq& request, + const std::string* ref_root_path, + const bool is_schema_change_table, + const OLAPTablePtr ref_olap_table); // Add a table pointer to OLAPEngine // If force, drop the existing table add this new one @@ -73,7 +106,32 @@ public: // OLAP_ERR_TABLE_INSERT_DUPLICATION_ERROR, if find duplication // OLAP_ERR_NOT_INITED, if not inited OLAPStatus add_table(TTabletId tablet_id, SchemaHash schema_hash, - OLAPTable* table, bool force = false); + const OLAPTablePtr& table, bool force = false); + + OLAPStatus add_transaction(TPartitionId partition_id, TTransactionId transaction_id, + TTabletId tablet_id, SchemaHash schema_hash, + const PUniqueId& load_id); + + void delete_transaction(TPartitionId partition_id, TTransactionId transaction_id, + TTabletId tablet_id, SchemaHash schema_hash, + bool delete_from_tablet = true); + + void get_transactions_by_tablet(OLAPTablePtr tablet, int64_t* partition_id, + std::set* transaction_ids); + + bool has_transaction(TPartitionId partition_id, TTransactionId transaction_id, + TTabletId tablet_id, SchemaHash schema_hash); + + OLAPStatus publish_version(const TPublishVersionRequest& publish_version_req, + std::vector* error_tablet_ids); + + void clear_transaction_task(const TTransactionId transaction_id, + const std::vector partition_ids); + + OLAPStatus clone_incremental_data(OLAPTablePtr tablet, OLAPHeader& clone_header, + int64_t committed_version); + + OLAPStatus clone_full_data(OLAPTablePtr tablet, OLAPHeader& clone_header); // Add empty data for OLAPTable // @@ -91,10 +149,10 @@ public: TTabletId tablet_id, SchemaHash schema_hash, bool keep_files = false); // Drop table directly with check schema change info. 
- OLAPStatus _drop_table_directly( - TTabletId tablet_id, TSchemaHash schema_hash, bool keep_files = false); + OLAPStatus _drop_table_directly(TTabletId tablet_id, TSchemaHash schema_hash, bool keep_files = false); + OLAPStatus _drop_table_directly_unlocked(TTabletId tablet_id, TSchemaHash schema_hash, bool keep_files = false); - OLAPStatus drop_tables_on_error_root_path(const std::vector& table_info_vec); + OLAPStatus drop_tables_on_error_root_path(const std::vector& tablet_info_vec); // Prevent schema change executed concurrently. bool try_schema_change_lock(TTabletId tablet_id); @@ -107,27 +165,28 @@ public: OLAPStatus report_tablet_info(TTabletInfo* tablet_info); OLAPStatus report_all_tablets_info(std::map* tablets_info); + void get_tablet_stat(TTabletStatResult& result); + // Instance should be inited from create_instance // MUST NOT be called in other circumstances. - OLAPStatus init(); + OLAPStatus open(); // Clear status(tables, ...) OLAPStatus clear(); void start_clean_fd_cache(); - void start_base_compaction(std::string* last_base_compaction_fs, TTabletId* last_base_compaction_tablet_id); - - // schedule cumulative compaction (ce) by priority - void start_cumulative_priority(); + void perform_cumulative_compaction(); + void perform_base_compaction(); // get cache usage information void get_cache_status(rapidjson::Document* document) const; // Note: only root paths that already exist can be reloaded here, i.e. re-loading a root path registered at startup is allowed, // but re-loading a brand new path is not allowed, because the compaction scheduler info is not fully updated here - void load_root_paths(const OLAPRootPath::RootPathVec& root_paths); + void load_stores(const std::vector& stores); - OLAPStatus load_one_tablet(TTabletId tablet_id, + OLAPStatus load_one_tablet(OlapStore* store, + TTabletId tablet_id, SchemaHash schema_hash, const std::string& schema_hash_path, bool force = false); @@ -143,19 +202,253 @@ public: // clean trash and snapshot files, return the disk usage after cleaning OLAPStatus start_trash_sweep(double *usage); - void add_tablet_to_base_compaction_queue(const TableInfo& tablet_info) { - std::lock_guard l(_base_compaction_queue_lock); - _base_compaction_tablet_queue.push(tablet_info); + std::condition_variable disk_broken_cv; + std::atomic_bool is_report_disk_state_already; + std::atomic_bool is_report_olap_table_already; + + template + std::vector get_stores(); + Status set_cluster_id(int32_t cluster_id); + + // @brief set whether a root_path is usable + void set_store_used_flag(const std::string& root_path, bool is_used); + + // @brief get the info of all root_paths + OLAPStatus get_all_root_path_info(std::vector* root_paths_info); + + void get_all_available_root_path(std::vector* available_paths); + + OLAPStatus register_table_into_root_path(OLAPTable* olap_table); + + // Disk state monitoring. Monitor the unused flag of each root_path: + // when an unused flag is detected, remove the corresponding tablets from memory + // while leaving the data on disk untouched. + // When a disk is unusable but no unused flag is detected, data needs to be reloaded from the root_path. + void start_disk_stat_monitor(); + + // get root path for creating table. The returned vector of root path should be random, + // for avoiding that all the table would be deployed on one disk. 
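// A minimal sketch of the randomization asked for in the comment above, so that new tablets
// are spread across disks instead of piling onto one store. This is not part of the patch;
// pick_stores_for_create() is a hypothetical helper and the shuffle-based policy is only an
// assumption about how such a selection could be done.
#include <algorithm>
#include <iostream>
#include <random>
#include <string>
#include <vector>

// Return usable store paths in random order so callers that always take the first
// entry do not place every new tablet on the same disk.
static std::vector<std::string> pick_stores_for_create(std::vector<std::string> usable_paths) {
    static std::mt19937 gen{std::random_device{}()};
    std::shuffle(usable_paths.begin(), usable_paths.end(), gen);
    return usable_paths;
}

int main() {
    std::vector<std::string> paths = {"/data1", "/data2", "/data3"};
    for (const auto& p : pick_stores_for_create(paths)) {
        std::cout << p << std::endl;
    }
    return 0;
}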
+ std::vector get_stores_for_create_table( + TStorageMedium::type storage_medium); + OlapStore* get_store(const std::string& path); + + uint32_t available_storage_medium_type_count() { + return _available_storage_medium_type_count; } - void add_tablet_to_cumulative_compaction_queue(const TableInfo& tablet_info) { - std::lock_guard l(_cumulative_compaction_queue_lock); - _cumulative_compaction_tablet_queue.push(tablet_info); + int32_t effective_cluster_id() const { + return _effective_cluster_id; } + + uint32_t get_file_system_count() { + return _store_map.size(); + } + + // @brief create a snapshot + // @param tablet_id [in] id of the source tablet + // @param schema_hash [in] schema of the source tablet; together with tablet_id it uniquely identifies one tablet + // @param snapshot_path [out] path of the newly generated snapshot + OLAPStatus make_snapshot( + const TSnapshotRequest& request, + std::string* snapshot_path); + + // @brief release a snapshot + // @param snapshot_path [in] path of the snapshot to be released, only up to the snapshot ID + OLAPStatus release_snapshot(const std::string& snapshot_path); + + // @brief migrate data from one storage medium to another + OLAPStatus storage_medium_migrate( + TTabletId tablet_id, + TSchemaHash schema_hash, + TStorageMedium::type storage_medium); + + void start_delete_unused_index(); + + void add_unused_index(Rowset* olap_index); + + // ######################### ALTER TABLE BEGIN ######################### + // The following interfaces are all about the alter tablet operation; + // the main logic is to generate a new tablet with a different + // schema on the base tablet. + + // Create rollup tablet on base tablet; after create_rollup_table, + // both the base tablet and the new tablet are effective. + // + // @param [in] request specify base tablet, new tablet and its schema + // @return OLAP_SUCCESS if submit success + OLAPStatus create_rollup_table(const TAlterTabletReq& request); + + // Do schema change on tablet; OLAPEngine supports + // add column, drop column, alter column type and order; + // after schema_change, the base tablet is abandoned. + // Note that the two tablets have the same tablet_id but different schema_hash + // + // @param [in] request specify base tablet, new tablet and its schema + // @return OLAP_SUCCESS if submit success + OLAPStatus schema_change(const TAlterTabletReq& request); + + // Show status of all alter table operations. + // + // @param [in] tablet_id & schema_hash specify a tablet + // @return alter table status + AlterTableStatus show_alter_table_status(TTabletId tablet_id, TSchemaHash schema_hash); + + OLAPStatus compute_checksum( + TTabletId tablet_id, + TSchemaHash schema_hash, + TVersion version, + TVersionHash version_hash, + uint32_t* checksum); + + OLAPStatus cancel_delete(const TCancelDeleteDataReq& request); + + // Delete data of the specified tablet according to delete conditions; + // once the delete_data command is submitted successfully, deleted data is no longer visible, + // but it is not actually deleted until delay_delete_time runs out. 
+ // + // @param [in] request specify tablet and delete conditions + // @param [out] tablet_info_vec return tablet lastest status, which + // include version info, row count, data size, etc + // @return OLAP_SUCCESS if submit delete_data success + virtual OLAPStatus delete_data( + const TPushReq& request, + std::vector* tablet_info_vec); + + OLAPStatus recover_tablet_until_specfic_version( + const TRecoverTabletReq& recover_tablet_req); + + // before doing incremental clone, + // need to calculate tablet's download dir and tablet's missing versions + virtual std::string get_info_before_incremental_clone(OLAPTablePtr tablet, + int64_t committed_version, std::vector* missing_versions); + + virtual OLAPStatus finish_clone(OLAPTablePtr tablet, const std::string& clone_dir, + int64_t committed_version, bool is_incremental_clone); + + // Obtain shard path for new tablet. + // + // @param [out] shard_path choose an available root_path to clone new tablet + // @return error code + virtual OLAPStatus obtain_shard_path( + TStorageMedium::type storage_medium, + std::string* shared_path, + OlapStore** store); + + // Load new tablet to make it effective. + // + // @param [in] root_path specify root path of new tablet + // @param [in] request specify new tablet info + // @return OLAP_SUCCESS if load tablet success + virtual OLAPStatus load_header( + const std::string& shard_path, const TCloneReq& request); + virtual OLAPStatus load_header( + OlapStore* store, + const std::string& shard_path, + TTabletId tablet_id, + TSchemaHash schema_hash); + + OLAPStatus clear_alter_task(const TTabletId tablet_id, + const TSchemaHash schema_hash); + OLAPStatus push( + const TPushReq& request, + std::vector* tablet_info_vec); + +private: + OLAPStatus check_all_root_path_cluster_id(); + + bool _used_disk_not_enough(uint32_t unused_num, uint32_t total_num); + + OLAPStatus _get_root_path_capacity( + const std::string& root_path, + int64_t* data_used, + int64_t* disk_available); + + OLAPStatus _config_root_path_unused_flag_file( + const std::string& root_path, + std::string* unused_flag_file); + + void _delete_tables_on_unused_root_path(); + + void _update_storage_medium_type_count(); + + OLAPStatus _judge_and_update_effective_cluster_id(int32_t cluster_id); + + OLAPStatus _calc_snapshot_id_path( + const OLAPTablePtr& olap_table, + std::string* out_path); + + std::string _get_schema_hash_full_path( + const OLAPTablePtr& ref_olap_table, + const std::string& location) const; + + std::string _get_header_full_path( + const OLAPTablePtr& ref_olap_table, + const std::string& schema_hash_path) const; + + void _update_header_file_info( + const std::vector& shortest_version_entity, + OLAPHeader* header); + + OLAPStatus _link_index_and_data_files( + const std::string& header_path, + const OLAPTablePtr& ref_olap_table, + const std::vector& version_entity_vec); + + OLAPStatus _copy_index_and_data_files( + const std::string& header_path, + const OLAPTablePtr& ref_olap_table, + std::vector& version_entity_vec); + + OLAPStatus _create_snapshot_files( + const OLAPTablePtr& ref_olap_table, + const TSnapshotRequest& request, + std::string* snapshot_path); + + OLAPStatus _create_incremental_snapshot_files( + const OLAPTablePtr& ref_olap_table, + const TSnapshotRequest& request, + std::string* snapshot_path); + + OLAPStatus _prepare_snapshot_dir(const OLAPTablePtr& ref_olap_table, + std::string* snapshot_id_path); + + OLAPStatus _append_single_delta( + const TSnapshotRequest& request, + OlapStore* store); + + std::string 
_construct_index_file_path( + const std::string& tablet_path_prefix, + const Version& version, + VersionHash version_hash, + int32_t rowset_id, int32_t segment) const; + + std::string _construct_data_file_path( + const std::string& tablet_path_prefix, + const Version& version, + VersionHash version_hash, + int32_t rowset_id, int32_t segment) const; + + OLAPStatus _generate_new_header( + OlapStore* store, + const uint64_t new_shard, + const OLAPTablePtr& tablet, + const std::vector& version_entity_vec, OLAPHeader* new_olap_header); + + OLAPStatus _create_hard_link(const std::string& from_path, const std::string& to_path); + + OLAPStatus _start_bg_worker(); + + OLAPStatus _create_init_version(OLAPTablePtr olap_table, const TCreateTabletReq& request); + private: struct TableInstances { - MutexLock schema_change_lock; - std::list table_arr; + Mutex schema_change_lock; + std::list table_arr; + }; + + enum CompactionType { + BASE_COMPACTION = 1, + CUMULATIVE_COMPACTION = 2 }; struct CompactionCandidate { @@ -189,7 +482,7 @@ private: typedef std::map tablet_map_t; typedef std::map file_system_task_count_t; - SmartOLAPTable _get_table_with_no_lock(TTabletId tablet_id, SchemaHash schema_hash); + OLAPTablePtr _get_table_with_no_lock(TTabletId tablet_id, SchemaHash schema_hash); // walk the directory specified by root; return the names of all sub-directories via dirs and the names of all files via files OLAPStatus _dir_walk(const std::string& root, @@ -197,44 +490,107 @@ private: std::set* files); // scan the directory and load tablets - OLAPStatus _load_tables(const std::string& tables_root_path); + OLAPStatus _load_store(OlapStore* store); - OLAPStatus _create_new_table_header_file(const TCreateTabletReq& request, - const std::string& root_path, - std::string* header_path, + OLAPStatus _create_new_table_header(const TCreateTabletReq& request, + OlapStore* store, const bool is_schema_change_table, - const SmartOLAPTable ref_olap_table); + const OLAPTablePtr ref_olap_table, + OLAPHeader* header); OLAPStatus _check_existed_or_else_create_dir(const std::string& path); - void _select_candidate(); + OLAPTablePtr _find_best_tablet_to_compaction(CompactionType compaction_type); + bool _can_do_compaction(OLAPTablePtr table); void _cancel_unfinished_schema_change(); - static OLAPStatus _spawn_load_root_path_thread(pthread_t* thread, const std::string& root_path); - OLAPStatus _do_sweep( const std::string& scan_root, const time_t& local_tm_now, const uint32_t expire); - RWLock _tablet_map_lock; + void _build_tablet_info(OLAPTablePtr olap_table, TTabletInfo* tablet_info); + void _build_tablet_stat(); + + EngineOptions _options; + std::mutex _store_lock; + std::map _store_map; + uint32_t _available_storage_medium_type_count; + + int32_t _effective_cluster_id; + bool _is_all_cluster_id_exist; + bool _is_drop_tables; + + // percentage of error disks; if it exceeds the configured value, the engine has to stop running + uint32_t _min_percentage_of_error_disk; + + RWMutex _tablet_map_lock; tablet_map_t _tablet_map; + RWMutex _transaction_tablet_map_lock; + using TxnKey = std::pair; //transaction_id, partition_id; + std::map>> _transaction_tablet_map; size_t _global_table_id; Cache* _file_descriptor_lru_cache; Cache* _index_stream_lru_cache; uint32_t _max_base_compaction_task_per_disk; - std::queue _base_compaction_tablet_queue; - std::mutex _base_compaction_queue_lock; uint32_t _max_cumulative_compaction_task_per_disk; - std::queue _cumulative_compaction_tablet_queue; - std::mutex _cumulative_compaction_queue_lock; - MutexLock _fs_task_mutex; + Mutex _fs_task_mutex; file_system_task_count_t _fs_base_compaction_task_num_map; std::vector 
_cumulative_compaction_candidate; - std::vector _cumulative_compaction_disk_stat; - std::map _disk_id_map; - DISALLOW_COPY_AND_ASSIGN(OLAPEngine); + // cache to save tablets' statistics, such as data size and row + // TODO(cmy): for now, this is a naive implementation + std::map _tablet_stat_cache; + // last update time of tablet stat cache + int64_t _tablet_stat_cache_update_time_ms; + + static OLAPEngine* _s_instance; + + // snapshot + Mutex _snapshot_mutex; + uint64_t _snapshot_base_id; + + std::unordered_map> _gc_files; + Mutex _gc_mutex; + + // Thread functions + + // base compaction thread process function + void* _base_compaction_thread_callback(void* arg); + + // garbage sweep thread process function. clear snapshot and trash folder + void* _garbage_sweeper_thread_callback(void* arg); + + // delete table with io error process function + void* _disk_stat_monitor_thread_callback(void* arg); + + // unused index process function + void* _unused_index_thread_callback(void* arg); + + // cumulative process function + void* _cumulative_compaction_thread_callback(void* arg); + + // clean file descriptors cache + void* _fd_cache_clean_callback(void* arg); + + // thread to monitor snapshot expiry + std::thread _garbage_sweeper_thread; + + // thread to monitor disk stat + std::thread _disk_stat_monitor_thread; + + // thread to monitor unused index + std::thread _unused_index_thread; + + // thread to run base compaction + std::vector _base_compaction_threads; + + // thread to check cumulative + std::vector _cumulative_compaction_threads; + + std::thread _fd_cache_clean_thread; + + static atomic_t _s_request_number; }; } // namespace palo diff --git a/be/src/olap/olap_header.cpp b/be/src/olap/olap_header.cpp index f81b261fb1..43d98980e7 100644 --- a/be/src/olap/olap_header.cpp +++ b/be/src/olap/olap_header.cpp @@ -46,7 +46,7 @@ namespace palo { // Construct version graph(using adjacency list) from header's information. static OLAPStatus construct_version_graph( - const RepeatedPtrField& versions_in_header, + const RepeatedPtrField& versions_in_header, vector* version_graph, unordered_map* vertex_helper_map); @@ -61,7 +61,7 @@ static OLAPStatus add_version_to_graph(const Version& version, // Delete version from graph, it is called near the end of delete_version static OLAPStatus delete_version_from_graph( - const RepeatedPtrField& versions_in_header, + const RepeatedPtrField& versions_in_header, const Version& version, vector* version_graph, unordered_map* vertex_helper_map); @@ -77,38 +77,76 @@ OLAPHeader::~OLAPHeader() { Clear(); } -OLAPStatus OLAPHeader::load() { +void OLAPHeader::change_file_version_to_delta() { + // convert FileVersionMessage to PDelta and PRowSet in initialization. + // FileVersionMessage is used in previous code, and PDelta and PRowSet + // is used in streaming load branch. 
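// To make the comment above concrete, the standalone example below models the shape of
// converting a legacy per-file version record into the newer delta-plus-rowset layout.
// It is a sketch only and not part of this patch: the plain structs, their fields, and the
// rowset id 0 choice are simplified assumptions, not the real protobuf messages or
// _convert_file_version_to_delta().
#include <cstdint>
#include <vector>

struct LegacyFileVersion {
    int32_t start_version;
    int32_t end_version;
    int64_t version_hash;
    int32_t num_segments;
    int64_t num_rows;
};

struct SimpleRowSet {
    int32_t rowset_id;
    int32_t num_segments;
    int64_t num_rows;
};

struct SimpleDelta {
    int32_t start_version;
    int32_t end_version;
    int64_t version_hash;
    std::vector<SimpleRowSet> rowsets;
};

// One legacy record becomes one delta that carries a single rowset.
static SimpleDelta convert_file_version_to_delta(const LegacyFileVersion& v) {
    SimpleDelta delta;
    delta.start_version = v.start_version;
    delta.end_version = v.end_version;
    delta.version_hash = v.version_hash;
    delta.rowsets.push_back(SimpleRowSet{0, v.num_segments, v.num_rows});
    return delta;
}

int main() {
    std::vector<LegacyFileVersion> legacy = {{0, 5, 123, 2, 1000}, {6, 6, 456, 1, 10}};
    std::vector<SimpleDelta> deltas;
    for (const auto& v : legacy) {
        deltas.push_back(convert_file_version_to_delta(v));  // mirrors the loop that follows
    }
    return deltas.size() == 2 ? 0 : 1;
}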
+ for (int i = 0; i < file_version_size(); ++i) { + PDelta* delta = add_delta(); + _convert_file_version_to_delta(file_version(i), delta); + } + + clear_file_version(); +} + +OLAPStatus OLAPHeader::init() { + clear_version_graph(&_version_graph, &_vertex_helper_map); + if (construct_version_graph(delta(), + &_version_graph, + &_vertex_helper_map) != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to construct version graph."); + return OLAP_ERR_OTHER_ERROR; + } + if (_file_name == "") { + stringstream file_stream; + file_stream << tablet_id() << ".hdr"; + _file_name = file_stream.str(); + } + return OLAP_SUCCESS; +} + +OLAPStatus OLAPHeader::load_and_init() { + // check the tablet_path is not empty + if (_file_name == "") { + LOG(WARNING) << "file_path is empty for header"; + return OLAP_ERR_DIR_NOT_EXIST; + } + FileHeader file_header; FileHandler file_handler; if (file_handler.open(_file_name.c_str(), O_RDONLY) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to open index file. [file='%s']", _file_name.c_str()); + LOG(WARNING) << "fail to open index file. [file='" << _file_name << "']"; return OLAP_ERR_IO_ERROR; } // In file_header.unserialize(), it validates file length, signature, checksum of protobuf. if (file_header.unserialize(&file_handler) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to unserialize header. [path='%s']", _file_name.c_str()); + LOG(WARNING) << "fail to unserialize header. [path='" << _file_name << "']"; return OLAP_ERR_PARSE_PROTOBUF_ERROR; } try { CopyFrom(file_header.message()); } catch (...) { - OLAP_LOG_WARNING("fail to copy protocol buffer object. [path='%s']", _file_name.c_str()); + LOG(WARNING) << "fail to copy protocol buffer object. [path='" << _file_name << "']"; return OLAP_ERR_PARSE_PROTOBUF_ERROR; } - clear_version_graph(&_version_graph, &_vertex_helper_map); + if (file_version_size() != 0) { + // convert FileVersionMessage to PDelta and PRowSet in initialization. + for (int i = 0; i < file_version_size(); ++i) { + PDelta* delta = add_delta(); + _convert_file_version_to_delta(file_version(i), delta); + } - if (construct_version_graph(file_version(), - &_version_graph, - &_vertex_helper_map) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to construct version graph."); - return OLAP_ERR_OTHER_ERROR; + clear_file_version(); + OLAPStatus res = save(); + if (res != OLAP_SUCCESS) { + LOG(FATAL) << "failed to remove file version in initialization"; + } } - - return OLAP_SUCCESS; + return init(); } OLAPStatus OLAPHeader::save() { @@ -116,75 +154,92 @@ OLAPStatus OLAPHeader::save() { } OLAPStatus OLAPHeader::save(const string& file_path) { + // check the tablet_path is not empty + if (file_path == "") { + LOG(WARNING) << "file_path is empty for header"; + return OLAP_ERR_DIR_NOT_EXIST; + } + FileHeader file_header; FileHandler file_handler; if (file_handler.open_with_mode(file_path.c_str(), O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to open header file. [file='%s']", file_path.c_str()); + LOG(WARNING) << "fail to open header file. [file='" << file_path << "']"; return OLAP_ERR_IO_ERROR; } try { file_header.mutable_message()->CopyFrom(*this); } catch (...) { - OLAP_LOG_WARNING("fail to copy protocol buffer object. [path='%s']", file_path.c_str()); + LOG(WARNING) << "fail to copy protocol buffer object. [path='" << file_path << "']"; return OLAP_ERR_OTHER_ERROR; } if (file_header.prepare(&file_handler) != OLAP_SUCCESS || file_header.serialize(&file_handler) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to serialize to file header. 
[path='%s']", file_path.c_str()); + LOG(WARNING) << "fail to serialize to file header. [path='" << file_path << "']"; return OLAP_ERR_SERIALIZE_PROTOBUF_ERROR; } return OLAP_SUCCESS; } -OLAPStatus OLAPHeader::add_version( - Version version, - VersionHash version_hash, - uint32_t num_segments, - time_t max_timestamp, - int64_t index_size, - int64_t data_size, - int64_t num_rows, - const std::vector>* column_statistics) { +OLAPStatus OLAPHeader::add_version(Version version, VersionHash version_hash, + int32_t rowset_id, int32_t num_segments, + int64_t index_size, int64_t data_size, int64_t num_rows, + bool empty, const std::vector* column_statistics) { // Check whether version is valid. if (version.first > version.second) { - OLAP_LOG_WARNING("the version is not valid. [version='%d,%d']", - version.first, - version.second); + LOG(WARNING) << "the version is not valid." + << "version=" << version.first << "-" << version.second; return OLAP_ERR_HEADER_ADD_VERSION; } - // Check whether the version is existed. - for (int i = 0; i < file_version_size(); ++i) { - if (file_version(i).start_version() == version.first - && file_version(i).end_version() == version.second) { - OLAP_LOG_WARNING("the version is existed. [version='%d,%d']", - version.first, - version.second); - return OLAP_ERR_HEADER_ADD_VERSION; + int delta_id = 0; + for (int i = 0; i < delta_size(); ++i) { + if (delta(i).start_version() == version.first + && delta(i).end_version() == version.second) { + for (const PRowSet& rowset : delta(i).rowset()) { + if (rowset.rowset_id() == rowset_id) { + LOG(WARNING) << "the version is existed." + << "version=" << version.first << ", " + << version.second; + return OLAP_ERR_HEADER_ADD_VERSION; + } + } + delta_id = i; + break; } } + // if rowset_id is greater or equal than zero, it is used + // to streaming load + // Try to add version to protobuf. 
+ PDelta* new_delta = nullptr; try { - FileVersionMessage* new_version = add_file_version(); - new_version->set_num_segments(num_segments); - new_version->set_start_version(version.first); - new_version->set_end_version(version.second); - new_version->set_version_hash(version_hash); - new_version->set_max_timestamp(max_timestamp); - new_version->set_index_size(index_size); - new_version->set_data_size(data_size); - new_version->set_num_rows(num_rows); - new_version->set_creation_time(time(NULL)); + if (rowset_id == -1 || rowset_id == 0) { + // snapshot will use rowset_id which equals minus one + new_delta = add_delta(); + new_delta->set_start_version(version.first); + new_delta->set_end_version(version.second); + new_delta->set_version_hash(version_hash); + new_delta->set_creation_time(time(NULL)); + } else { + new_delta = const_cast(&delta(delta_id)); + } + PRowSet* new_rowset = new_delta->add_rowset(); + new_rowset->set_rowset_id(rowset_id); + new_rowset->set_num_segments(num_segments); + new_rowset->set_index_size(index_size); + new_rowset->set_data_size(data_size); + new_rowset->set_num_rows(num_rows); + new_rowset->set_empty(empty); if (NULL != column_statistics) { for (size_t i = 0; i < column_statistics->size(); ++i) { - ColumnPruning *column_pruning = - new_version->mutable_delta_pruning()->add_column_pruning(); + ColumnPruning *column_pruning = + new_rowset->add_column_pruning(); column_pruning->set_min(column_statistics->at(i).first->to_string()); column_pruning->set_max(column_statistics->at(i).second->to_string()); column_pruning->set_null_flag(column_statistics->at(i).first->is_null()); @@ -205,12 +260,220 @@ OLAPStatus OLAPHeader::add_version( return OLAP_SUCCESS; } +OLAPStatus OLAPHeader::add_pending_version( + int64_t partition_id, int64_t transaction_id, + const std::vector* delete_conditions) { + for (int i = 0; i < pending_delta_size(); ++i) { + if (pending_delta(i).transaction_id() == transaction_id) { + LOG(WARNING) << "pending delta already exists in header." + << "transaction_id: " << transaction_id; + return OLAP_ERR_HEADER_ADD_PENDING_DELTA; + } + } + + try { + PPendingDelta* new_pending_delta = add_pending_delta(); + new_pending_delta->set_partition_id(partition_id); + new_pending_delta->set_transaction_id(transaction_id); + new_pending_delta->set_creation_time(time(NULL)); + + if (delete_conditions != nullptr) { + DeleteConditionMessage* del_cond = new_pending_delta->mutable_delete_condition(); + del_cond->set_version(0); + for (const string& condition : *delete_conditions) { + del_cond->add_sub_conditions(condition); + OLAP_LOG_INFO("store one sub-delete condition. [condition='%s' transaction_id=%ld]", + condition.c_str(), transaction_id); + } + } + + } catch (...) { + LOG(WARNING) << "fail to add pending rowset to header protobf"; + return OLAP_ERR_HEADER_ADD_PENDING_DELTA; + } + + return OLAP_SUCCESS; +} + +OLAPStatus OLAPHeader::add_pending_rowset( + int64_t transaction_id, int32_t num_segments, + int32_t pending_rowset_id, const PUniqueId& load_id, + bool empty, const std::vector* column_statistics) { + + int32_t delta_id = 0; + for (int32_t i = 0; i < pending_delta_size(); ++i) { + const PPendingDelta& delta = pending_delta(i); + if (delta.transaction_id() == transaction_id) { + delta_id = i; + for (int j = 0; j < delta.pending_rowset_size(); ++j) { + const PPendingRowSet& pending_rowset = delta.pending_rowset(j); + if (pending_rowset.pending_rowset_id() == pending_rowset_id) { + LOG(WARNING) << "pending rowset already exists in header." 
+ << "transaction_id:" << transaction_id << ", " + << "pending_rowset_id: " << pending_rowset_id; + return OLAP_ERR_HEADER_ADD_PENDING_DELTA; + } + } + } + } + + try { + PPendingRowSet* new_pending_rowset + = const_cast(pending_delta(delta_id)).add_pending_rowset(); + new_pending_rowset->set_pending_rowset_id(pending_rowset_id); + new_pending_rowset->set_num_segments(num_segments); + new_pending_rowset->mutable_load_id()->set_hi(load_id.hi()); + new_pending_rowset->mutable_load_id()->set_lo(load_id.lo()); + new_pending_rowset->set_empty(empty); + if (NULL != column_statistics) { + for (size_t i = 0; i < column_statistics->size(); ++i) { + ColumnPruning *column_pruning = + new_pending_rowset->add_column_pruning(); + column_pruning->set_min(column_statistics->at(i).first->to_string()); + column_pruning->set_max(column_statistics->at(i).second->to_string()); + column_pruning->set_null_flag(column_statistics->at(i).first->is_null()); + } + } + + } catch (...) { + OLAP_LOG_WARNING("fail to add pending rowset to protobf"); + return OLAP_ERR_HEADER_ADD_PENDING_DELTA; + } + + return OLAP_SUCCESS; +} + +OLAPStatus OLAPHeader::add_incremental_version(Version version, VersionHash version_hash, + int32_t rowset_id, int32_t num_segments, + int64_t index_size, int64_t data_size, int64_t num_rows, + bool empty, const std::vector* column_statistics) { + // Check whether version is valid. + if (version.first != version.second) { + OLAP_LOG_WARNING("the incremental version is not valid. [version=%d]", version.first); + return OLAP_ERR_HEADER_ADD_INCREMENTAL_VERSION; + } + + // Check whether the version is existed. + int32_t delta_id = 0; + for (int i = 0; i < incremental_delta_size(); ++i) { + const PDelta& incre_delta = incremental_delta(i); + if (incre_delta.start_version() == version.first) { + delta_id = i; + for (int j = 0; j < incre_delta.rowset_size(); ++j) { + const PRowSet& incremental_rowset = incre_delta.rowset(j); + if (incremental_rowset.rowset_id() == rowset_id) { + LOG(WARNING) << "rowset already exists in header." + << "version: " << version.first << "-" << version.second << "," + << "rowset_id: " << rowset_id; + return OLAP_ERR_HEADER_ADD_PENDING_DELTA; + } + } + } + } + + // Try to add version to protobuf. + try { + PDelta* new_incremental_delta = nullptr; + if (rowset_id == 0) { + new_incremental_delta = add_incremental_delta(); + new_incremental_delta->set_start_version(version.first); + new_incremental_delta->set_end_version(version.second); + new_incremental_delta->set_version_hash(version_hash); + new_incremental_delta->set_creation_time(time(NULL)); + } else { + new_incremental_delta = const_cast(&incremental_delta(delta_id)); + } + PRowSet* new_incremental_rowset = new_incremental_delta->add_rowset(); + new_incremental_rowset->set_rowset_id(rowset_id); + new_incremental_rowset->set_num_segments(num_segments); + new_incremental_rowset->set_index_size(index_size); + new_incremental_rowset->set_data_size(data_size); + new_incremental_rowset->set_num_rows(num_rows); + new_incremental_rowset->set_empty(empty); + if (NULL != column_statistics) { + for (size_t i = 0; i < column_statistics->size(); ++i) { + ColumnPruning *column_pruning = + new_incremental_rowset->add_column_pruning(); + column_pruning->set_min(column_statistics->at(i).first->to_string()); + column_pruning->set_max(column_statistics->at(i).second->to_string()); + column_pruning->set_null_flag(column_statistics->at(i).first->is_null()); + } + } + } catch (...) 
{ + OLAP_LOG_WARNING("add incremental version to protobf error"); + return OLAP_ERR_HEADER_ADD_INCREMENTAL_VERSION; + } + + return OLAP_SUCCESS; +} + +void OLAPHeader::add_delete_condition(const DeleteConditionMessage& delete_condition, + int64_t version) { + // check whether condition exist + DeleteConditionMessage* del_cond = NULL; + int i = 0; + for (; i < delete_data_conditions_size(); i++) { + DeleteConditionMessage temp = delete_data_conditions().Get(i); + if (temp.version() == version) { + break; + } + } + + // clear existed condition + if (i < delete_data_conditions_size()) { + del_cond = mutable_delete_data_conditions(i); + del_cond->clear_sub_conditions(); + } else { + del_cond = add_delete_data_conditions(); + del_cond->set_version(version); + } + + for (const string& condition : delete_condition.sub_conditions()) { + del_cond->add_sub_conditions(condition); + } + OLAP_LOG_INFO("add delete condition. [version=%d]", version); +} + +const PPendingDelta* OLAPHeader::get_pending_delta(int64_t transaction_id) const { + for (int i = 0; i < pending_delta_size(); i++) { + if (pending_delta(i).transaction_id() == transaction_id) { + return &pending_delta(i); + } + } + return nullptr; +} + +const PPendingRowSet* OLAPHeader::get_pending_rowset(int64_t transaction_id, int32_t pending_rowset_id) const { + for (int i = 0; i < pending_delta_size(); i++) { + if (pending_delta(i).transaction_id() == transaction_id) { + const PPendingDelta& delta = pending_delta(i); + for (int j = 0; j < delta.pending_rowset_size(); ++j) { + const PPendingRowSet& pending_rowset = delta.pending_rowset(j); + if (pending_rowset.pending_rowset_id() == pending_rowset_id) { + return &pending_rowset; + } + } + } + } + return nullptr; +} + +const PDelta* OLAPHeader::get_incremental_version(Version version) const { + for (int i = 0; i < incremental_delta_size(); i++) { + if (incremental_delta(i).start_version() == version.first + && incremental_delta(i).end_version() == version.second) { + return &incremental_delta(i); + } + } + return nullptr; +} + OLAPStatus OLAPHeader::delete_version(Version version) { // Find the version that need to be deleted. int index = -1; - for (int i = 0; i < file_version_size(); ++i) { - if (file_version(i).start_version() == version.first - && file_version(i).end_version() == version.second) { + for (int i = 0; i < delta_size(); ++i) { + if (delta(i).start_version() == version.first + && delta(i).end_version() == version.second) { index = i; break; } @@ -218,8 +481,8 @@ OLAPStatus OLAPHeader::delete_version(Version version) { // Delete version from protobuf. if (index != -1) { - RepeatedPtrField* version_ptr = mutable_file_version(); - for (int i = index; i < file_version_size() - 1; ++i) { + RepeatedPtrField* version_ptr = mutable_delta(); + for (int i = index; i < delta_size() - 1; ++i) { version_ptr->SwapElements(i, i + 1); } @@ -227,8 +490,7 @@ OLAPStatus OLAPHeader::delete_version(Version version) { } // Atomic delete is not supported now. - if (delete_version_from_graph(file_version(), - version, + if (delete_version_from_graph(delta(), version, &_version_graph, &_vertex_helper_map) != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to delete version from graph. 
[version='%d-%d']", @@ -242,18 +504,59 @@ OLAPStatus OLAPHeader::delete_version(Version version) { OLAPStatus OLAPHeader::delete_all_versions() { clear_file_version(); + clear_delta(); + clear_pending_delta(); + clear_incremental_delta(); clear_version_graph(&_version_graph, &_vertex_helper_map); - if (construct_version_graph(file_version(), + if (construct_version_graph(delta(), &_version_graph, &_vertex_helper_map) != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to construct version graph."); return OLAP_ERR_OTHER_ERROR; } - return OLAP_SUCCESS; } +void OLAPHeader::delete_pending_delta(int64_t transaction_id) { + int index = -1; + for (int i = 0; i < pending_delta_size(); ++i) { + if (pending_delta(i).transaction_id() == transaction_id) { + index = i; + break; + } + } + + if (index != -1) { + RepeatedPtrField* pending_delta_ptr = mutable_pending_delta(); + for (int i = index; i < pending_delta_size() - 1; ++i) { + pending_delta_ptr->SwapElements(i, i + 1); + } + + pending_delta_ptr->RemoveLast(); + } +} + +void OLAPHeader::delete_incremental_delta(Version version) { + int index = -1; + for (int i = 0; i < incremental_delta_size(); ++i) { + if (incremental_delta(i).start_version() == version.first + && incremental_delta(i).end_version() == version.second) { + index = i; + break; + } + } + + if (index != -1) { + RepeatedPtrField* version_ptr = mutable_incremental_delta(); + for (int i = index; i < incremental_delta_size() - 1; ++i) { + version_ptr->SwapElements(i, i + 1); + } + + version_ptr->RemoveLast(); + } +} + // This function is called when base-compaction, cumulative-compaction, quering. // we use BFS algorithm to get the shortest version path. OLAPStatus OLAPHeader::select_versions_to_span(const Version& target_version, @@ -388,86 +691,125 @@ OLAPStatus OLAPHeader::select_versions_to_span(const Version& target_version, return OLAP_SUCCESS; } -const FileVersionMessage* OLAPHeader::get_lastest_delta_version() const { - if (file_version_size() == 0) { - return NULL; +const PDelta* OLAPHeader::get_lastest_delta_version() const { + if (delta_size() == 0) { + return nullptr; } - const FileVersionMessage* max_version = NULL; - for (int i = file_version_size() - 1; i >= 0; --i) { - if (file_version(i).start_version() == file_version(i).end_version()) { - if (max_version == NULL) { - max_version = &file_version(i); - } else if (file_version(i).start_version() > max_version->start_version()) { - max_version = &file_version(i); + const PDelta* max_delta = nullptr; + for (int i = delta_size() - 1; i >= 0; --i) { + if (delta(i).start_version() == delta(i).end_version()) { + if (max_delta == nullptr) { + max_delta = &delta(i); + } else if (delta(i).start_version() > max_delta->start_version()) { + max_delta = &delta(i); } } } - - return max_version; + if (max_delta != nullptr) { + LOG(INFO) << "max_delta:" << max_delta->start_version() << "," + << max_delta->end_version(); + } + return max_delta; } -const FileVersionMessage* OLAPHeader::get_latest_version() const { - if (file_version_size() == 0) { - return NULL; +const PDelta* OLAPHeader::get_lastest_version() const { + if (delta_size() == 0) { + return nullptr; } - const FileVersionMessage* max_version = NULL; - for (int i = file_version_size() - 1; i >= 0; --i) { - if (max_version == NULL) { - max_version = &file_version(i); - } else if (file_version(i).end_version() > max_version->end_version()) { - max_version = &file_version(i); - } else if (file_version(i).end_version() == max_version->end_version() - && file_version(i).start_version() == 
file_version(i).end_version()) { - max_version = &file_version(i); + const PDelta* max_delta = nullptr; + for (int i = delta_size() - 1; i >= 0; --i) { + if (max_delta == nullptr) { + max_delta = &delta(i); + } else if (delta(i).end_version() > max_delta->end_version()) { + max_delta = &delta(i); + } else if (delta(i).end_version() == max_delta->end_version() + && delta(i).start_version() == delta(i).end_version()) { + max_delta = &delta(i); } } - - return max_version; + return max_delta; } -const FileVersionMessage* OLAPHeader::get_base_version() const { - if (file_version_size() == 0) { - return NULL; +Version OLAPHeader::get_latest_version() const { + auto delta = get_lastest_version(); + return {delta->start_version(), delta->end_version()}; +} + +const PDelta* OLAPHeader::get_delta(int index) const { + if (delta_size() == 0) { + return nullptr; } - const FileVersionMessage* base_version = NULL; - for (int i = 0; i < file_version_size(); ++i) { - if (file_version(i).start_version() == 0) { - base_version = &file_version(i); - break; + return &delta(index); +} + +void OLAPHeader::_convert_file_version_to_delta(const FileVersionMessage& version, + PDelta* delta) { + delta->set_start_version(version.start_version()); + delta->set_end_version(version.end_version()); + delta->set_version_hash(version.version_hash()); + delta->set_creation_time(version.creation_time()); + + PRowSet* rowset = delta->add_rowset(); + rowset->set_rowset_id(-1); + rowset->set_num_segments(version.num_segments()); + rowset->set_index_size(version.index_size()); + rowset->set_data_size(version.data_size()); + rowset->set_num_rows(version.num_rows()); + if (version.has_delta_pruning()) { + for (int i = 0; i < version.delta_pruning().column_pruning_size(); ++i) { + ColumnPruning* column_pruning = rowset->add_column_pruning(); + *column_pruning = version.delta_pruning().column_pruning(i); } } - return base_version; } -const uint32_t OLAPHeader::get_compaction_nice_estimate() const{ - uint32_t nice = 0; +const uint32_t OLAPHeader::get_cumulative_compaction_score() const{ + uint32_t score = 0; bool base_version_exists = false; const int32_t point = cumulative_layer_point(); - for (int i = file_version_size() - 1; i >= 0; --i) { - if (file_version(i).start_version() >= point) { - nice++; + for (int i = delta_size() - 1; i >= 0; --i) { + if (delta(i).start_version() >= point) { + score++; } - if (file_version(i).start_version() == 0) { + if (delta(i).start_version() == 0) { base_version_exists = true; } } - nice = nice < config::cumulative_compaction_num_singleton_deltas ? 0 : nice; + score = score < config::cumulative_compaction_num_singleton_deltas ? 0 : score; - // baseä¸å­˜åœ¨å¯èƒ½æ˜¯tablet正在åšalter table,先ä¸é€‰å®ƒï¼Œè®¾nice=0 - return base_version_exists ? nice : 0; + // baseä¸å­˜åœ¨å¯èƒ½æ˜¯tablet正在åšalter table,先ä¸é€‰å®ƒï¼Œè®¾score=0 + return base_version_exists ? score : 0; +} + +const uint32_t OLAPHeader::get_base_compaction_score() const{ + uint32_t score = 0; + const int32_t point = cumulative_layer_point(); + bool base_version_exists = false; + for (int i = delta_size() - 1; i >= 0; --i) { + if (delta(i).end_version() < point) { + score++; + } + if (delta(i).start_version() == 0) { + base_version_exists = true; + } + } + score = score < config::base_compaction_num_cumulative_deltas ? 0 : score; + + // baseä¸å­˜åœ¨å¯èƒ½æ˜¯tablet正在åšalter table,先ä¸é€‰å®ƒï¼Œè®¾score=0 + return base_version_exists ? 
score : 0; } const OLAPStatus OLAPHeader::version_creation_time(const Version& version, int64_t* creation_time) const { - if (0 == file_version_size()) { + if (delta_size() == 0) { return OLAP_ERR_VERSION_NOT_EXIST; } - for (int i = file_version_size() - 1; i >= 0; --i) { - const FileVersionMessage& temp = file_version(i); + for (int i = delta_size() - 1; i >= 0; --i) { + const PDelta& temp = delta(i); if (temp.start_version() == version.first && temp.end_version() == version.second) { *creation_time = temp.creation_time(); return OLAP_SUCCESS; @@ -487,7 +829,7 @@ const OLAPStatus OLAPHeader::version_creation_time(const Version& version, // Construct version graph(using adjacency list) from header's information. static OLAPStatus construct_version_graph( - const RepeatedPtrField& versions_in_header, + const RepeatedPtrField& versions_in_header, vector* version_graph, unordered_map* vertex_helper_map) { if (versions_in_header.size() == 0) { @@ -562,7 +904,7 @@ static OLAPStatus clear_version_graph(vector* version_graph, SAFE_DELETE(it->edges); } version_graph->clear(); - + return OLAP_SUCCESS; } @@ -604,7 +946,7 @@ static OLAPStatus add_version_to_graph(const Version& version, // Delete version from graph, it is called near the end of delete_version static OLAPStatus delete_version_from_graph( - const RepeatedPtrField& versions_in_header, + const RepeatedPtrField& versions_in_header, const Version& version, vector* version_graph, unordered_map* vertex_helper_map) { @@ -702,4 +1044,18 @@ static OLAPStatus add_vertex_to_graph(int vertex_value, return OLAP_SUCCESS; } +const PDelta* OLAPHeader::get_base_version() const { + if (delta_size() == 0) { + return nullptr; + } + + for (int i = 0; i < delta_size(); ++i) { + if (delta(i).start_version() == 0) { + return &delta(i); + } + } + + return nullptr; +} + } // namespace palo diff --git a/be/src/olap/olap_header.h b/be/src/olap/olap_header.h index 62c8c01709..541eb0d6c2 100644 --- a/be/src/olap/olap_header.h +++ b/be/src/olap/olap_header.h @@ -22,6 +22,7 @@ #include #include "gen_cpp/olap_file.pb.h" +#include "gen_cpp/Types_types.h" #include "olap/olap_common.h" #include "olap/olap_define.h" @@ -29,21 +30,27 @@ namespace palo { // Class for managing olap table header. class OLAPHeader : public OLAPHeaderMessage { public: + explicit OLAPHeader() : + _support_reverse_version(false) {} + + // for compatible header file explicit OLAPHeader(const std::string& file_name) : _file_name(file_name), _support_reverse_version(false) {} virtual ~OLAPHeader(); - // Loads the header from disk, returning true on success. - // In load(), we will validate olap header file, which mainly include + // Loads the header from disk and init, returning true on success. + // In load_and_init(), we will validate olap header file, which mainly include // tablet schema, delta version and so on. - OLAPStatus load(); + OLAPStatus load_and_init(); // Saves the header to disk, returning true on success. OLAPStatus save(); OLAPStatus save(const std::string& file_path); + OLAPStatus init(); + // Return the file name of the heade. std::string file_name() const { return _file_name; @@ -51,19 +58,34 @@ public: // Adds a new version to the header. Do not use the proto's // add_version() directly. 
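Stepping back to the two score helpers introduced above, get_cumulative_compaction_score() and get_base_compaction_score() each make a single pass over the delta list: deltas starting at or after the cumulative layer point feed the cumulative score, deltas ending before that point feed the base score, and either score collapses to zero when it falls under its configured threshold or when no base delta (start_version == 0) exists. A compact stand-alone sketch of that computation, with the config thresholds passed in explicitly for this illustration:

#include <cstdint>
#include <utility>
#include <vector>

// Each element is one delta's inclusive [start_version, end_version] range.
using VersionRange = std::pair<int64_t, int64_t>;

struct CompactionScores { uint32_t cumulative; uint32_t base; };

// Mirrors get_cumulative_compaction_score()/get_base_compaction_score():
// deltas starting at or after the cumulative layer point count toward the
// cumulative score, deltas ending before it count toward the base score, and
// a score below its threshold, or a header without a base delta, becomes 0.
CompactionScores compaction_scores(const std::vector<VersionRange>& deltas,
                                   int64_t cumulative_layer_point,
                                   uint32_t cumulative_threshold,
                                   uint32_t base_threshold) {
    CompactionScores s{0, 0};
    bool base_exists = false;
    for (const VersionRange& d : deltas) {
        if (d.first >= cumulative_layer_point) ++s.cumulative;
        if (d.second < cumulative_layer_point) ++s.base;
        if (d.first == 0) base_exists = true;   // base version present
    }
    if (s.cumulative < cumulative_threshold) s.cumulative = 0;
    if (s.base < base_threshold) s.base = 0;
    if (!base_exists) s.cumulative = s.base = 0;
    return s;
}

The add_version() family of declarations that the comment above refers to follows.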
- OLAPStatus add_version( - Version version, - VersionHash version_hash, - uint32_t num_segments, - time_t max_timestamp, - int64_t index_size, - int64_t data_size, - int64_t num_rows, - const std::vector>* column_statistics = nullptr); + OLAPStatus add_version(Version version, VersionHash version_hash, + int32_t rowset_id, int32_t num_segments, + int64_t index_size, int64_t data_size, int64_t num_rows, + bool empty, const std::vector* column_statistics); + + OLAPStatus add_pending_version(int64_t partition_id, int64_t transaction_id, + const std::vector* delete_conditions); + OLAPStatus add_pending_rowset(int64_t transaction_id, int32_t num_segments, + int32_t pending_rowset_id, const PUniqueId& load_id, + bool empty, const std::vector* column_statistics); + + // add incremental rowset into header like "9-9" "10-10", for incremental cloning + OLAPStatus add_incremental_version(Version version, VersionHash version_hash, + int32_t rowset_id, int32_t num_segments, + int64_t index_size, int64_t data_size, int64_t num_rows, + bool empty, const std::vector* column_statistics); + + void add_delete_condition(const DeleteConditionMessage& delete_condition, int64_t version); + + const PPendingDelta* get_pending_delta(int64_t transaction_id) const; + const PPendingRowSet* get_pending_rowset(int64_t transaction_id, int32_t pending_rowset_id) const; + const PDelta* get_incremental_version(Version version) const; // Deletes a version from the header. OLAPStatus delete_version(Version version); OLAPStatus delete_all_versions(); + void delete_pending_delta(int64_t transaction_id); + void delete_incremental_delta(Version version); // Constructs a canonical file name (without path) for the header. // eg "DailyUnitStats_PRIMARY.hdr" @@ -84,16 +106,24 @@ public: virtual OLAPStatus select_versions_to_span(const Version& target_version, std::vector* span_versions); - const FileVersionMessage* get_lastest_delta_version() const; - const FileVersionMessage* get_latest_version() const; - const FileVersionMessage* get_base_version() const; - const uint32_t get_compaction_nice_estimate() const; + const PDelta* get_lastest_delta_version() const; + const PDelta* get_lastest_version() const; + Version get_latest_version() const; + const PDelta* get_delta(int index) const; + const PDelta* get_base_version() const; + const uint32_t get_cumulative_compaction_score() const; + const uint32_t get_base_compaction_score() const; const OLAPStatus version_creation_time(const Version& version, int64_t* creation_time) const; + int file_delta_size() const { + return delta_size(); + } + void change_file_version_to_delta(); private: // Compute schema hash(all fields name and type, index name and its field // names) using lzo_adler32 function. OLAPStatus _compute_schema_hash(SchemaHash* schema_hash); + void _convert_file_version_to_delta(const FileVersionMessage& version, PDelta* delta); // full path of olap header file std::string _file_name; @@ -115,7 +145,7 @@ private: // vertex value --> vertex_index of _version_graph // It is easy to find vertex index according to vertex value. 
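The _vertex_helper_map declared immediately below, together with _version_graph, backs select_versions_to_span(): every delta behaves like an edge between two version-boundary vertices, and a breadth-first search returns the shortest chain of deltas covering the requested range. A self-contained sketch of that idea, assuming (for this illustration only) the common convention that a delta spanning [a, b] links vertex a to vertex b + 1; the exact vertex encoding lives in graph code not shown in this hunk:

#include <cstdint>
#include <map>
#include <queue>
#include <utility>
#include <vector>

using Version = std::pair<int64_t, int64_t>;   // inclusive [start, end]

// Returns the shortest sequence of deltas whose ranges concatenate to cover
// [target.first, target.second], or an empty vector if no such chain exists.
std::vector<Version> shortest_span(const std::vector<Version>& deltas,
                                   const Version& target) {
    // A delta [a, b] is an edge from vertex a to vertex b + 1.
    std::map<int64_t, std::vector<Version>> edges;
    for (const Version& d : deltas) {
        edges[d.first].push_back(d);
    }

    const int64_t start = target.first;
    const int64_t goal = target.second + 1;
    std::map<int64_t, Version> parent_edge;    // vertex -> delta used to reach it
    std::queue<int64_t> frontier;
    frontier.push(start);
    parent_edge[start] = {start, start - 1};   // sentinel, never emitted

    while (!frontier.empty() && parent_edge.count(goal) == 0) {
        int64_t v = frontier.front();
        frontier.pop();
        for (const Version& d : edges[v]) {
            int64_t next = d.second + 1;
            if (parent_edge.count(next) == 0) {
                parent_edge[next] = d;
                frontier.push(next);
            }
        }
    }
    if (parent_edge.count(goal) == 0) {
        return {};                             // the range cannot be covered
    }
    std::vector<Version> path;
    for (int64_t v = goal; v != start; v = parent_edge[v].first) {
        path.push_back(parent_edge[v]);
    }
    return {path.rbegin(), path.rend()};
}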
std::unordered_map _vertex_helper_map; - + DISALLOW_COPY_AND_ASSIGN(OLAPHeader); }; diff --git a/be/src/olap/olap_header_manager.cpp b/be/src/olap/olap_header_manager.cpp old mode 100644 new mode 100755 diff --git a/be/src/olap/olap_index.cpp b/be/src/olap/olap_index.cpp index 0f9803d769..f83278210f 100644 --- a/be/src/olap/olap_index.cpp +++ b/be/src/olap/olap_index.cpp @@ -32,595 +32,6 @@ using std::string; using std::vector; namespace palo { -#define TABLE_PARAM_VALIDATE() \ - do { \ - if (!_index_loaded) { \ - OLAP_LOG_WARNING("fail to find, index is not loaded. [table=%ld schema_hash=%d]", \ - _table->tablet_id(), \ - _table->schema_hash()); \ - return OLAP_ERR_NOT_INITED; \ - } \ - } while (0); - -#define POS_PARAM_VALIDATE(pos) \ - do { \ - if (NULL == pos) { \ - OLAP_LOG_WARNING("fail to find, NULL position parameter."); \ - return OLAP_ERR_INPUT_PARAMETER_ERROR; \ - } \ - } while (0); - -#define SLICE_PARAM_VALIDATE(slice) \ - do { \ - if (NULL == slice) { \ - OLAP_LOG_WARNING("fail to find, NULL slice parameter."); \ - return OLAP_ERR_INPUT_PARAMETER_ERROR; \ - } \ - } while (0); - -OLAPIndex::OLAPIndex(OLAPTable* table, - Version version, - VersionHash version_hash, - bool delete_flag, - uint32_t num_segments, - time_t max_timestamp) : - _table(table), - _version(version), - _delete_flag(delete_flag), - _max_timestamp(max_timestamp), - _num_segments(num_segments), - _version_hash(version_hash), - _current_num_rows_per_row_block(0), - _inited_column_statistics(false), - _column_statistics( - _table->num_key_fields(), std::pair(NULL, NULL)) { - const RowFields& tablet_schema = _table->tablet_schema(); - _short_key_length = 0; - _new_short_key_length = 0; - _short_key_buf = NULL; - - for (size_t i = 0; i < _table->num_short_key_fields(); ++i) { - _short_key_info_list.push_back(tablet_schema[i]); - _short_key_length += tablet_schema[i].index_length + 1;// 1 for null byte - if (tablet_schema[i].type == OLAP_FIELD_TYPE_CHAR || - tablet_schema[i].type == OLAP_FIELD_TYPE_VARCHAR) { - _new_short_key_length += sizeof(StringSlice) + 1; - } else { - _new_short_key_length += tablet_schema[i].index_length + 1; - } - } - - _index_loaded = false; - _ref_count = 0; - _header_file_name = _table->header_file_name(); -} - -OLAPIndex::~OLAPIndex() { - delete [] _short_key_buf; - _current_file_handler.close(); - - if (_inited_column_statistics) { - for (size_t i = 0; i < _column_statistics.size(); ++i) { - SAFE_DELETE(_column_statistics[i].first); - SAFE_DELETE(_column_statistics[i].second); - } - } - - _seg_pb_map.clear(); -} - -void OLAPIndex::acquire() { - atomic_inc(&_ref_count); -} - -int64_t OLAPIndex::ref_count() { - return _ref_count; -} - -void OLAPIndex::release() { - atomic_dec(&_ref_count); -} - -bool OLAPIndex::is_in_use() { - return _ref_count > 0; -} - -// you can not use OLAPIndex after delete_all_files(), or else unknown behavior occurs. -void OLAPIndex::delete_all_files() { - for (uint32_t seg_id = 0; seg_id < _num_segments; ++seg_id) { - // get full path for one segment - string index_path = _construct_index_file_path(_version, _version_hash, seg_id); - string data_path = _construct_data_file_path(_version, _version_hash, seg_id); - - if (remove(index_path.c_str()) != 0) { - OLAP_LOG_WARNING("fail to delete index file. [err='%m' path='%s']", index_path.c_str()); - } - - if (remove(data_path.c_str()) != 0) { - OLAP_LOG_WARNING("fail to delete data file. 
[err='%m' path='%s']", data_path.c_str()); - } - } -} - -OLAPStatus OLAPIndex::set_column_statistics( - const std::vector>& column_statistics) { - if (_inited_column_statistics) { - return OLAP_SUCCESS; - } - - if (column_statistics.size() != _column_statistics.size()) { - OLAP_LOG_WARNING("fail to set delta pruning![column statistics size=%d:%d]", - _column_statistics.size(), column_statistics.size()); - return OLAP_ERR_INDEX_DELTA_PRUNING; - } - - for (size_t i = 0; i < _column_statistics.size(); ++i) { - _column_statistics[i].first = WrapperField::create(_table->tablet_schema()[i]); - if (_column_statistics[i].first == NULL) { - OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i); - return OLAP_ERR_MALLOC_ERROR; - } - - _column_statistics[i].second = WrapperField::create(_table->tablet_schema()[i]); - if (_column_statistics[i].second == NULL) { - OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i); - return OLAP_ERR_MALLOC_ERROR; - } - } - - for (size_t i = 0; i < _column_statistics.size(); ++i) { - _column_statistics[i].first->copy(column_statistics[i].first); - _column_statistics[i].second->copy(column_statistics[i].second); - } - - _inited_column_statistics = true; - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPIndex::set_column_statistics_from_string( - std::vector > &column_statistics_string, - std::vector &has_null_flags) { - if (_inited_column_statistics) { - return OLAP_SUCCESS; - } - - if (column_statistics_string.size() != _column_statistics.size()) { - OLAP_LOG_WARNING("fail to set delta pruning![column statistics size=%d:%d]", - _column_statistics.size(), column_statistics_string.size()); - return OLAP_ERR_INDEX_DELTA_PRUNING; - } - - for (size_t i = 0; i < _column_statistics.size(); ++i) { - _column_statistics[i].first = WrapperField::create(_table->tablet_schema()[i]); - if (_column_statistics[i].first == NULL) { - OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i); - return OLAP_ERR_MALLOC_ERROR; - } - - _column_statistics[i].second = WrapperField::create(_table->tablet_schema()[i]); - if (_column_statistics[i].second == NULL) { - OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i); - return OLAP_ERR_MALLOC_ERROR; - } - } - - OLAPStatus res = OLAP_SUCCESS; - for (size_t i = 0; i < _column_statistics.size(); ++i) { - res = _column_statistics[i].first->from_string( - const_cast(column_statistics_string[i].first.c_str())); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to init field from string.[string=%s]", - column_statistics_string[i].first.c_str()); - return res; - } - if (has_null_flags[i]) { - //[min, max] -> [NULL, max] - _column_statistics[i].first->set_null(); - } - res = _column_statistics[i].second->from_string( - const_cast(column_statistics_string[i].second.c_str())); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("fail to init field from string.[string=%s]", - column_statistics_string[i].second.c_str()); - return res; - } - } - - _inited_column_statistics = true; - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPIndex::load() { - OLAPStatus res = OLAP_ERR_INDEX_LOAD_ERROR; - boost::lock_guard guard(_index_load_lock); - - if (_index_loaded) { - return OLAP_SUCCESS; - } - - if (_num_segments == 0) { - OLAP_LOG_WARNING("fail to load index, segments number is 0."); - return res; - } - - if (_index.init(_short_key_length, _new_short_key_length, - _table->num_short_key_fields(), &_short_key_info_list) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create MemIndex. 
[num_segment=%d]", _num_segments); - return res; - } - - // for each segment - for (uint32_t seg_id = 0; seg_id < _num_segments; ++seg_id) { - if (COLUMN_ORIENTED_FILE == _table->data_file_type()) { - string seg_path = _table->construct_data_file_path(_version, _version_hash, seg_id); - if (OLAP_SUCCESS != (res = load_pb(seg_path.c_str(), seg_id))) { - OLAP_LOG_WARNING("faile to load pb structures. [seg_path='%s']", seg_path.c_str()); - _check_io_error(res); - return res; - } - } - - // get full path for one segment - string path = _table->construct_index_file_path(_version, _version_hash, seg_id); - if ((res = _index.load_segment(path.c_str(), &_current_num_rows_per_row_block)) - != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to load segment. [path='%s']", path.c_str()); - _check_io_error(res); - return res; - } - } - - _index_loaded = true; - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPIndex::load_pb(const char* file, uint32_t seg_id) { - OLAPStatus res = OLAP_SUCCESS; - - FileHeader seg_file_header; - FileHandler seg_file_handler; - res = seg_file_handler.open_with_cache(file, O_RDONLY); - if (OLAP_SUCCESS != res) { - OLAP_LOG_WARNING("failed to open segment file. [err=%d, file=%s]", res, file); - return res; - } - - res = seg_file_header.unserialize(&seg_file_handler); - if (OLAP_SUCCESS != res) { - seg_file_handler.close(); - OLAP_LOG_WARNING("fail to unserialize header. [err=%d, path='%s']", res, file); - return res; - } - - _seg_pb_map[seg_id] = seg_file_header; - seg_file_handler.close(); - return OLAP_SUCCESS; -} - -bool OLAPIndex::index_loaded() { - return _index_loaded; -} - -OLAPStatus OLAPIndex::validate() { - OLAPStatus res = OLAP_SUCCESS; - - for (uint32_t seg_id = 0; seg_id < _num_segments; ++seg_id) { - FileHeader index_file_header; - FileHeader data_file_header; - - // get full path for one segment - string index_path = _table->construct_index_file_path(_version, _version_hash, seg_id); - string data_path = _table->construct_data_file_path(_version, _version_hash, seg_id); - - // 检查index文件头 - if ((res = index_file_header.validate(index_path)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("validate index file error. [file='%s']", index_path.c_str()); - _check_io_error(res); - return res; - } - - // 检查data文件头 - if ((res = data_file_header.validate(data_path)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("validate data file error. [file='%s']", data_path.c_str()); - _check_io_error(res); - return res; - } - } - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPIndex::find_row_block(const RowCursor& key, - RowCursor* helper_cursor, - bool find_last, - RowBlockPosition* pos) const { - TABLE_PARAM_VALIDATE(); - POS_PARAM_VALIDATE(pos); - - // 将这部分逻辑从memindex移出æ¥ï¼Œè¿™æ ·å¯ä»¥å¤ç”¨find。 - OLAPIndexOffset offset = _index.find(key, helper_cursor, find_last); - if (offset.offset > 0) { - offset.offset = offset.offset - 1; - } else { - offset.offset = 0; - } - - if (find_last) { - OLAPIndexOffset next_offset = _index.next(offset); - if (!(next_offset == _index.end())) { - offset = next_offset; - } - } - - return _index.get_row_block_position(offset, pos); -} - -OLAPStatus OLAPIndex::find_short_key(const RowCursor& key, - RowCursor* helper_cursor, - bool find_last, - RowBlockPosition* pos) const { - TABLE_PARAM_VALIDATE(); - POS_PARAM_VALIDATE(pos); - - // 由于find会从å‰ä¸€ä¸ªsegment找起,如果å‰ä¸€ä¸ªsegment中æ°å¥½æ²¡æœ‰è¯¥key, - // 就用å‰ç§»åŽç§»æ¥ç§»åЍsegmentçš„ä½ç½®. 
- OLAPIndexOffset offset = _index.find(key, helper_cursor, find_last); - if (offset.offset > 0) { - offset.offset = offset.offset - 1; - - OLAPIndexOffset next_offset = _index.next(offset); - if (!(next_offset == _index.end())) { - offset = next_offset; - } - } - - OLAP_LOG_DEBUG("[seg='%d', offset='%d']", offset.segment, offset.offset); - return _index.get_row_block_position(offset, pos); -} - -OLAPStatus OLAPIndex::get_row_block_entry(const RowBlockPosition& pos, EntrySlice* entry) const { - TABLE_PARAM_VALIDATE(); - SLICE_PARAM_VALIDATE(entry); - - return _index.get_entry(_index.get_offset(pos), entry); -} - -OLAPStatus OLAPIndex::find_first_row_block(RowBlockPosition* position) const { - TABLE_PARAM_VALIDATE(); - POS_PARAM_VALIDATE(position); - - return _index.get_row_block_position(_index.find_first(), position); -} - -OLAPStatus OLAPIndex::find_last_row_block(RowBlockPosition* position) const { - TABLE_PARAM_VALIDATE(); - POS_PARAM_VALIDATE(position); - - return _index.get_row_block_position(_index.find_last(), position); -} - -OLAPStatus OLAPIndex::find_next_row_block(RowBlockPosition* pos, bool* eof) const { - TABLE_PARAM_VALIDATE(); - POS_PARAM_VALIDATE(pos); - POS_PARAM_VALIDATE(eof); - - OLAPIndexOffset current = _index.get_offset(*pos); - *eof = false; - - OLAPIndexOffset next = _index.next(current); - if (next == _index.end()) { - *eof = true; - return OLAP_ERR_INDEX_EOF; - } - - return _index.get_row_block_position(next, pos); -} - -OLAPStatus OLAPIndex::find_mid_point(const RowBlockPosition& low, - const RowBlockPosition& high, - RowBlockPosition* output, - uint32_t* dis) const { - *dis = compute_distance(low, high); - if (*dis >= _index.count()) { - return OLAP_ERR_INDEX_EOF; - } else { - *output = low; - if (advance_row_block(*dis / 2, output) != OLAP_SUCCESS) { - return OLAP_ERR_INDEX_EOF; - } - - return OLAP_SUCCESS; - } -} - -OLAPStatus OLAPIndex::find_prev_point( - const RowBlockPosition& current, RowBlockPosition* prev) const { - OLAPIndexOffset current_offset = _index.get_offset(current); - OLAPIndexOffset prev_offset = _index.prev(current_offset); - - return _index.get_row_block_position(prev_offset, prev); -} - -OLAPStatus OLAPIndex::advance_row_block(int64_t num_row_blocks, RowBlockPosition* position) const { - TABLE_PARAM_VALIDATE(); - POS_PARAM_VALIDATE(position); - - OLAPIndexOffset off = _index.get_offset(*position); - iterator_offset_t absolute_offset = _index.get_absolute_offset(off) + num_row_blocks; - if (absolute_offset >= _index.count()) { - return OLAP_ERR_INDEX_EOF; - } - - return _index.get_row_block_position(_index.get_relative_offset(absolute_offset), position); -} - -// PRECONDITION position1 < position2 -uint32_t OLAPIndex::compute_distance(const RowBlockPosition& position1, - const RowBlockPosition& position2) const { - iterator_offset_t offset1 = _index.get_absolute_offset(_index.get_offset(position1)); - iterator_offset_t offset2 = _index.get_absolute_offset(_index.get_offset(position2)); - - return offset2 > offset1 ? offset2 - offset1 : 0; -} - -OLAPStatus OLAPIndex::add_segment() { - // 打开文件 - ++_num_segments; - OLAPStatus res = OLAP_SUCCESS; - OLAPIndexHeaderMessage* index_header = NULL; - - string file_path = - _table->construct_index_file_path(version(), version_hash(), _num_segments - 1); - res = _current_file_handler.open_with_mode( - file_path.c_str(), O_CREAT | O_EXCL | O_WRONLY, S_IRUSR | S_IWUSR); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("can not create file. 
[file_path='%s' err=%m]", file_path.c_str()); - _check_io_error(res); - return res; - } - - // 构造Protoæ ¼å¼çš„Header - index_header = _file_header.mutable_message(); - index_header->set_start_version(_version.first); - index_header->set_end_version(_version.second); - index_header->set_cumulative_version_hash(_version_hash); - index_header->set_segment(_num_segments - 1); - index_header->set_num_rows_per_block(_table->num_rows_per_row_block()); - index_header->set_delete_flag(_delete_flag); - index_header->set_null_supported(true); - - // 准备FileHeader - if ((res = _file_header.prepare(&_current_file_handler)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("write file header error. [err=%m]"); - _check_io_error(res); - return res; - } - - // 跳过FileHeader - if (_current_file_handler.seek(_file_header.size(), SEEK_SET) == -1) { - OLAP_LOG_WARNING("lseek header file error. [err=%m]"); - res = OLAP_ERR_IO_ERROR; - _check_io_error(res); - return res; - } - - // 分é…一段存储short key的内存, åˆå§‹åŒ–index_row - if (_short_key_buf == NULL) { - _short_key_buf = new(std::nothrow) char[_short_key_length]; - if (_short_key_buf == NULL) { - OLAP_LOG_WARNING("malloc short_key_buf error."); - return OLAP_ERR_MALLOC_ERROR; - } - - if (_current_index_row.init(_table->tablet_schema()) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("init _current_index_row fail."); - return OLAP_ERR_INIT_FAILED; - } - } - - // åˆå§‹åŒ–checksum - _checksum = ADLER32_INIT; - return OLAP_SUCCESS; -} - -OLAPStatus OLAPIndex::add_row_block(const RowBlock& row_block, const uint32_t data_offset) { - // get first row of the row_block to distill index item. - row_block.get_row(0, &_current_index_row); - return add_short_key(_current_index_row, data_offset); -} - -OLAPStatus OLAPIndex::add_short_key(const RowCursor& short_key, const uint32_t data_offset) { - // å°†short key的内容写入_short_key_buf - OLAPStatus res = OLAP_SUCCESS; - size_t offset = 0; - - //short_key.write_null_array(_short_key_buf); - //offset += short_key.get_num_null_byte(); - for (size_t i = 0; i < _short_key_info_list.size(); i++) { - short_key.write_index_by_index(i, _short_key_buf + offset); - offset += short_key.get_index_size(i) + 1; - } - - // 写入Short Keyå¯¹åº”çš„æ•°æ® - if ((res = _current_file_handler.write(_short_key_buf, _short_key_length)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("write short key failed. [err=%m]"); - _check_io_error(res); - return res; - } - - // å†™å…¥å¯¹åº”çš„æ•°æ®æ–‡ä»¶åç§»é‡ - if ((res = _current_file_handler.write(&data_offset, sizeof(data_offset))) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("write data_offset failed. [err=%m]"); - _check_io_error(res); - return res; - } - - _checksum = olap_adler32(_checksum, _short_key_buf, _short_key_length); - _checksum = olap_adler32(_checksum, - reinterpret_cast(&data_offset), - sizeof(data_offset)); - return OLAP_SUCCESS; -} - -OLAPStatus OLAPIndex::finalize_segment(uint32_t data_segment_size, int64_t num_rows) { - // 准备FileHeader - OLAPStatus res = OLAP_SUCCESS; - - int file_length = _current_file_handler.tell(); - if (file_length == -1) { - OLAP_LOG_WARNING("get file_length error. [err=%m]"); - _check_io_error(res); - return OLAP_ERR_IO_ERROR; - } - - _file_header.set_file_length(file_length); - _file_header.set_checksum(_checksum); - _file_header.mutable_extra()->data_length = data_segment_size; - _file_header.mutable_extra()->num_rows = num_rows; - - // 写入更新之åŽçš„FileHeader - if ((res = _file_header.serialize(&_current_file_handler)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("write file header error. 
[err=%m]"); - _check_io_error(res); - return res; - } - - OLAP_LOG_DEBUG("finalize_segment. [file_name='%s' file_length=%d]", - _current_file_handler.file_name().c_str(), - file_length); - - if ((res = _current_file_handler.close()) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("close file error. [err=%m]"); - _check_io_error(res); - return res; - } - - return OLAP_SUCCESS; -} - -void OLAPIndex::sync() { - if (_current_file_handler.sync() == -1) { - OLAP_LOG_WARNING("fail to sync file.[err=%m]"); - _table->set_io_error(); - } -} - -VersionHash OLAPIndex::version_hash() const { - return _version_hash; -} - -void OLAPIndex::_check_io_error(OLAPStatus res) { - if (is_io_error(res)) { - _table->set_io_error(); - } -} - -uint64_t OLAPIndex::num_index_entries() const { - return _index.count(); -} MemIndex::MemIndex() : _key_length(0), @@ -1015,7 +426,7 @@ OLAPStatus MemIndex::get_entry(const OLAPIndexOffset& pos, EntrySlice* slice) co OLAPStatus MemIndex::get_row_block_position( const OLAPIndexOffset& pos, RowBlockPosition* rbp) const { - if (empty()) { + if (zero_num_rows()) { return OLAP_ERR_INDEX_EOF; } @@ -1068,50 +479,4 @@ const OLAPIndexOffset MemIndex::get_relative_offset(iterator_offset_t absolute_o offset.offset = absolute_offset - _meta[offset.segment].range.first; return offset; } - -OLAPUnusedIndex::OLAPUnusedIndex() { - -} - -OLAPUnusedIndex::~OLAPUnusedIndex() { - clear(); -} - -void OLAPUnusedIndex::start_delete_unused_index() { - _mutex.lock(); - - for (unused_index_list_t::iterator it = _unused_index_list.begin(); - it != _unused_index_list.end();) { - if (!(*it)->is_in_use()) { - OLAP_LOG_TRACE("deleting index succeed, it is in use. [version=%d,%d version_hash=%lu]", - (*it)->version().first, - (*it)->version().second, - (*it)->version_hash()); - (*it)->delete_all_files(); - SAFE_DELETE(*it); - it = _unused_index_list.erase(it); - } else { - OLAP_LOG_TRACE("fail to delete index, it is in use. 
[version=%d,%d version_hash=%lu]", - (*it)->version().first, (*it)->version().second, - (*it)->version_hash()); - ++it; - } - } - - _mutex.unlock(); -} - -void OLAPUnusedIndex::add_unused_index(OLAPIndex* olap_index) { - _mutex.lock(); - - unused_index_list_t::iterator iter = find(_unused_index_list.begin(), - _unused_index_list.end(), - olap_index); - if (iter == _unused_index_list.end()) { - _unused_index_list.push_back(olap_index); - } - - _mutex.unlock(); -} - } // namespace palo diff --git a/be/src/olap/olap_index.h b/be/src/olap/olap_index.h index b74919b5b1..49957b2c6b 100644 --- a/be/src/olap/olap_index.h +++ b/be/src/olap/olap_index.h @@ -37,7 +37,7 @@ namespace palo { class IndexComparator; -class OLAPIndex; +class Rowset; class OLAPTable; class RowBlock; class RowCursor; @@ -165,7 +165,7 @@ struct SegmentMetaInfo { // In memory index structure, all index hold here class MemIndex { public: - friend class OLAPIndex; + friend class Rowset; friend class IndexComparator; friend class SegmentComparator; @@ -233,8 +233,6 @@ public: // the 2-dimension offset of the first element of second segment is (1, 0), // it's plain offset is 100 const iterator_offset_t get_absolute_offset(const OLAPIndexOffset& offset) const { - //size_t num_rows_per_block = - // _meta[offset.segment].file_header.message().num_rows_per_block(); if (offset.segment >= segment_count() || offset.offset >= _meta[offset.segment].count()) { return _num_entries; } else { @@ -290,7 +288,7 @@ public: return _meta.size(); } - bool empty() const { + bool zero_num_rows() const { return _num_entries == 0; } @@ -428,284 +426,6 @@ private: RowCursor* _helper_cursor; }; -// Class for managing OLAP table indices -// For fast key lookup, we maintain a sparse index for every data file. The -// index is sparse because we only have one pointer per row block. Each -// index entry contains the short key for the first row of the -// corresponding row block -class OLAPIndex { - friend class MemIndex; -public: - OLAPIndex(OLAPTable* table, - Version version, - VersionHash version_hash, - bool delete_flag, - uint32_t num_segments, - time_t max_timestamp); - - virtual ~OLAPIndex(); - - // Load the index into memory. - OLAPStatus load(); - bool index_loaded(); - OLAPStatus load_pb(const char* file, uint32_t seg_id); - - bool has_column_statistics() { - return _inited_column_statistics; - } - - OLAPStatus set_column_statistics( - const std::vector>& column_statistics); - - const std::vector>& get_column_statistics() { - return _column_statistics; - } - - OLAPStatus set_column_statistics_from_string( - std::vector> &column_statistics_string, - std::vector &has_null_flags); - - // 检查index文件和data文件的有效性 - OLAPStatus validate(); - - // Finds position of the first (or last if find_last is set) row - // block that may contain the smallest key equal to or greater than - // 'key'. Returns true on success. If find_last is set, note that - // the position is the last block that can possibly contain the - // given key. - OLAPStatus find_row_block(const RowCursor& key, - RowCursor* helper_cursor, - bool find_last, - RowBlockPosition* position) const; - - // Finds position of first row block contain the smallest key equal - // to or greater than 'key'. Returns true on success. - OLAPStatus find_short_key(const RowCursor& key, - RowCursor* helper_cursor, - bool find_last, - RowBlockPosition* position) const; - - // Returns position of the first row block in the index. 
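The two-dimensional offsets MemIndex works with (a segment index plus an offset inside that segment) convert to and from a flat position through per-segment ranges, as the get_absolute_offset() comment above explains with its (1, 0) to 100 example. A small sketch of that mapping, assuming for this illustration that each segment records the half-open span of flat positions it owns and that at least one segment exists:

#include <cstddef>
#include <utility>
#include <vector>

struct IndexOffset { std::size_t segment; std::size_t offset; };

// ranges[i] is the half-open [first, last) span of flat positions owned by
// segment i; spans are contiguous and ascending, e.g. {0,100}, {100,180}, ...
std::size_t to_absolute(const std::vector<std::pair<std::size_t, std::size_t>>& ranges,
                        const IndexOffset& off) {
    return ranges[off.segment].first + off.offset;
}

IndexOffset to_relative(const std::vector<std::pair<std::size_t, std::size_t>>& ranges,
                        std::size_t absolute) {
    IndexOffset off{0, 0};
    // Walk segments until the one whose span contains `absolute` is found;
    // the real code keeps this as SegmentMetaInfo::range.
    while (off.segment + 1 < ranges.size() && absolute >= ranges[off.segment].second) {
        ++off.segment;
    }
    off.offset = absolute - ranges[off.segment].first;
    return off;
}

The row-block position lookups that the comment above introduces are declared next.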
- OLAPStatus find_first_row_block(RowBlockPosition* position) const; - - // Returns position of the last row block in the index. - OLAPStatus find_last_row_block(RowBlockPosition* position) const; - - // Given the position of a row block, finds position of the next block. - // Sets eof to tru if there are no more blocks to go through, and - // returns false. Returns true on success. - OLAPStatus find_next_row_block(RowBlockPosition* position, bool* eof) const; - - // Given two positions in an index, low and high, set output to be - // the midpoint between those two positions. Returns the distance - // between low and high as computed by ComputeDistance. - OLAPStatus find_mid_point(const RowBlockPosition& low, - const RowBlockPosition& high, - RowBlockPosition* output, - uint32_t* dis) const; - - OLAPStatus find_prev_point(const RowBlockPosition& current, RowBlockPosition* prev) const; - - OLAPStatus get_row_block_entry(const RowBlockPosition& pos, EntrySlice* entry) const; - - // Given a starting row block position, advances the position by - // num_row_blocks, then stores back the new position through the - // pointer. Returns true on success, false on attempt to seek past - // the last block. - OLAPStatus advance_row_block(int64_t num_row_blocks, RowBlockPosition* position) const; - - // Computes the distance between two positions, in row blocks. - uint32_t compute_distance(const RowBlockPosition& position1, - const RowBlockPosition& position2) const; - - // The following four functions are used for creating new index - // files. AddSegment() and FinalizeSegment() start and end a new - // segment respectively, while IndexRowBlock() and IndexShortKey() - // add a new index entry to the current segment. - OLAPStatus add_segment(); - OLAPStatus add_short_key(const RowCursor& short_key, const uint32_t data_offset); - OLAPStatus add_row_block(const RowBlock& row_block, const uint32_t data_offset); - OLAPStatus finalize_segment(uint32_t data_segment_size, int64_t num_rows); - void sync(); - - // reference count - void acquire(); - void release(); - bool is_in_use(); - int64_t ref_count(); - - // delete all files (*.idx; *.dat) - void delete_all_files(); - - // getters and setters. 
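acquire(), release() and is_in_use(), declared a few lines up, are what let OLAPUnusedIndex (removed earlier in this file) defer physical file deletion until no reader still holds the index. The same pattern reduced to standard C++, as a sketch rather than the real classes:

#include <atomic>
#include <cstdint>
#include <list>
#include <memory>
#include <mutex>

// A minimal stand-in for an index whose files may only be removed once no
// reader holds a reference any more.
class RefCountedIndex {
public:
    void acquire() { _ref_count.fetch_add(1); }
    void release() { _ref_count.fetch_sub(1); }
    bool is_in_use() const { return _ref_count.load() > 0; }
    void delete_all_files() { /* unlink the *.idx / *.dat files here */ }

private:
    std::atomic<int64_t> _ref_count{0};
};

// Mirrors OLAPUnusedIndex::start_delete_unused_index(): retired indices are
// parked in a list and physically removed only when their ref count is zero.
class UnusedIndexList {
public:
    void add(std::shared_ptr<RefCountedIndex> index) {
        std::lock_guard<std::mutex> guard(_mutex);
        _unused.push_back(std::move(index));
    }

    void sweep() {
        std::lock_guard<std::mutex> guard(_mutex);
        for (auto it = _unused.begin(); it != _unused.end();) {
            if (!(*it)->is_in_use()) {
                (*it)->delete_all_files();
                it = _unused.erase(it);
            } else {
                ++it;                       // still referenced, retry later
            }
        }
    }

private:
    std::list<std::shared_ptr<RefCountedIndex>> _unused;
    std::mutex _mutex;
};

The getter section announced by the comment above follows.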
- // get associated OLAPTable pointer - OLAPTable* table() const { - return _table; - } - - void set_table(OLAPTable* table) { - _table = table; - } - - Version version() const { - return _version; - } - - VersionHash version_hash() const; - - bool delete_flag() const { - return _index.delete_flag(); - } - - uint32_t num_segments() const { - return _num_segments; - } - - void set_num_segments(uint32_t num_segments) { - _num_segments = num_segments; - } - - time_t max_timestamp() const { - return _max_timestamp; - } - - size_t index_size() const { - return _index.index_size(); - } - - size_t data_size() const { - return _index.data_size(); - } - - int64_t num_rows() const { - return _index.num_rows(); - } - - const size_t short_key_length() const { - return _short_key_length; - } - - const size_t new_short_key_length() const { - return _new_short_key_length; - } - - const RowFields& short_key_fields() const { - return _short_key_info_list; - } - - bool empty() const { - return _index.empty(); - } - - // return count of entries in MemIndex - uint64_t num_index_entries() const; - - size_t current_num_rows_per_row_block() const { - return _current_num_rows_per_row_block; - } - - OLAPStatus get_row_block_position(const OLAPIndexOffset& pos, RowBlockPosition* rbp) const { - return _index.get_row_block_position(pos, rbp); - } - - inline const FileHeader* get_seg_pb(uint32_t seg_id) const { - return &(_seg_pb_map.at(seg_id)); - } - - inline bool get_null_supported(uint32_t seg_id) { - return _index.get_null_supported(seg_id); - } - -private: - void _check_io_error(OLAPStatus res); - - std::string _construct_index_file_path(const Version& version, - VersionHash version_hash, - uint32_t segment) const { - return OLAPTable::construct_file_path(_header_file_name, - version, - version_hash, - segment, - "idx"); - } - - std::string _construct_data_file_path(const Version& version, - VersionHash version_hash, - uint32_t segment) const { - return OLAPTable::construct_file_path(_header_file_name, - version, - version_hash, - segment, - "dat"); - } - - OLAPTable* _table; // table definition for this index - Version _version; // version of associated data file - bool _delete_flag; - time_t _max_timestamp; // max pusher delta timestamp - uint32_t _num_segments; // number of segments in this index - VersionHash _version_hash; // version hash for this index - bool _index_loaded; // whether the index has been read - atomic_t _ref_count; // reference count - MemIndex _index; - - std::string _header_file_name; // the name of the related header file - // short key对应的field_info数组 - RowFields _short_key_info_list; - // short key对应的总长度 - size_t _short_key_length; - size_t _new_short_key_length; - - // 以下是写入æµç¨‹æ—¶éœ€è¦çš„ä¸€äº›ä¸­é—´çŠ¶æ€ - // 当å‰å†™å…¥æ–‡ä»¶çš„FileHandler - FileHandler _current_file_handler; - // 当å‰å†™å…¥çš„FileHeader - FileHeader _file_header; - // 当å‰å†™å…¥çš„short_keyçš„buf - char* _short_key_buf; - // 当å‰å†™å…¥çš„segmentçš„checksum - uint32_t _checksum; - // 当å‰å†™å…¥æ—¶ç”¨ä½œç´¢å¼•项的RowCursor - RowCursor _current_index_row; - - // Lock held while loading the index. 
- mutable boost::mutex _index_load_lock; - - size_t _current_num_rows_per_row_block; - - bool _inited_column_statistics; - - std::vector> _column_statistics; - std::vector _has_null_flags; - std::unordered_map > _seg_pb_map; - - DISALLOW_COPY_AND_ASSIGN(OLAPIndex); -}; - -class OLAPUnusedIndex { - DECLARE_SINGLETON(OLAPUnusedIndex); -public: - OLAPStatus init() { - clear(); - return OLAP_SUCCESS; - } - - void clear() { - _unused_index_list.clear(); - } - - void start_delete_unused_index(); - - void add_unused_index(OLAPIndex* olap_index); - -private: - typedef std::list unused_index_list_t; - unused_index_list_t _unused_index_list; - MutexLock _mutex; - - DISALLOW_COPY_AND_ASSIGN(OLAPUnusedIndex); -}; - } // namespace palo #endif // BDG_PALO_BE_SRC_OLAP_OLAP_INDEX_H diff --git a/be/src/olap/olap_main.cpp b/be/src/olap/olap_main.cpp deleted file mode 100755 index bc6bc1371d..0000000000 --- a/be/src/olap/olap_main.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "olap/olap_main.h" - -#include -#include - -#include "olap/command_executor.h" -#include "olap/olap_define.h" -#include "olap/olap_engine.h" -#include "olap/olap_index.h" -#include "olap/olap_server.h" -#include "olap/utils.h" - -using std::string; - -namespace palo { - -// åˆå§‹åŒ–所有的singletonå¯¹è±¡ï¼ŒåŒæ—¶åˆå§‹åŒ–comlog, -// 这么åšä¸ºäº†å°½é‡æŠŠæ—¥å¿—打到日志文件中 -static bool touch_all_singleton() { - OLAPStatus res = OLAP_SUCCESS; - - OLAPRootPath* root_path = OLAPRootPath::get_instance(); - if (NULL == root_path || OLAP_SUCCESS != (res = root_path->init())) { - OLAP_LOG_FATAL("fail to init olap root path. [res=%d]", res); - return false; - } - - OLAPEngine* engine = OLAPEngine::get_instance(); - if (NULL == engine || OLAP_SUCCESS != (res = engine->init())) { - OLAP_LOG_FATAL("fail to init olap engine. [res=%d]", res); - return false; - } - - OLAPSnapshot* snapshot = OLAPSnapshot::get_instance(); - if (NULL == snapshot) { - OLAP_LOG_FATAL("fail to init olap snapshot. [res=%d]", res); - return false; - } - - OLAPUnusedIndex* unused_index = OLAPUnusedIndex::get_instance(); - if (NULL == unused_index || OLAP_SUCCESS != (res = unused_index->init())) { - OLAP_LOG_FATAL("fail to init delete unused index. 
[res=%d]", res); - return false; - } - - return true; -} - -#ifdef OLAP_UNIT_TEST -int olap_main(int argc, char** argv) { -#else -int olap_main(int argc, char** argv) { -#endif -#ifdef GOOGLE_PROFILER - const char* google_profiler_output = "profiler_out"; - ProfilerStart(google_profiler_output); - HeapProfilerStart("heap_prof"); -#endif - - int ret = 0; - OLAPServer server; - if (!touch_all_singleton()) { - OLAP_LOG_FATAL("fail to touch all singleton."); - ret = 1; - goto EXIT; - } - - if (OLAP_SUCCESS != server.init(NULL, NULL)) { - OLAP_LOG_FATAL("server init failed, exiting."); - ret = 1; - goto EXIT; - } - -#ifdef GOOGLE_PROFILER - HeapProfilerStop(); - ProfilerStop(); -#endif - -EXIT: - - return ret; -} - -} // namespace palo diff --git a/be/src/olap/olap_main.h b/be/src/olap/olap_main.h deleted file mode 100755 index 0f9618d0d0..0000000000 --- a/be/src/olap/olap_main.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef BDG_PALO_BE_SRC_OLAP_MAIN_H -#define BDG_PALO_BE_SRC_OLAP_MAIN_H - -#include - -#include "olap/olap_common.h" - -namespace palo { - -int olap_main(int argc, char** argv); - -} // namespace palo - -#endif // BDG_PALO_BE_SRC_OLAP_MAIN_H diff --git a/be/src/olap/olap_meta.cpp b/be/src/olap/olap_meta.cpp old mode 100644 new mode 100755 diff --git a/be/src/olap/olap_meta.h b/be/src/olap/olap_meta.h old mode 100644 new mode 100755 diff --git a/be/src/olap/olap_rootpath.cpp b/be/src/olap/olap_rootpath.cpp deleted file mode 100644 index ecc3a28847..0000000000 --- a/be/src/olap/olap_rootpath.cpp +++ /dev/null @@ -1,1387 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "olap/olap_rootpath.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "olap/file_helper.h" -#include "olap/olap_engine.h" -#include "olap/utils.h" - -using boost::filesystem::canonical; -using boost::filesystem::file_size; -using boost::filesystem::is_directory; -using boost::filesystem::path; -using boost::filesystem::recursive_directory_iterator; -using boost::interprocess::file_lock; -using std::find; -using std::fstream; -using std::make_pair; -using std::nothrow; -using std::pair; -using std::random_device; -using std::random_shuffle; -using std::set; -using std::sort; -using std::string; -using std::stringstream; -using std::unique; -using std::vector; - -namespace palo { - -static const char* const kMtabPath = "/etc/mtab"; -static const char* const kTouchPath = "/.touch_flag"; -static const char* const kUnusedFlagFilePrefix = "unused"; -static const char* const kTestFilePath = "/.testfile"; - -OLAPRootPath::OLAPRootPath() : - is_report_disk_state_already(false), - is_report_olap_table_already(false), - _test_file_write_buf(NULL), - _test_file_read_buf(NULL), - _total_storage_medium_type_count(0), - _available_storage_medium_type_count(0), - _effective_cluster_id(-1), - _is_all_cluster_id_exist(true), - _is_drop_tables(false) {} - -OLAPRootPath::~OLAPRootPath() { - clear(); -} - -OLAPStatus OLAPRootPath::init() { - OLAPStatus res = OLAP_SUCCESS; - string& root_paths = config::storage_root_path; - OLAP_LOG_DEBUG("root_path='%s'. ", root_paths.c_str()); - RootPathVec root_path_vec; - CapacityVec capacity_vec; - - _rand_seed = static_cast(time(NULL)); - _root_paths.clear(); - _min_percentage_of_error_disk = config::min_percentage_of_error_disk; - - if (posix_memalign((void**)&_test_file_write_buf, - DIRECT_IO_ALIGNMENT, - TEST_FILE_BUF_SIZE) != 0) { - OLAP_LOG_WARNING("fail to malloc _test_file_write_buf. [size=%lu]", TEST_FILE_BUF_SIZE); - clear(); - - return OLAP_ERR_MALLOC_ERROR; - } - - if (posix_memalign((void**)&_test_file_read_buf, - DIRECT_IO_ALIGNMENT, - TEST_FILE_BUF_SIZE) != 0) { - OLAP_LOG_WARNING("fail to malloc _test_file_read_buf. [size=%lu]", TEST_FILE_BUF_SIZE); - clear(); - - return OLAP_ERR_MALLOC_ERROR; - } - - _unused_flag_path = string(getenv("LOG_DIR")) + UNUSED_PREFIX; - if (!check_dir_existed(_unused_flag_path)) { - if ((res = create_dir(_unused_flag_path)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create unused flag path.[path='%s']", - _unused_flag_path.c_str()); - clear(); - - return res; - } - } else { - _remove_all_unused_flag_file(); - } - - res = parse_root_paths_from_string(root_paths.c_str(), &root_path_vec, &capacity_vec); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("get root path failed. [res=%d root_paths='%s']", - res, root_paths.c_str()); - clear(); - - return res; - } - - vector is_accessable_vec; - res = _check_root_paths(root_path_vec, &capacity_vec, &is_accessable_vec); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to check root path. [res=%d]", res); - clear(); - - return res; - } - - _effective_cluster_id = config::cluster_id; - res = check_all_root_path_cluster_id(root_path_vec, is_accessable_vec); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to check cluster info. 
[res=%d]", res); - clear(); - return res; - } - - for (size_t i = 0; i < root_path_vec.size(); ++i) { - RootPathInfo root_path_info; - root_path_info.path = root_path_vec[i]; - root_path_info.is_used = true; - root_path_info.capacity = capacity_vec[i]; - res = _update_root_path_info(root_path_vec[i], &root_path_info); - if (res != OLAP_SUCCESS || !is_accessable_vec[i]) { - OLAP_LOG_WARNING("fail to update root path info[root path='%s']", - root_path_vec[i].c_str()); - root_path_info.is_used = false; - _create_unused_flag_file(root_path_info.unused_flag_file); - } - - _root_paths.insert(pair(root_path_vec[i], root_path_info)); - } - - _update_storage_medium_type_count(); - - return res; -} - -OLAPStatus OLAPRootPath::clear() { - _root_paths.clear(); - - if (_test_file_read_buf != NULL) { - free(_test_file_read_buf); - _test_file_read_buf = NULL; - } - if (_test_file_write_buf != NULL) { - free(_test_file_write_buf); - _test_file_write_buf = NULL; - } - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPRootPath::get_root_path_used_stat(const string& root_path, bool* is_used) { - OLAPStatus res = OLAP_SUCCESS; - *is_used = false; - - _mutex.lock(); - - RootPathMap::iterator it = _root_paths.find(root_path); - if (it != _root_paths.end()) { - *is_used = it->second.is_used; - } else { - res = OLAP_ERR_INPUT_PARAMETER_ERROR; - } - - _mutex.unlock(); - - return res; -} - -OLAPStatus OLAPRootPath::set_root_path_used_stat(const string& root_path, bool is_used) { - OLAPStatus res = OLAP_SUCCESS; - - _mutex.lock(); - RootPathMap::iterator it = _root_paths.find(root_path); - if (it != _root_paths.end()) { - it->second.is_used = is_used; - if (!is_used) { - if ((res = _create_unused_flag_file(it->second.unused_flag_file)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create unused flag file." - "[root_path='%s' unused_flag_file='%s']", - root_path.c_str(), - it->second.unused_flag_file.c_str()); - } - } - } else { - res = OLAP_ERR_INPUT_PARAMETER_ERROR; - } - _mutex.unlock(); - - _update_storage_medium_type_count(); - - return res; -} - -void OLAPRootPath::get_all_available_root_path(RootPathVec* all_available_root_path) { - all_available_root_path->clear(); - _mutex.lock(); - - for (RootPathMap::iterator it = _root_paths.begin(); it != _root_paths.end(); ++it) { - if (it->second.is_used) { - all_available_root_path->push_back(it->first); - } - } - - _mutex.unlock(); -} - -OLAPStatus OLAPRootPath::get_all_root_path_info( - vector* root_paths_info, - bool need_capacity) { - - OLAPStatus res = OLAP_SUCCESS; - root_paths_info->clear(); - - _mutex.lock(); - for (RootPathMap::iterator it = _root_paths.begin(); it != _root_paths.end(); ++it) { - RootPathInfo info; - info.path = it->first; - info.is_used = it->second.is_used; - info.capacity = it->second.capacity; - root_paths_info->push_back(info); - } - _mutex.unlock(); - - if (need_capacity) { - for (auto& info: *root_paths_info) { - if (info.is_used) { - _get_root_path_capacity(info.path, &info.data_used_capacity, &info.available); - } else { - info.capacity = 1; - info.data_used_capacity = 0; - info.available = 0; - } - } - } - - return res; -} - -OLAPStatus OLAPRootPath::reload_root_paths(const char* root_paths) { - OLAPStatus res = OLAP_SUCCESS; - - RootPathVec root_path_vec; - CapacityVec capacity_vec; - res = parse_root_paths_from_string(root_paths, &root_path_vec, &capacity_vec); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("get root path failed when reload root path. 
[root_paths=%s]", root_paths); - return res; - } - - vector is_accessable_vec; - res = _check_root_paths(root_path_vec, &capacity_vec, &is_accessable_vec); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to check reload root paths. [res=%d root_paths=%s]", - res, root_paths); - return res; - } - - res = check_all_root_path_cluster_id(root_path_vec, is_accessable_vec); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to check cluster info. [res=%d]", res); - return res; - } - - _mutex.lock(); - - _remove_all_unused_flag_file(); - - for (RootPathMap::iterator it = _root_paths.begin(); it != _root_paths.end(); ++it) { - if (root_path_vec.end() == find(root_path_vec.begin(), root_path_vec.end(), it->first)) { - it->second.to_be_deleted = true; - } - } - - OLAPRootPath::RootPathVec root_path_to_be_loaded; - for (size_t i = 0; i < root_path_vec.size(); ++i) { - RootPathMap::iterator iter_root_path = _root_paths.find(root_path_vec[i]); - if (iter_root_path == _root_paths.end()) { - RootPathInfo root_path_info; - root_path_info.path = root_path_vec[i]; - root_path_info.is_used = true; - root_path_info.capacity = capacity_vec[i]; - root_path_to_be_loaded.push_back(root_path_vec[i]); - res = _update_root_path_info(root_path_vec[i], &root_path_info); - if (res != OLAP_SUCCESS || !is_accessable_vec[i]) { - OLAP_LOG_WARNING("fail to update root path info.[root path='%s']", - root_path_vec[i].c_str()); - root_path_info.is_used = false; - root_path_to_be_loaded.pop_back(); - _create_unused_flag_file(root_path_info.unused_flag_file); - } - - _root_paths.insert(pair(root_path_vec[i], root_path_info)); - } else { - if (!iter_root_path->second.is_used) { - iter_root_path->second.is_used = true; - iter_root_path->second.capacity = capacity_vec[i]; - root_path_to_be_loaded.push_back(root_path_vec[i]); - res = _update_root_path_info(iter_root_path->first, &iter_root_path->second); - if (res != OLAP_SUCCESS || !is_accessable_vec[i]) { - iter_root_path->second.is_used = false; - root_path_to_be_loaded.pop_back(); - _create_unused_flag_file(iter_root_path->second.unused_flag_file); - } - } - } - } - - vector table_info_vec; - for (RootPathMap::iterator iter_root_path = _root_paths.begin(); - iter_root_path != _root_paths.end();) { - if (iter_root_path->second.to_be_deleted) { - for (set::iterator iter_table = iter_root_path->second.table_set.begin(); - iter_table != iter_root_path->second.table_set.end(); - ++iter_table) { - table_info_vec.push_back(*iter_table); - } - - _root_paths.erase(iter_root_path++); - } else { - ++iter_root_path; - } - } - - _mutex.unlock(); - - _update_storage_medium_type_count(); - - OLAPEngine::get_instance()->drop_tables_on_error_root_path(table_info_vec); - OLAPEngine::get_instance()->load_root_paths(root_path_to_be_loaded); - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPRootPath::register_table_into_root_path(OLAPTable* olap_table) { - OLAPStatus res = OLAP_SUCCESS; - _mutex.lock(); - - RootPathMap::iterator it = _root_paths.find(olap_table->storage_root_path_name()); - if (it == _root_paths.end()) { - OLAP_LOG_WARNING("fail to register table into root path.[root_path='%s' table='%s']", - olap_table->storage_root_path_name().c_str(), - olap_table->full_name().c_str()); - res = OLAP_ERR_INVALID_ROOT_PATH; - } else { - TableInfo table_info(olap_table->tablet_id(), - olap_table->schema_hash()); - it->second.table_set.insert(table_info); - } - - _mutex.unlock(); - - return res; -} - -OLAPStatus OLAPRootPath::unregister_table_from_root_path(OLAPTable* olap_table) { - 
_mutex.lock(); - - RootPathMap::iterator it = _root_paths.find(olap_table->storage_root_path_name()); - if (it == _root_paths.end()) { - OLAP_LOG_WARNING("fail to unregister table into root path.[root_path='%s' table='%s']", - olap_table->storage_root_path_name().c_str(), - olap_table->full_name().c_str()); - } else { - TableInfo table_info(olap_table->tablet_id(), - olap_table->schema_hash()); - it->second.table_set.erase(table_info); - } - - _mutex.unlock(); - - return OLAP_SUCCESS; -} - -void OLAPRootPath::start_disk_stat_monitor() { - _start_check_disks(); - _detect_unused_flag(); - _delete_tables_on_unused_root_path(); - - // if drop tables - // notify disk_state_worker_thread and olap_table_worker_thread until they received - if (_is_drop_tables) { - disk_broken_cv.notify_all(); - - bool is_report_disk_state_expected = true; - bool is_report_olap_table_expected = true; - bool is_report_disk_state_exchanged = - is_report_disk_state_already.compare_exchange_strong(is_report_disk_state_expected, false); - bool is_report_olap_table_exchanged = - is_report_olap_table_already.compare_exchange_strong(is_report_olap_table_expected, false); - if (is_report_disk_state_exchanged && is_report_olap_table_exchanged) { - _is_drop_tables = false; - } - } -} - - -void OLAPRootPath::_start_check_disks() { - OLAPRootPath::RootPathVec all_available_root_path; - get_all_available_root_path(&all_available_root_path); - for (OLAPRootPath::RootPathVec::iterator iter = all_available_root_path.begin(); - iter != all_available_root_path.end(); ++iter) { - OLAPStatus res = OLAP_SUCCESS; - if ((res = _read_and_write_test_file(*iter)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("root path occur IO Error. [root path='%s']", - iter->c_str()); - - if (is_io_error(res)) { - set_root_path_used_stat(*iter, false); - } - } - } -} - -bool OLAPRootPath::_used_disk_not_enough(uint32_t unused_num, uint32_t total_num) { - return ((total_num == 0) || (unused_num * 100 / total_num > _min_percentage_of_error_disk)); -} - -OLAPStatus OLAPRootPath::_check_root_paths( - RootPathVec& root_path_vec, - CapacityVec* capacity_vec, - vector* is_accessable_vec) { - OLAPStatus res = OLAP_SUCCESS; - - size_t i = 0; - for (RootPathVec::iterator iter = root_path_vec.begin(); iter != root_path_vec.end();) { - // if root path exist: - // check extension, capacity and create subdir; - // else: - // if root path can be created, return error; - // otherwise, the disk may be corrupted, check it when recover. - if (_check_root_path_exist(*iter)) { - string align_tag_path = *iter + ALIGN_TAG_PREFIX; - if (access(align_tag_path.c_str(), F_OK) == 0) { - OLAP_LOG_WARNING("disk with align tag find. [root_path='%s']", (*iter).c_str()); - root_path_vec.erase(iter); - continue; - } - - res = _check_existed_root_path(*iter, &(*capacity_vec)[i]); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to check existed root path. [res=%d]", res); - return res; - } - - is_accessable_vec->push_back(true); - ++i; - ++iter; - } else { - path boost_path = *iter; - path last_boost_path = boost_path; - boost_path = boost_path.parent_path(); - while (!check_dir_existed(boost_path.string())) { - last_boost_path = boost_path; - boost_path = boost_path.parent_path(); - } - - res = create_dirs(*iter); - if (res != OLAP_SUCCESS && errno != EACCES) { - OLAP_LOG_WARNING("root path is unusable! [root_path='%s' err='%m']", - (*iter).c_str()); - is_accessable_vec->push_back(false); - ++i; - ++iter; - continue; - } else { - OLAP_LOG_WARNING("root path not exist. 
[root_path='%s']", (*iter).c_str()); - remove_dir(last_boost_path.string()); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } - } - } - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPRootPath::_check_existed_root_path( - const std::string& root_path, - int64_t* capacity) { - path boost_path = root_path; - string extension = canonical(boost_path).extension().string(); - if (extension != "" && extension != ".SSD" && extension != ".ssd" - && extension != ".HDD" && extension != ".hdd") { - OLAP_LOG_WARNING("root path has wrong extension. [root_path='%s']", - root_path.c_str()); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } - - int64_t disk_capacity = space(boost_path).capacity; - if (*capacity == -1) { - *capacity = disk_capacity; - } else if (*capacity > disk_capacity) { - OLAP_LOG_WARNING("root path capacity should not larger than disk capacity. " - "[root_path='%s' root_path_capacity=%lu disk_capacity=%lu]", - root_path.c_str(), *capacity, disk_capacity); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } else { - *capacity = *capacity; - } - - string data_path = root_path + DATA_PREFIX; - if (!check_dir_existed(data_path) && create_dir(data_path) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to create data root path. [path='%s']", data_path.c_str()); - return OLAP_ERR_CANNOT_CREATE_DIR; - } - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPRootPath::_read_and_write_test_file(const string& root_path) { - OLAPStatus res = OLAP_SUCCESS; - string test_file = root_path + kTestFilePath; - FileHandler file_handler; - - if (access(test_file.c_str(), F_OK) == 0) { - if (remove(test_file.c_str()) != 0) { - OLAP_LOG_WARNING("fail to delete test file. [err='%m' path='%s']", test_file.c_str()); - return OLAP_ERR_IO_ERROR; - } - } else { - if (errno != ENOENT) { - OLAP_LOG_WARNING("fail to access test file. [err='%m' path='%s']", test_file.c_str()); - return OLAP_ERR_IO_ERROR; - } - } - - if ((res = file_handler.open_with_mode(test_file.c_str(), - O_RDWR | O_CREAT | O_DIRECT, - S_IRUSR | S_IWUSR)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create test file. [file_name=%s]", test_file.c_str()); - return res; - } - - for (size_t i = 0; i < TEST_FILE_BUF_SIZE; ++i) { - int32_t tmp_value = rand_r(&_rand_seed); - _test_file_write_buf[i] = static_cast(tmp_value); - } - - if ((res = file_handler.pwrite(_test_file_write_buf, TEST_FILE_BUF_SIZE, SEEK_SET)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to write test file. [file_name=%s]", test_file.c_str()); - return res; - } - - if ((res = file_handler.pread(_test_file_read_buf, TEST_FILE_BUF_SIZE, SEEK_SET)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to read test file. [file_name=%s]", test_file.c_str()); - return res; - } - - if (memcmp(_test_file_write_buf, _test_file_read_buf, TEST_FILE_BUF_SIZE) != 0) { - OLAP_LOG_WARNING("the test file write_buf and read_buf not equal."); - return OLAP_ERR_TEST_FILE_ERROR; - } - - if ((res = file_handler.close()) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to close test file. [file_name=%s]", test_file.c_str()); - return res; - } - - if (remove(test_file.c_str()) != 0) { - OLAP_LOG_TRACE("fail to delete test file. 
[err='%m' path='%s']", test_file.c_str()); - return OLAP_ERR_IO_ERROR; - } - - return res; -} - -OLAPStatus OLAPRootPath::_create_unused_flag_file(string& unused_flag_file) { - OLAPStatus res = OLAP_SUCCESS; - string unused_flag_file_path = _unused_flag_path + "/" + unused_flag_file; - if (!check_dir_existed(_unused_flag_path)) { - if (OLAP_SUCCESS != (res = create_dir(_unused_flag_path))) { - OLAP_LOG_WARNING("fail to create unused flag path.[path='%s']", - _unused_flag_path.c_str()); - return res; - } - } - - if (access(unused_flag_file_path.c_str(), F_OK) != 0) { - int fd = open(unused_flag_file_path.c_str(), - O_RDWR | O_CREAT, - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); - if ((fd >= 0) && !close(fd)) { - OLAP_LOG_TRACE("success to create unused flag file.[path='%s']", - unused_flag_file_path.c_str()); - } else { - OLAP_LOG_WARNING("fail to create unused flag file.[err='%m' path='%s']", - _unused_flag_path.c_str()); - res = OLAP_ERR_OTHER_ERROR; - } - } - - return res; -} - -OLAPStatus OLAPRootPath::parse_root_paths_from_string( - const char* root_paths, - RootPathVec* root_path_vec, - CapacityVec* capacity_vec) { - root_path_vec->clear(); - capacity_vec->clear(); - - try { - vector item_vec; - boost::split(item_vec, root_paths, boost::is_any_of(";"), boost::token_compress_on); - for (string item : item_vec) { - vector tmp_vec; - boost::split(tmp_vec, item, boost::is_any_of(","), boost::token_compress_on); - - // parse root path name - boost::trim(tmp_vec[0]); - tmp_vec[0].erase(tmp_vec[0].find_last_not_of("/") + 1); - if (tmp_vec[0].size() == 0 || tmp_vec[0][0] != '/') { - OLAP_LOG_WARNING("invalid root path name. [root_path='%s']", - tmp_vec[0].c_str()); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } - root_path_vec->push_back(tmp_vec[0]); - - // parse root path capacity - if (tmp_vec.size() > 1) { - if (!valid_signed_number(tmp_vec[1]) - || strtol(tmp_vec[1].c_str(), NULL, 10) < 0) { - OLAP_LOG_WARNING("invalid capacity of root path. [capacity='%s']", - tmp_vec[1].c_str()); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } - - capacity_vec->push_back(strtol(tmp_vec[1].c_str(), NULL, 10) * GB_EXCHANGE_BYTE); - } else { - capacity_vec->push_back(-1); - } - } - } catch (...) { - OLAP_LOG_WARNING("get root path failed. [root_paths: %s]", root_paths); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } - - if (root_path_vec->size() == 0) { - OLAP_LOG_WARNING("there are no valid root path."); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } - - // verify if root path dumplicated - vector verify_vec = *root_path_vec; - sort(verify_vec.begin(), verify_vec.end()); - verify_vec.erase(unique(verify_vec.begin(), verify_vec.end()), verify_vec.end()); - if (verify_vec.size() != root_path_vec->size()) { - OLAP_LOG_WARNING("there are dumplicated root paths. 
[root_paths='%s']", root_paths); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPRootPath::_get_root_path_capacity( - const string& root_path, - int64_t* data_used, - int64_t* disk_available) { - OLAPStatus res = OLAP_SUCCESS; - int64_t used = 0; - - OlapStopWatch watch; - try { - path boost_root_path(root_path + DATA_PREFIX); - for (recursive_directory_iterator it(boost_root_path); - it != recursive_directory_iterator(); ++it) { - if (!is_directory(*it)) { - used += file_size(*it); - } - } - *data_used = used; - - boost::filesystem::path path_name(root_path); - boost::filesystem::space_info path_info = boost::filesystem::space(path_name); - *disk_available = path_info.available; - } catch (boost::filesystem::filesystem_error& e) { - OLAP_LOG_WARNING("get space info failed. [path: %s, erro:%s]", root_path.c_str(), e.what()); - return OLAP_ERR_STL_ERROR; - } - - OLAP_LOG_INFO("get all root path capacity cost: %ld us", watch.get_elapse_time_us()); - return res; -} - -OLAPStatus OLAPRootPath::_get_root_path_file_system(const string& root_path, string* file_system) { - struct stat s; - if (stat(root_path.c_str(), &s) != 0) { - OLAP_LOG_WARNING("get path stat failed.[err=%m path='%s']", root_path.c_str()); - return OLAP_ERR_OS_ERROR; - } - - dev_t mountDevice; - if ((s.st_mode & S_IFMT) == S_IFBLK) { - mountDevice = s.st_rdev; - } else { - mountDevice = s.st_dev; - } - - FILE* mountTable = NULL; - if ((mountTable = setmntent(kMtabPath, "r")) == NULL) { - OLAP_LOG_WARNING("fail to open the file system description.[file='%s']", kMtabPath); - return OLAP_ERR_OS_ERROR; - } - - bool is_find = false; - struct mntent* mountEntry = NULL; - while ((mountEntry = getmntent(mountTable)) != NULL) { - if (strcmp(root_path.c_str(), mountEntry->mnt_dir) == 0 - || strcmp(root_path.c_str(), mountEntry->mnt_fsname) == 0) { - is_find = true; - break; - } - - if (stat(mountEntry->mnt_fsname, &s) == 0 && s.st_rdev == mountDevice) { - is_find = true; - break; - } - - if (stat(mountEntry->mnt_dir, &s) == 0 && s.st_dev == mountDevice) { - is_find = true; - break; - } - } - - endmntent(mountTable); - - if (!is_find) { - OLAP_LOG_WARNING("fail to find file system.[path='%s']", root_path.c_str()); - return OLAP_ERR_OS_ERROR; - } - - file_system->assign(mountEntry->mnt_fsname); - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPRootPath::_get_root_path_current_shard(const string& root_path, uint64_t* shard) { - OLAPStatus res = OLAP_SUCCESS; - - set shards; - res = dir_walk(root_path + DATA_PREFIX, &shards, NULL); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to dir walk root path.[root path='%s']", - (root_path + DATA_PREFIX).c_str()); - return res; - } - - errno = 0; - char* end_ptr = NULL; - uint64_t max_shard = 0; - for (const auto& i : shards) { - uint64_t j = strtoul(i.c_str(), &end_ptr, 10); - if (*end_ptr != 0 || errno != 0) { - OLAP_LOG_WARNING("fail to convert shard string to int. [shard='%s']", i.c_str()); - continue; - } - - max_shard = j > max_shard ? j : max_shard; - } - - *shard = max_shard; - return res; -} - -OLAPStatus OLAPRootPath::_config_root_path_unused_flag_file(const string& root_path, - string* unused_flag_file) { - vector vector_name_element; - - try { - boost::split(vector_name_element, root_path, - boost::is_any_of("/"), - boost::token_compress_on); - } catch (...) 
{ - OLAP_LOG_WARNING("get root path unused file name failed.[root_path='%s']", - root_path.c_str()); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } - - *unused_flag_file = kUnusedFlagFilePrefix; - - for (vector::iterator it = vector_name_element.begin(); - it != vector_name_element.end(); ++it) { - if (it->size() == 0) { - continue; - } - - *unused_flag_file += "_" + *it; - } - - return OLAP_SUCCESS; -} - -OLAPStatus OLAPRootPath::_update_root_path_info( - const string& root_path, RootPathInfo* root_path_info) { - OLAPStatus res = OLAP_SUCCESS; - - root_path_info->available = 0; - root_path_info->to_be_deleted = false; - root_path_info->table_set.clear(); - - res = _config_root_path_unused_flag_file(root_path, &root_path_info->unused_flag_file); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get unused flag file.[root path='%s']", root_path.c_str()); - return res; - } - - if (!_check_root_path_exist(root_path)) { - OLAP_LOG_WARNING("root path not exist. [root_path='%s']", root_path.c_str()); - return OLAP_ERR_TEST_FILE_ERROR; - } - - string align_tag_path = root_path + ALIGN_TAG_PREFIX; - if (access(align_tag_path.c_str(), F_OK) == 0) { - OLAP_LOG_WARNING("disk with align tag find. [root_path='%s']", root_path.c_str()); - return OLAP_ERR_INVALID_ROOT_PATH; - } - - res = _check_existed_root_path(root_path, &root_path_info->capacity); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to check existed root path. [res=%d]", res); - return res; - } - - root_path_info->storage_medium = TStorageMedium::HDD; - if (is_ssd_disk(root_path)) { - root_path_info->storage_medium = TStorageMedium::SSD; - } - - res = _get_root_path_file_system(root_path, &root_path_info->file_system); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get file system.[root path='%s']", root_path.c_str()); - return res; - } - - res = _get_root_path_current_shard(root_path, &root_path_info->current_shard); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get current shard.[root path='%s']", root_path.c_str()); - return res; - } - - return res; -} - -bool OLAPRootPath::is_ssd_disk(const std::string& file_path) { - path boost_root_path = file_path; - string extension = canonical(boost_root_path).extension().string(); - if (extension == ".SSD" || extension == ".ssd") { - return true; - } - return false; -} - -void OLAPRootPath::_delete_tables_on_unused_root_path() { - vector table_info_vec; - uint32_t unused_root_path_num = 0; - uint32_t total_root_path_num = 0; - _mutex.lock(); - - for (RootPathMap::iterator iter_root_path = _root_paths.begin(); - iter_root_path != _root_paths.end(); ++iter_root_path) { - total_root_path_num++; - - if (!iter_root_path->second.is_used) { - unused_root_path_num++; - - for (set::iterator iter_table = iter_root_path->second.table_set.begin(); - iter_table != iter_root_path->second.table_set.end(); ++iter_table) { - table_info_vec.push_back(*iter_table); - } - - iter_root_path->second.table_set.clear(); - } - } - - _mutex.unlock(); - - if (_used_disk_not_enough(unused_root_path_num, total_root_path_num)) { - OLAP_LOG_FATAL("engine stop running, because more than %d disks error." 
- "[total_disks=%d error_disks=%d]", - _min_percentage_of_error_disk, - total_root_path_num, - unused_root_path_num); - exit(0); - } - - if (!table_info_vec.empty()) { - _is_drop_tables = true; - } - - OLAPEngine::get_instance()->drop_tables_on_error_root_path(table_info_vec); -} - -void OLAPRootPath::_detect_unused_flag() { - set unused_falg_files; - if (!check_dir_existed(_unused_flag_path)) { - if (create_dir(_unused_flag_path) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create unused flag path.[path='%s']", - _unused_flag_path.c_str()); - } - - return; - } - - dir_walk(_unused_flag_path, NULL, &unused_falg_files); - - OLAPRootPath::RootPathVec root_paths_to_be_loaded; - - _mutex.lock(); - - for (RootPathMap::iterator it = _root_paths.begin(); it != _root_paths.end(); ++it) { - set::iterator jt = unused_falg_files.find(it->second.unused_flag_file); - if (!it->second.is_used) { - if (jt == unused_falg_files.end()) { - it->second.is_used = true; - root_paths_to_be_loaded.push_back(it->first); - OLAPStatus res = _update_root_path_info(it->first, &it->second); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to update the root path info.[root path='%s']", - it->first.c_str()); - it->second.is_used = false; - root_paths_to_be_loaded.pop_back(); - _create_unused_flag_file(it->second.unused_flag_file); - } else { - res = _check_recover_root_path_cluster_id(it->first); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to check cluster id. [res=%d]", res); - it->second.is_used = false; - root_paths_to_be_loaded.pop_back(); - _create_unused_flag_file(it->second.unused_flag_file); - } - } - } - } else { - if (jt != unused_falg_files.end()) { - if (it->second.is_used) { - OLAP_LOG_WARNING("detect unused flag, unuse the rootpath." - "[root_path='%s' flag='%s']", - it->first.c_str(), - jt->c_str()); - } - - it->second.is_used = false; - } - } - } - - _mutex.unlock(); - - _update_storage_medium_type_count(); - - if (root_paths_to_be_loaded.size() > 0) { - OLAPEngine::get_instance()->load_root_paths(root_paths_to_be_loaded); - } -} - -void OLAPRootPath::_remove_all_unused_flag_file() { - set unused_falg_files; - dir_walk(_unused_flag_path, NULL, &unused_falg_files); - - for (set::iterator it = unused_falg_files.begin(); - it != unused_falg_files.end(); ++it) { - if (it->find(kUnusedFlagFilePrefix) == 0) { - string unused_flag_file = _unused_flag_path + "/" + it->c_str(); - if (remove(unused_flag_file.c_str()) != 0) { - OLAP_LOG_WARNING("fail to remove unused flag file.[file='%s']", it->c_str()); - } - } - } -} - -void OLAPRootPath::get_root_path_for_create_table( - TStorageMedium::type storage_medium, RootPathVec *root_path) { - root_path->clear(); - - _mutex.lock(); - for (RootPathMap::iterator it = _root_paths.begin(); it != _root_paths.end(); ++it) { - if (it->second.is_used) { - if (_available_storage_medium_type_count == 1 - || it->second.storage_medium == storage_medium) { - root_path->push_back(it->first); - } - } - } - _mutex.unlock(); - - random_device rd; - srand(rd()); - random_shuffle(root_path->begin(), root_path->end()); -} - -void OLAPRootPath::get_table_data_path(std::vector* data_paths) { - _mutex.lock(); - - for (RootPathMap::iterator it = _root_paths.begin(); - it != _root_paths.end(); - ++it) { - data_paths->push_back(it->first); - } - - _mutex.unlock(); -} - -OLAPStatus OLAPRootPath::get_root_path_shard(const std::string& root_path, uint64_t* shard) { - OLAPStatus res = OLAP_SUCCESS; - AutoMutexLock auto_lock(&_mutex); - - RootPathMap::iterator it = 
_root_paths.find(root_path); - if (it == _root_paths.end()) { - OLAP_LOG_WARNING("fail to find root path. [root_path='%s']", root_path.c_str()); - return OLAP_ERR_NO_AVAILABLE_ROOT_PATH; - } - - uint64_t current_shard = it->second.current_shard; - stringstream shard_path_stream; - shard_path_stream << root_path << DATA_PREFIX << "/" << current_shard; - string shard_path = shard_path_stream.str(); - if (!check_dir_existed(shard_path)) { - res = create_dir(shard_path); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create path. [path='%s']", shard_path.c_str()); - return res; - } - } - - set tablets; - res = dir_walk(shard_path, &tablets, NULL); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to dir walk root path. [res=%d root_path='%s']", - res, root_path.c_str()); - return res; - } - - if (tablets.size() > config::max_tablet_num_per_shard) { - ++current_shard; - it->second.current_shard = current_shard; - } - - *shard = current_shard; - return OLAP_SUCCESS; -} - -void OLAPRootPath::_update_storage_medium_type_count() { - set total_storage_medium_types; - set available_storage_medium_types; - - _mutex.lock(); - for (RootPathMap::iterator it = _root_paths.begin(); it != _root_paths.end(); ++it) { - total_storage_medium_types.insert(it->second.storage_medium); - if (it->second.is_used) { - available_storage_medium_types.insert(it->second.storage_medium); - } - } - _mutex.unlock(); - - _total_storage_medium_type_count = total_storage_medium_types.size(); - _available_storage_medium_type_count = available_storage_medium_types.size(); -} - -OLAPStatus OLAPRootPath::_get_cluster_id_path_vec( - vector* cluster_id_path_vec) { - OLAPStatus res = OLAP_SUCCESS; - - vector root_path_info_vec; - res = get_all_root_path_info(&root_path_info_vec, false); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get root path info. [res=%d]", res); - return res; - } - - for (const auto& info: root_path_info_vec) { - if (info.is_used) { - cluster_id_path_vec->push_back(info.path + CLUSTER_ID_PREFIX); - } - } - - return res; -} - -OLAPStatus OLAPRootPath::_get_cluster_id_from_path(const string& path, int32_t* cluster_id) { - OLAPStatus res = OLAP_SUCCESS; - int32_t tmp_cluster_id = -1; - - fstream fs(path.c_str(), fstream::in); - if (!fs.is_open()) { - OLAP_LOG_WARNING("fail to open cluster id path. [path='%s']", path.c_str()); - return OLAP_ERR_IO_ERROR; - } - - fs >> tmp_cluster_id; - fs.close(); - - if (tmp_cluster_id == -1 && (fs.rdstate() & fstream::eofbit) != 0) { - *cluster_id = -1; - return res; - } else if (tmp_cluster_id >= 0 && (fs.rdstate() & fstream::eofbit) != 0) { - *cluster_id = tmp_cluster_id; - return res; - } else { - OLAP_LOG_WARNING("fail to read cluster id from file. " - "[id=%d eofbit=%d failbit=%d badbit=%d]", - tmp_cluster_id, - fs.rdstate() & fstream::eofbit, - fs.rdstate() & fstream::failbit, - fs.rdstate() & fstream::badbit); - return OLAP_ERR_IO_ERROR; - } -} - -OLAPStatus OLAPRootPath::_write_cluster_id_to_path(const string& path, int32_t cluster_id) { - OLAPStatus res = OLAP_SUCCESS; - - fstream fs(path.c_str(), fstream::out); - if (!fs.is_open()) { - OLAP_LOG_WARNING("fail to open cluster id path. 
[path='%s']", path.c_str()); - return OLAP_ERR_IO_ERROR; - } - - fs << cluster_id; - fs.close(); - - return res; -} - -OLAPStatus OLAPRootPath::_judge_and_update_effective_cluster_id(int32_t cluster_id) { - OLAPStatus res = OLAP_SUCCESS; - - if (cluster_id == -1 && _effective_cluster_id == -1) { - // maybe this is a new cluster, cluster id will get from heartbeate - return res; - } else if (cluster_id != -1 && _effective_cluster_id == -1) { - _effective_cluster_id = cluster_id; - } else if (cluster_id == -1 && _effective_cluster_id != -1) { - // _effective_cluster_id is the right effective cluster id - return res; - } else { - if (cluster_id != _effective_cluster_id) { - OLAP_LOG_WARNING("multiple cluster ids is not equal. [id1=%d id2=%d]", - _effective_cluster_id, cluster_id); - return OLAP_ERR_INVALID_CLUSTER_INFO; - } - } - - return res; -} - -OLAPStatus OLAPRootPath::_check_recover_root_path_cluster_id(const std::string& root_path) { - OLAPStatus res = OLAP_SUCCESS; - - // prepare: check cluster id file exist, if not, create it - string path = root_path + CLUSTER_ID_PREFIX; - if (access(path.c_str(), F_OK) != 0) { - int fd = open(path.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); - if (fd < 0 || close(fd) < 0) { - OLAP_LOG_WARNING("fail to create file. [path='%s' err='%m']", path.c_str()); - return OLAP_ERR_OTHER_ERROR; - } - } - - // obtain lock of cluster id path - FILE* fp = NULL; - fp = fopen(path.c_str(), "r+b"); - if (fp == NULL) { - OLAP_LOG_WARNING("fail to open cluster id path. [path='%s']", path.c_str()); - return OLAP_ERR_IO_ERROR; - } - - int lock_res = flock(fp->_fileno, LOCK_EX | LOCK_NB); - if (lock_res < 0) { - OLAP_LOG_WARNING("fail to lock file descriptor. [path='%s']", path.c_str()); - fclose(fp); - fp = NULL; - return OLAP_ERR_TRY_LOCK_FAILED; - } - - // obtain cluster id of root path - int32_t cluster_id = -1; - res = _get_cluster_id_from_path(path, &cluster_id); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get cluster id from path. [res=%d]", res); - fclose(fp); - fp = NULL; - return res; - } else if (cluster_id == -1 || _effective_cluster_id == -1) { - _is_all_cluster_id_exist = false; - } - - // judge and update effective cluster id - res = _judge_and_update_effective_cluster_id(cluster_id); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to judge and update effective cluster id. [res=%d]", res); - fclose(fp); - fp = NULL; - return res; - } - - // write cluster id into cluster_id_path if get effective cluster id success - if (_effective_cluster_id != -1 && !_is_all_cluster_id_exist) { - res = set_cluster_id(_effective_cluster_id); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to write cluster id to path. [res=%d]", res); - fclose(fp); - fp = NULL; - return res; - } - _is_all_cluster_id_exist = true; - } - - fclose(fp); - fp = NULL; - return res; -} - -bool OLAPRootPath::_check_root_path_exist(const string& root_path) { - bool is_exist = true; - DIR* dirp = opendir(root_path.c_str()); - do { - if (NULL == dirp) { - is_exist = false; - OLAP_LOG_WARNING("can't open root path. [root_path=%s]", root_path.c_str()); - break; - } - - if (readdir(dirp) == NULL) { - is_exist = false; - OLAP_LOG_WARNING("can't read root path. 
[root_path=%s]", root_path.c_str()); - break; - } - } while (0); - - closedir(dirp); - return is_exist; -} - -OLAPStatus OLAPRootPath::check_all_root_path_cluster_id( - const vector& root_path_vec, - const vector& is_accessable_vec) { - OLAPStatus res = OLAP_SUCCESS; - - // prepare: check cluster id file exist, if not, create it - vector cluster_id_path_vec; - for (size_t i = 0; i < root_path_vec.size(); ++i) { - if (!is_accessable_vec[i]) { - continue; - } - cluster_id_path_vec.push_back(root_path_vec[i] + CLUSTER_ID_PREFIX); - } - - for (const auto& path : cluster_id_path_vec) { - if (access(path.c_str(), F_OK) != 0) { - int fd = open(path.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); - if (fd < 0 || close(fd) < 0) { - OLAP_LOG_WARNING("fail to create file. [path='%s' err='%m']", path.c_str()); - return OLAP_ERR_OTHER_ERROR; - } - } - } - - // obtain lock of all cluster id paths - for (size_t i = 0; i < root_path_vec.size(); ++i) { - RootPathMap::iterator it = _root_paths.find(root_path_vec[i]); - if (is_accessable_vec[i] && it == _root_paths.end()) { - FILE* fp = NULL; - string path = root_path_vec[i] + CLUSTER_ID_PREFIX; - fp = fopen(path.c_str(), "r+b"); - if (fp == NULL) { - OLAP_LOG_WARNING("fail to open cluster id path. [path='%s']", path.c_str()); - return OLAP_ERR_IO_ERROR; - } - - int lock_res = flock(fp->_fileno, LOCK_EX | LOCK_NB); - if (lock_res < 0) { - OLAP_LOG_WARNING("fail to lock file descriptor. [path='%s']", path.c_str()); - fclose(fp); - fp = NULL; - return OLAP_ERR_TRY_LOCK_FAILED; - } - } - } - - // obtain cluster id of all root paths - int32_t cluster_id = -1; - for (const auto& path : cluster_id_path_vec) { - int32_t tmp_cluster_id = -1; - res = _get_cluster_id_from_path(path, &tmp_cluster_id); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get cluster id from path. [res=%d]", res); - return res; - } else if (tmp_cluster_id == -1) { - _is_all_cluster_id_exist = false; - } else if (tmp_cluster_id == cluster_id) { - // both hava right cluster id, do nothing - } else if (cluster_id == -1) { - cluster_id = tmp_cluster_id; - } else { - OLAP_LOG_WARNING("multiple cluster ids is not equal. [id1=%d id2=%d]", - cluster_id, tmp_cluster_id); - return OLAP_ERR_INVALID_CLUSTER_INFO; - } - } - - // judge and get effective cluster id - res = _judge_and_update_effective_cluster_id(cluster_id); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to judge and update effective cluster id. [res=%d]", res); - return res; - } - - // write cluster id into cluster_id_path if get effective cluster id success - if (_effective_cluster_id != -1 && !_is_all_cluster_id_exist) { - for (const string& path : cluster_id_path_vec) { - res = _write_cluster_id_to_path(path, _effective_cluster_id); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to write cluster id to path. [res=%d]", res); - return res; - } - } - _is_all_cluster_id_exist = true; - } - - return res; -} - -OLAPStatus OLAPRootPath::set_cluster_id(int32_t cluster_id) { - OLAPStatus res = OLAP_SUCCESS; - - vector cluster_id_path_vec; - res = _get_cluster_id_path_vec(&cluster_id_path_vec); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to get all cluster id path. [res=%d]", res); - return res; - } - - for (const string& path : cluster_id_path_vec) { - res = _write_cluster_id_to_path(path, cluster_id); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to write cluster id to path. 
[res=%d]", res); - return res; - } - } - - _effective_cluster_id = cluster_id; - _is_all_cluster_id_exist = true; - return res; -} - -} // namespace palo diff --git a/be/src/olap/olap_rootpath.h b/be/src/olap/olap_rootpath.h deleted file mode 100644 index 2ff8ceea02..0000000000 --- a/be/src/olap/olap_rootpath.h +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef BDG_PALO_BE_SRC_OLAP_OLAP_ROOTPATH_H -#define BDG_PALO_BE_SRC_OLAP_OLAP_ROOTPATH_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "olap/olap_cond.h" -#include "olap/olap_define.h" - -namespace palo { - -struct RootPathInfo { - RootPathInfo(): - capacity(1), - available(0), - data_used_capacity(0), - current_shard(0), - is_used(false), - to_be_deleted(false) {} - - std::string path; - std::string file_system; // 目录对应的ç£ç›˜åˆ†åŒº - std::string unused_flag_file; // ä¸å¯ç”¨æ ‡è¯†å¯¹åº”的文件å - int64_t capacity; // 总空间,å•ä½å­—节 - int64_t available; // å¯ç”¨ç©ºé—´ï¼Œå•ä½å­—节 - int64_t data_used_capacity; - uint64_t current_shard; // shard按0,1...æ–¹å¼ç¼–å·ï¼Œæœ€å¤§çš„shardå· - bool is_used; // 是å¦å¯ç”¨æ ‡è¯† - bool to_be_deleted; // 删除标识,如在reload时删除æŸä¸€ç›®å½• - TStorageMedium::type storage_medium; // 存储介质类型:SSD|HDD - std::set table_set; -}; - - -/* - * ç›®å‰æ‰€è°“çš„RootPath指的是storage_root_path,其目录组织结构如下: - * - * Storage_root_path ==>根目录,由é…置指定 - * |-data ==>存放有效数æ®çš„目录,åˆç§°tables_root_path - * |-trash ==>存放已删除的tabletæ•°æ®ï¼ˆå›žæ”¶ç«™ï¼‰ï¼Œ - * `-snapshot ==>存放snapshot的目录 - */ -class OLAPRootPath { - DECLARE_SINGLETON(OLAPRootPath); -public: - typedef std::vector RootPathVec; - typedef std::vector CapacityVec; - - static OLAPStatus parse_root_paths_from_string( - const char* root_paths, - RootPathVec* root_path_vec, - CapacityVec* capacity_vec); - - // @brief åˆå§‹åŒ–。 - // 从é…置文件中读å–storage_root_pathä¿¡æ¯ï¼Œé‡åçš„path当æˆä¸€æ¡path, - // 校验å„root_path的目录ã€ç£ç›˜ç­‰ã€‚ - OLAPStatus init(); - - OLAPStatus clear(); - - // @brief 查询root_path是å¦å¯ç”¨ - // @param root_path [in] 被查询的root_path - // @param is_used [out] 是å¦ä½¿ç”¨çš„çŠ¶æ€ - OLAPStatus get_root_path_used_stat(const std::string& root_path, bool* is_used); - - // @brief 设置root_path是å¦å¯ç”¨ - OLAPStatus set_root_path_used_stat(const std::string& root_path, bool is_used); - - // @brief 获å–当å‰å¯ç”¨çš„root_path,并根æ®å¯ç”¨å®¹é‡å¤§å°æŽ’åº - void get_all_available_root_path(RootPathVec* all_available_root_path); - - // @brief èŽ·å–æ‰€æœ‰root_pathä¿¡æ¯ - OLAPStatus get_all_root_path_info( - std::vector* root_paths_info, - bool need_capacity = true); - - // @brief 釿–°åŠ è½½root_pathsä¿¡æ¯ï¼Œå…¨é‡æ“作。 - // 对于新增的root_path,åŒinitæ“作 - // 对于删除的root_path,è¦åŒæ—¶ä»Žå†…存中删除相关表。 - // 对于未å˜çš„,如果之å‰ä¸ºä¸å¯ä½¿ç”¨çжæ€ï¼Œåˆ™éœ€è¦é‡æ–°è¿›è¡Œroot_path检测 - // - // NOTE: be/ce scheduler policy doesn't support - OLAPStatus reload_root_paths(const char* root_paths); - - OLAPStatus register_table_into_root_path(OLAPTable* olap_table); - - 
OLAPStatus unregister_table_from_root_path(OLAPTable* olap_table); - - // ç£ç›˜çжæ€ç›‘测。监测unused_flag路劲新的对应root_path unused标识ä½ï¼Œ - // 当检测到有unused标识时,从内存中删除对应表信æ¯ï¼Œç£ç›˜æ•°æ®ä¸åŠ¨ã€‚ - // 当ç£ç›˜çжæ€ä¸ºä¸å¯ç”¨ï¼Œä½†æœªæ£€æµ‹åˆ°unused标识时,需è¦ä»Žroot_path上 - // 釿–°åŠ è½½æ•°æ®ã€‚ - void start_disk_stat_monitor(); - - // get root path for creating table. The returned vector of root path should be random, - // for avoiding that all the table would be deployed one disk. - void get_root_path_for_create_table( - TStorageMedium::type storage_medium, RootPathVec *root_path); - void get_table_data_path(std::vector* data_paths); - - uint32_t available_storage_medium_type_count() { - return _available_storage_medium_type_count; - } - - uint32_t total_storage_medium_type_count() { - return _total_storage_medium_type_count; - } - - int32_t effective_cluster_id() const { - return _effective_cluster_id; - } - - OLAPStatus get_root_path_shard(const std::string& root_path, uint64_t* shard); - - static bool is_ssd_disk(const std::string& file_path); - - uint32_t get_file_system_count() { - return _root_paths.size(); - } - - virtual OLAPStatus set_cluster_id(int32_t cluster_id); - - OLAPStatus check_all_root_path_cluster_id( - const std::vector& root_path_vec, - const std::vector& is_accessable_vec); - - boost::condition_variable disk_broken_cv; - std::atomic_bool is_report_disk_state_already; - std::atomic_bool is_report_olap_table_already; - -private: - typedef std::map RootPathMap; - - // 检测ç£ç›˜ã€‚主è¦é€šè¿‡å‘¨æœŸåœ°è¯»å†™4Kçš„æµ‹è¯•æ•°æ® - void _start_check_disks(); - - bool _used_disk_not_enough(uint32_t unused_num, uint32_t total_num); - - OLAPStatus _check_existed_root_path(const std::string& root_path, int64_t* capacity); - - OLAPStatus _check_root_paths( - RootPathVec& root_path_vec, - CapacityVec* capacity_vec, - std::vector* is_accessable_vec); - - OLAPStatus _get_root_path_capacity( - const std::string& root_path, - int64_t* data_used, - int64_t* disk_available); - - OLAPStatus _get_root_path_file_system(const std::string& root_path, std::string* file_system); - - OLAPStatus _get_root_path_current_shard(const std::string& root_path, uint64_t* shard); - - OLAPStatus _config_root_path_unused_flag_file( - const std::string& root_path, - std::string* unused_flag_file); - - OLAPStatus _create_unused_flag_file(std::string& unused_flag_file); - - OLAPStatus _update_root_path_info(const std::string& root_path, RootPathInfo* root_path_info); - - OLAPStatus _read_and_write_test_file(const std::string& root_path); - - void _delete_tables_on_unused_root_path(); - - void _detect_unused_flag(); - - void _remove_all_unused_flag_file(); - - void _update_storage_medium_type_count(); - - OLAPStatus _get_cluster_id_path_vec(std::vector* cluster_id_path_vec); - - OLAPStatus _get_cluster_id_from_path(const std::string& path, int32_t* cluster_id); - OLAPStatus _write_cluster_id_to_path(const std::string& path, int32_t cluster_id); - - OLAPStatus _judge_and_update_effective_cluster_id(int32_t cluster_id); - - OLAPStatus _check_recover_root_path_cluster_id(const std::string& root_path); - - bool _check_root_path_exist(const std::string& root_path); - - RootPathMap _root_paths; - std::string _unused_flag_path; - char* _test_file_write_buf; - char* _test_file_read_buf; - uint32_t _rand_seed; - uint32_t _total_storage_medium_type_count; - uint32_t _available_storage_medium_type_count; - - int32_t _effective_cluster_id; - bool _is_all_cluster_id_exist; - bool _is_drop_tables; - - // 
错误ç£ç›˜æ‰€åœ¨ç™¾åˆ†æ¯”,超过设定的值,则engine需è¦é€€å‡ºè¿è¡Œ - uint32_t _min_percentage_of_error_disk; - MutexLock _mutex; - static const size_t TEST_FILE_BUF_SIZE = 4096; - static const size_t DIRECT_IO_ALIGNMENT = 512; - - DISALLOW_COPY_AND_ASSIGN(OLAPRootPath); -}; // class OLAPRootPath - -} // namespace palo - -#endif // BDG_PALO_BE_SRC_OLAP_OLAP_ROOTPATH_H - diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp index 39a1fa92fc..b2cd20c4c3 100644 --- a/be/src/olap/olap_server.cpp +++ b/be/src/olap/olap_server.cpp @@ -3,8 +3,7 @@ // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 +// // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an @@ -13,7 +12,7 @@ // specific language governing permissions and limitations // under the License. -#include "olap/olap_server.h" +#include "olap/olap_engine.h" #include #include @@ -24,15 +23,12 @@ #include -#include "olap/command_executor.h" #include "olap/cumulative_compaction.h" #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/olap_engine.h" -#include "olap/olap_snapshot.h" #include "agent/cgroups_mgr.h" - using std::string; namespace palo { @@ -40,86 +36,54 @@ namespace palo { // number of running SCHEMA-CHANGE threads volatile uint32_t g_schema_change_active_threads = 0; -MutexLock OLAPServer::_s_garbage_sweeper_mutex = MutexLock(); -Condition OLAPServer::_s_garbage_sweeper_cond = Condition(OLAPServer::_s_garbage_sweeper_mutex); - -MutexLock OLAPServer::_s_disk_stat_monitor_mutex = MutexLock(); -Condition OLAPServer::_s_disk_stat_monitor_cond = Condition(OLAPServer::_s_disk_stat_monitor_mutex); - -MutexLock OLAPServer::_s_unused_index_mutex = MutexLock(); -Condition OLAPServer::_s_unused_index_cond = Condition(OLAPServer::_s_unused_index_mutex); - -MutexLock OLAPServer::_s_check_disks_mutex = MutexLock(); -Condition OLAPServer::_s_check_disks_cond = Condition(OLAPServer::_s_check_disks_mutex); - -MutexLock OLAPServer::_s_session_timeout_mutex = MutexLock(); -Condition OLAPServer::_s_session_timeout_cond = Condition(OLAPServer::_s_session_timeout_mutex); - -OLAPServer::OLAPServer() { } - -OLAPStatus OLAPServer::init(const char* config_path, const char* config_file) { +OLAPStatus OLAPEngine::_start_bg_worker() { // start thread for monitoring the snapshot and trash folder - if (pthread_create(&_garbage_sweeper_thread, - NULL, - _garbage_sweeper_thread_callback, - NULL) != 0) { - OLAP_LOG_FATAL("failed to start garbage sweeper thread."); - return OLAP_ERR_INIT_FAILED; - } + _garbage_sweeper_thread = std::thread( + [this] { + _garbage_sweeper_thread_callback(nullptr); + }); // start thread for monitoring the table with io error - if (pthread_create(&_disk_stat_monitor_thread, - NULL, - _disk_stat_monitor_thread_callback, - NULL) != 0) { - OLAP_LOG_FATAL("failed to start disk stat monitor thread."); - return OLAP_ERR_INIT_FAILED; - } + _disk_stat_monitor_thread = std::thread( + [this] { + _disk_stat_monitor_thread_callback(nullptr); + }); // start thread for monitoring the unused index - if (pthread_create(&_unused_index_thread, - NULL, - _unused_index_thread_callback, - NULL) != 0) { - OLAP_LOG_FATAL("failed to start unused index thread."); - return OLAP_ERR_INIT_FAILED; - } + _unused_index_thread = std::thread( + [this] { + 
_unused_index_thread_callback(nullptr); + }); // start be and ce threads for merge data int32_t base_compaction_num_threads = config::base_compaction_num_threads; - _base_compaction_threads.resize(base_compaction_num_threads, -1); + _base_compaction_threads.reserve(base_compaction_num_threads); for (uint32_t i = 0; i < base_compaction_num_threads; ++i) { - if (0 != pthread_create(&_base_compaction_threads[i], - NULL, - _base_compaction_thread_callback, - NULL)) { - OLAP_LOG_FATAL("failed to start base compaction thread. [id=%u]", i); - return OLAP_ERR_INIT_FAILED; - } + _base_compaction_threads.emplace_back( + [this] { + _base_compaction_thread_callback(nullptr); + }); } int32_t cumulative_compaction_num_threads = config::cumulative_compaction_num_threads; - _cumulative_compaction_threads.resize(cumulative_compaction_num_threads, -1); + _cumulative_compaction_threads.reserve(cumulative_compaction_num_threads); for (uint32_t i = 0; i < cumulative_compaction_num_threads; ++i) { - if (0 != pthread_create(&(_cumulative_compaction_threads[i]), - NULL, - _cumulative_compaction_thread_callback, - NULL)) { - OLAP_LOG_FATAL("failed to start cumulative thread. [id=%u]", i); - return OLAP_ERR_INIT_FAILED; - } + _cumulative_compaction_threads.emplace_back( + [this] { + _cumulative_compaction_thread_callback(nullptr); + }); } - if (0 != pthread_create(&_fd_cache_clean_thread, NULL, _fd_cache_clean_callback, NULL)) { - OLAP_LOG_FATAL("failed to start fd_cache_clean thread"); - return OLAP_ERR_INIT_FAILED; - } + _fd_cache_clean_thread = std::thread( + [this] { + _fd_cache_clean_callback(nullptr); + }); OLAP_LOG_TRACE("init finished."); return OLAP_SUCCESS; } -void* OLAPServer::_fd_cache_clean_callback(void* arg) { +void* OLAPEngine::_fd_cache_clean_callback(void* arg) { #ifdef GOOGLE_PROFILER ProfilerRegisterThread(); #endif @@ -131,13 +95,13 @@ void* OLAPServer::_fd_cache_clean_callback(void* arg) { } while (true) { sleep(interval); - OLAPEngine::get_instance()->start_clean_fd_cache(); + start_clean_fd_cache(); } return NULL; } -void* OLAPServer::_base_compaction_thread_callback(void* arg) { +void* OLAPEngine::_base_compaction_thread_callback(void* arg) { #ifdef GOOGLE_PROFILER ProfilerRegisterThread(); #endif @@ -148,14 +112,14 @@ void* OLAPServer::_base_compaction_thread_callback(void* arg) { interval = 1; } - string last_base_compaction_fs; - TTabletId last_base_compaction_tablet_id = -1; + //string last_base_compaction_fs; + //TTabletId last_base_compaction_tablet_id = -1; while (true) { // must be here, because this thread is start on start and // cgroup is not initialized at this time // add tid to cgroup CgroupsMgr::apply_system_cgroup(); - OLAPEngine::get_instance()->start_base_compaction(&last_base_compaction_fs, &last_base_compaction_tablet_id); + perform_base_compaction(); usleep(interval * 1000000); } @@ -163,13 +127,12 @@ void* OLAPServer::_base_compaction_thread_callback(void* arg) { return NULL; } -void* OLAPServer::_garbage_sweeper_thread_callback(void* arg) { +void* OLAPEngine::_garbage_sweeper_thread_callback(void* arg) { #ifdef GOOGLE_PROFILER ProfilerRegisterThread(); #endif uint32_t max_interval = config::max_garbage_sweep_interval; uint32_t min_interval = config::min_garbage_sweep_interval; - AutoMutexLock l(&_s_garbage_sweeper_mutex); if (!(max_interval >= min_interval && min_interval > 0)) { OLAP_LOG_WARNING("garbage sweep interval config is illegal: [max=%d min=%d].", @@ -195,10 +158,10 @@ void* OLAPServer::_garbage_sweeper_thread_callback(void* arg) { // 
此时的特性,当usage<60%时,curr_interval的时间接近max_interval, // 当usage > 80%时,curr_interval接近min_interval curr_interval = curr_interval > min_interval ? curr_interval : min_interval; - _s_garbage_sweeper_cond.wait_for_seconds(curr_interval); + sleep(curr_interval); // 开始清ç†ï¼Œå¹¶å¾—到清ç†åŽçš„ç£ç›˜ä½¿ç”¨çއ - OLAPStatus res = OLAPEngine::get_instance()->start_trash_sweep(&usage); + OLAPStatus res = start_trash_sweep(&usage); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("one or more errors occur when sweep trash." "see previous message for detail. [err code=%d]", res); @@ -209,13 +172,12 @@ void* OLAPServer::_garbage_sweeper_thread_callback(void* arg) { return NULL; } -void* OLAPServer::_disk_stat_monitor_thread_callback(void* arg) { +void* OLAPEngine::_disk_stat_monitor_thread_callback(void* arg) { #ifdef GOOGLE_PROFILER ProfilerRegisterThread(); #endif uint32_t interval = config::disk_stat_monitor_interval; - AutoMutexLock l(&_s_disk_stat_monitor_mutex); if (interval <= 0) { OLAP_LOG_WARNING("disk_stat_monitor_interval config is illegal: [%d], " @@ -224,20 +186,19 @@ void* OLAPServer::_disk_stat_monitor_thread_callback(void* arg) { } while (true) { - OLAPRootPath::get_instance()->start_disk_stat_monitor(); - _s_disk_stat_monitor_cond.wait_for_seconds(interval); + start_disk_stat_monitor(); + sleep(interval); } return NULL; } -void* OLAPServer::_unused_index_thread_callback(void* arg) { +void* OLAPEngine::_unused_index_thread_callback(void* arg) { #ifdef GOOGLE_PROFILER ProfilerRegisterThread(); #endif uint32_t interval = config::unused_index_monitor_interval; - AutoMutexLock l(&_s_unused_index_mutex); if (interval <= 0) { OLAP_LOG_WARNING("unused_index_monitor_interval config is illegal: [%d], " @@ -246,22 +207,22 @@ void* OLAPServer::_unused_index_thread_callback(void* arg) { } while (true) { - OLAPUnusedIndex::get_instance()->start_delete_unused_index(); - _s_unused_index_cond.wait_for_seconds(interval); + start_delete_unused_index(); + sleep(interval); } return NULL; } -void* OLAPServer::_cumulative_compaction_thread_callback(void* arg) { +void* OLAPEngine::_cumulative_compaction_thread_callback(void* arg) { #ifdef GOOGLE_PROFILER ProfilerRegisterThread(); #endif - OLAP_LOG_INFO("try to start cumulative compaction process!"); + LOG(INFO) << "try to start cumulative compaction process!"; uint32_t interval = config::cumulative_compaction_check_interval_seconds; if (interval <= 0) { - OLAP_LOG_WARNING("cumulative compaction check interval config is illegal: [%d], " - "force set to 1", interval); + LOG(WARNING) << "cumulative compaction check interval config is illegal:" << interval << ", " + << "will be forced set to one"; interval = 1; } @@ -270,7 +231,7 @@ void* OLAPServer::_cumulative_compaction_thread_callback(void* arg) { // cgroup is not initialized at this time // add tid to cgroup CgroupsMgr::apply_system_cgroup(); - OLAPEngine::get_instance()->start_cumulative_priority(); + perform_cumulative_compaction(); usleep(interval * 1000000); } diff --git a/be/src/olap/olap_server.h b/be/src/olap/olap_server.h deleted file mode 100644 index 6fbd76cc94..0000000000 --- a/be/src/olap/olap_server.h +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef BDG_PALO_BE_SRC_OLAP_OLAP_SERVER_H -#define BDG_PALO_BE_SRC_OLAP_OLAP_SERVER_H - -#include "olap/atomic.h" -#include "olap/olap_define.h" -#include "olap/utils.h" - -namespace palo { -// @brief 基于ubserverçš„æœåŠ¡å™¨ç±»ï¼Œè´Ÿè´£æŽ¥æ”¶å‘é€å¤–界给OLAPEngine的请求 -class OLAPServer { -public: - OLAPServer(); - - // @brief åˆå§‹åŒ–server - // - // @param path é…置文件路径 - // @param file é…置文件å - OLAPStatus init(const char* path, const char* file); - -private: - // Thread functions - - // base compaction thread process function - static void* _base_compaction_thread_callback(void* arg); - - // garbage sweep thread process function. clear snapshot and trash folder - static void* _garbage_sweeper_thread_callback(void* arg); - - // delete table with io error process function - static void* _disk_stat_monitor_thread_callback(void* arg); - - // unused index process function - static void* _unused_index_thread_callback(void* arg); - - // cumulative process function - static void* _cumulative_compaction_thread_callback(void* arg); - - // clean file descriptors cache - static void* _fd_cache_clean_callback(void* arg); - - // thread to monitor snapshot expiry - pthread_t _garbage_sweeper_thread; - static MutexLock _s_garbage_sweeper_mutex; - static Condition _s_garbage_sweeper_cond; - - // thread to monitor disk stat - pthread_t _disk_stat_monitor_thread; - static MutexLock _s_disk_stat_monitor_mutex; - static Condition _s_disk_stat_monitor_cond; - - // thread to check disks - pthread_t _check_disks_thread; - static MutexLock _s_check_disks_mutex; - static Condition _s_check_disks_cond; - - // thread to monitor unused index - pthread_t _unused_index_thread; - static MutexLock _s_unused_index_mutex; - static Condition _s_unused_index_cond; - - // thread to check session timeout - pthread_t _session_timeout_thread; - static MutexLock _s_session_timeout_mutex; - static Condition _s_session_timeout_cond; - - // thread to run base compaction - std::vector _base_compaction_threads; - - // thread to check cumulative - std::vector _cumulative_compaction_threads; - - pthread_t _fd_cache_clean_thread; - - static atomic_t _s_request_number; -}; - -} // namespace palo - -#endif // BDG_PALO_BE_SRC_OLAP_OLAP_SERVER_H diff --git a/be/src/olap/olap_snapshot.cpp b/be/src/olap/olap_snapshot.cpp index a13c8d6755..1826615844 100644 --- a/be/src/olap/olap_snapshot.cpp +++ b/be/src/olap/olap_snapshot.cpp @@ -13,7 +13,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "olap/olap_snapshot.h" +#include "olap/olap_engine.h" #include #include @@ -33,11 +33,13 @@ #include "olap/olap_common.h" #include "olap/olap_data.h" #include "olap/olap_define.h" -#include "olap/olap_engine.h" -#include "olap/olap_index.h" +#include "olap/rowset.h" #include "olap/olap_table.h" +#include "olap/olap_header_manager.h" #include "olap/push_handler.h" +#include "olap/store.h" #include "util/file_utils.h" +#include "util/palo_metrics.h" using boost::filesystem::canonical; using boost::filesystem::copy_file; @@ -53,11 +55,7 @@ using std::list; namespace palo { -OLAPSnapshot::OLAPSnapshot(): _base_id(0) {} - -OLAPSnapshot::~OLAPSnapshot() {} - -OLAPStatus OLAPSnapshot::make_snapshot( +OLAPStatus OLAPEngine::make_snapshot( const TSnapshotRequest& request, string* snapshot_path) { OLAPStatus res = OLAP_SUCCESS; @@ -66,51 +64,55 @@ OLAPStatus OLAPSnapshot::make_snapshot( return OLAP_ERR_INPUT_PARAMETER_ERROR; } - SmartOLAPTable ref_olap_table = - OLAPEngine::get_instance()->get_table(request.tablet_id, request.schema_hash); + OLAPTablePtr ref_olap_table = get_table(request.tablet_id, request.schema_hash); if (ref_olap_table.get() == NULL) { OLAP_LOG_WARNING("failed to get olap table. [table=%ld schema_hash=%d]", request.tablet_id, request.schema_hash); return OLAP_ERR_TABLE_NOT_FOUND; } - res = _create_snapshot_files(ref_olap_table, request, snapshot_path); + if (request.__isset.missing_version) { + res = _create_incremental_snapshot_files(ref_olap_table, request, snapshot_path); + // if all nodes has been upgraded, it can be removed + (const_cast(request)).__set_allow_incremental_clone(true); + } else { + res = _create_snapshot_files(ref_olap_table, request, snapshot_path); + } + if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("failed to make snapshot. [res=%d table=%ld schema_hash=%d]", res, request.tablet_id, request.schema_hash); return res; } - OLAP_LOG_TRACE("success to make snapshot. [path='%s']", snapshot_path->c_str()); + VLOG(3) << "success to make snapshot. [path='" << snapshot_path << "']"; return res; } -OLAPStatus OLAPSnapshot::release_snapshot(const string& snapshot_path) { +OLAPStatus OLAPEngine::release_snapshot(const string& snapshot_path) { // 如果请求的snapshot_pathä½äºŽroot/snapshotæ–‡ä»¶å¤¹ä¸‹ï¼Œåˆ™è®¤ä¸ºæ˜¯åˆæ³•的,å¯ä»¥åˆ é™¤ // å¦åˆ™è®¤ä¸ºæ˜¯éžæ³•请求,返回错误结果 - OLAPRootPath::RootPathVec all_available_root_path; - OLAPRootPath::get_instance()->get_all_available_root_path(&all_available_root_path); - - for (std::string& root_path : all_available_root_path) { - path boost_root_path(root_path); + auto stores = get_stores(); + for (auto store : stores) { + path boost_root_path(store->path()); string abs_path = canonical(boost_root_path).string(); if (snapshot_path.compare(0, abs_path.size(), abs_path) == 0 && snapshot_path.compare(abs_path.size(), SNAPSHOT_PREFIX.size(), SNAPSHOT_PREFIX) == 0) { remove_all_dir(snapshot_path); - OLAP_LOG_TRACE("success to release snapshot path. [path='%s']", snapshot_path.c_str()); + VLOG(3) << "success to release snapshot path. [path='" << snapshot_path << "']"; return OLAP_SUCCESS; } } - OLAP_LOG_WARNING("released snapshot path illegal. [path='%s']", snapshot_path.c_str()); + LOG(WARNING) << "released snapshot path illegal. 
[path='" << snapshot_path << "']"; return OLAP_ERR_CE_CMD_PARAMS_ERROR; } -OLAPStatus OLAPSnapshot::_calc_snapshot_id_path( - const SmartOLAPTable& olap_table, +OLAPStatus OLAPEngine::_calc_snapshot_id_path( + const OLAPTablePtr& olap_table, string* out_path) { OLAPStatus res = OLAP_SUCCESS; if (out_path == nullptr) { @@ -127,15 +129,15 @@ OLAPStatus OLAPSnapshot::_calc_snapshot_id_path( } stringstream snapshot_id_path_stream; - AutoMutexLock auto_lock(&_mutex); // will automatically unlock when function return. + MutexLock auto_lock(&_snapshot_mutex); // will automatically unlock when function return. snapshot_id_path_stream << olap_table->storage_root_path_name() << SNAPSHOT_PREFIX - << "/" << time_str << "." << _base_id++; + << "/" << time_str << "." << _snapshot_base_id++; *out_path = snapshot_id_path_stream.str(); return res; } -string OLAPSnapshot::_get_schema_hash_full_path( - const SmartOLAPTable& ref_olap_table, +string OLAPEngine::_get_schema_hash_full_path( + const OLAPTablePtr& ref_olap_table, const string& location) const { stringstream schema_full_path_stream; schema_full_path_stream << location @@ -146,74 +148,76 @@ string OLAPSnapshot::_get_schema_hash_full_path( return schema_full_path; } -string OLAPSnapshot::_get_header_full_path( - const SmartOLAPTable& ref_olap_table, +string OLAPEngine::_get_header_full_path( + const OLAPTablePtr& ref_olap_table, const std::string& schema_hash_path) const { stringstream header_name_stream; header_name_stream << schema_hash_path << "/" << ref_olap_table->tablet_id() << ".hdr"; return header_name_stream.str(); } -void OLAPSnapshot::_update_header_file_info( +void OLAPEngine::_update_header_file_info( const vector& shortest_versions, - OLAPHeader* olap_header) { + OLAPHeader* header) { // clear schema_change_status - olap_header->clear_schema_change_status(); + header->clear_schema_change_status(); // remove all old version and add new version - olap_header->delete_all_versions(); + header->delete_all_versions(); - for (uint32_t i = 0; i < shortest_versions.size(); ++i) { - if (shortest_versions[i].column_statistics.size() == 0) { - olap_header->add_version( - shortest_versions[i].version, - shortest_versions[i].version_hash, - shortest_versions[i].num_segments, - 0, - shortest_versions[i].index_size, - shortest_versions[i].data_size, - shortest_versions[i].num_rows); - } else { - olap_header->add_version( - shortest_versions[i].version, - shortest_versions[i].version_hash, - shortest_versions[i].num_segments, - 0, - shortest_versions[i].index_size, - shortest_versions[i].data_size, - shortest_versions[i].num_rows, - &shortest_versions[i].column_statistics); + for (const VersionEntity& entity : shortest_versions) { + Version version = entity.version; + VersionHash v_hash = entity.version_hash; + for (RowSetEntity rowset : entity.rowset_vec) { + int32_t rowset_id = rowset.rowset_id; + const std::vector* column_statistics = nullptr; + if (!rowset.key_ranges.empty()) { + column_statistics = &(rowset.key_ranges); + } + header->add_version(version, v_hash, rowset_id, rowset.num_segments, rowset.index_size, + rowset.data_size, rowset.num_rows, rowset.empty, column_statistics); } } } -OLAPStatus OLAPSnapshot::_link_index_and_data_files( - const string& header_path, - const SmartOLAPTable& ref_olap_table, +OLAPStatus OLAPEngine::_link_index_and_data_files( + const string& schema_hash_path, + const OLAPTablePtr& ref_olap_table, const vector& version_entity_vec) { OLAPStatus res = OLAP_SUCCESS; + std::stringstream prefix_stream; + prefix_stream << 
schema_hash_path << "/" << ref_olap_table->tablet_id(); + std::string tablet_path_prefix = prefix_stream.str(); for (const VersionEntity& entity : version_entity_vec) { - for (uint32_t i = 0; i < entity.num_segments; ++i) { - string index_path = _construct_index_file_path( - header_path, entity.version, entity.version_hash, i); - string ref_table_index_path = ref_olap_table->construct_index_file_path( - entity.version, entity.version_hash, i); - res = _create_hard_link(ref_table_index_path, index_path); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create hard link. [header_path=%s from_path=%s to_path=%s]", - header_path.c_str(), ref_table_index_path.c_str(), index_path.c_str()); - return res; - } + Version version = entity.version; + VersionHash v_hash = entity.version_hash; + for (RowSetEntity rowset : entity.rowset_vec) { + int32_t rowset_id = rowset.rowset_id; + for (int seg_id = 0; seg_id < rowset.num_segments; ++seg_id) { + std::string index_path = + _construct_index_file_path(tablet_path_prefix, version, v_hash, rowset_id, seg_id); + std::string ref_table_index_path = + ref_olap_table->construct_index_file_path(version, v_hash, rowset_id, seg_id); + res = _create_hard_link(ref_table_index_path, index_path); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to create hard link. " + << " schema_hash_path=" << schema_hash_path + << " from_path=" << ref_table_index_path + << " to_path=" << index_path; + return res; + } - string data_path = _construct_data_file_path( - header_path, entity.version, entity.version_hash, i); - string ref_table_data_path = ref_olap_table->construct_data_file_path( - entity.version, entity.version_hash, i); - res = _create_hard_link(ref_table_data_path, data_path); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create hard link. [header_path=%s from_path=%s to_path=%s]", - header_path.c_str(), ref_table_data_path.c_str(), data_path.c_str()); - return res; + std:: string data_path = + _construct_data_file_path(tablet_path_prefix, version, v_hash, rowset_id, seg_id); + std::string ref_table_data_path = + ref_olap_table->construct_data_file_path(version, v_hash, rowset_id, seg_id); + res = _create_hard_link(ref_table_data_path, data_path); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to create hard link." 
+ << "tablet_path_prefix=" << tablet_path_prefix << ", " + << "from_path=" << ref_table_data_path << ", to_path=" << data_path; + return res; + } } } } @@ -221,36 +225,42 @@ OLAPStatus OLAPSnapshot::_link_index_and_data_files( return res; } -OLAPStatus OLAPSnapshot::_copy_index_and_data_files( - const string& header_path, - const SmartOLAPTable& ref_olap_table, +OLAPStatus OLAPEngine::_copy_index_and_data_files( + const string& schema_hash_path, + const OLAPTablePtr& ref_olap_table, vector& version_entity_vec) { - for (const VersionEntity& entity : version_entity_vec) { - for (uint32_t i = 0; i < entity.num_segments; ++i) { - string index_path; - string data_path; - string ref_table_index_path; - string ref_table_data_path; + std::stringstream prefix_stream; + prefix_stream << schema_hash_path << "/" << ref_olap_table->tablet_id(); + std::string tablet_path_prefix = prefix_stream.str(); + for (VersionEntity& entity : version_entity_vec) { + Version version = entity.version; + VersionHash v_hash = entity.version_hash; + for (RowSetEntity rowset : entity.rowset_vec) { + int32_t rowset_id = rowset.rowset_id; + for (int seg_id = 0; seg_id < rowset.num_segments; ++seg_id) { + string index_path = + _construct_index_file_path(tablet_path_prefix, version, v_hash, rowset_id, seg_id); + string ref_table_index_path = ref_olap_table->construct_index_file_path( + version, v_hash, rowset_id, seg_id); + Status res = FileUtils::copy_file(ref_table_index_path, index_path); + if (!res.ok()) { + LOG(WARNING) << "fail to copy index file." + << "dest=" << index_path << ", " + << "src=" << ref_table_index_path; + return OLAP_ERR_COPY_FILE_ERROR; + } - index_path = _construct_index_file_path( - header_path, entity.version, entity.version_hash, i); - ref_table_index_path = ref_olap_table->construct_index_file_path( - entity.version, entity.version_hash, i); - Status status = FileUtils::copy_file(ref_table_index_path, index_path); - if (!status.ok()) { - OLAP_LOG_WARNING("fail to copy file. [src='%s' dest='%s']", - ref_table_index_path.c_str(), index_path.c_str()); - return OLAP_ERR_COPY_FILE_ERROR; - } - data_path = _construct_data_file_path( - header_path, entity.version, entity.version_hash, i); - ref_table_data_path = ref_olap_table->construct_data_file_path( - entity.version, entity.version_hash, i); - status = FileUtils::copy_file(ref_table_data_path, data_path); - if (!status.ok()) { - OLAP_LOG_WARNING("fail to copy file. [src='%s' dest='%s']", - ref_table_data_path.c_str(), data_path.c_str()); - return OLAP_ERR_COPY_FILE_ERROR; + string data_path = + _construct_data_file_path(tablet_path_prefix, version, v_hash, rowset_id, seg_id); + string ref_table_data_path = ref_olap_table->construct_data_file_path( + version, v_hash, rowset_id, seg_id); + res = FileUtils::copy_file(ref_table_data_path, data_path); + if (!res.ok()) { + LOG(WARNING) << "fail to copy data file." 
+ << "dest=" << index_path << ", " + << "src=" << ref_table_index_path; + return OLAP_ERR_COPY_FILE_ERROR; + } } } } @@ -258,8 +268,8 @@ OLAPStatus OLAPSnapshot::_copy_index_and_data_files( return OLAP_SUCCESS; } -OLAPStatus OLAPSnapshot::_create_snapshot_files( - const SmartOLAPTable& ref_olap_table, +OLAPStatus OLAPEngine::_create_snapshot_files( + const OLAPTablePtr& ref_olap_table, const TSnapshotRequest& request, string* snapshot_path) { OLAPStatus res = OLAP_SUCCESS; @@ -296,25 +306,25 @@ OLAPStatus OLAPSnapshot::_create_snapshot_files( OLAPHeader* new_olap_header = nullptr; do { // get latest version - const FileVersionMessage* latest_version = NULL; - latest_version = ref_olap_table->latest_version(); - if (latest_version == NULL) { - OLAP_LOG_WARNING("table does not have any version. [path='%s']", + const PDelta* lastest_version = NULL; + lastest_version = ref_olap_table->lastest_version(); + if (lastest_version == NULL) { + OLAP_LOG_WARNING("table has not any version. [path='%s']", ref_olap_table->full_name().c_str()); res = OLAP_ERR_VERSION_NOT_EXIST; break; } // get snapshot version, use request.version if specified - int32_t version = latest_version->end_version(); + int32_t version = lastest_version->end_version(); if (request.__isset.version) { - if (latest_version->end_version() < request.version - || (latest_version->start_version() == latest_version->end_version() - && latest_version->end_version() == request.version - && latest_version->version_hash() != request.version_hash)) { + if (lastest_version->end_version() < request.version + || (lastest_version->start_version() == lastest_version->end_version() + && lastest_version->end_version() == request.version + && lastest_version->version_hash() != request.version_hash)) { OLAP_LOG_WARNING("invalid make snapshot request. " "[version=%d version_hash=%ld req_version=%d req_version_hash=%ld]", - latest_version->end_version(), latest_version->version_hash(), + lastest_version->end_version(), lastest_version->version_hash(), request.version, request.version_hash); res = OLAP_ERR_INPUT_PARAMETER_ERROR; break; @@ -346,18 +356,18 @@ OLAPStatus OLAPSnapshot::_create_snapshot_files( } // load table header, in order to remove versions that not in shortest version path - string old_header_path = ref_olap_table->header_file_name(); - new_olap_header = new(nothrow) OLAPHeader(old_header_path); + OlapStore* store = ref_olap_table->store(); + new_olap_header = new(nothrow) OLAPHeader(); if (new_olap_header == NULL) { OLAP_LOG_WARNING("fail to malloc OLAPHeader."); res = OLAP_ERR_MALLOC_ERROR; break; } - res = new_olap_header->load(); + res = OlapHeaderManager::get_header(store, ref_olap_table->tablet_id(), ref_olap_table->schema_hash(), new_olap_header); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to load header. [res=%d header_file=%s]", - res, old_header_path.c_str()); + LOG(WARNING) << "fail to load header. res=" << res + << "tablet_id=" << ref_olap_table->tablet_id() << ", schema_hash=" << ref_olap_table->schema_hash(); break; } @@ -366,15 +376,25 @@ OLAPStatus OLAPSnapshot::_create_snapshot_files( _update_header_file_info(shortest_versions, new_olap_header); // save new header - if ((res = new_olap_header->save(header_path)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("save header error. 
[table='%s' path='%s']", - ref_olap_table->full_name().c_str(), header_path.c_str()); + if ((res = OlapHeaderManager::save(store, ref_olap_table->tablet_id(), + ref_olap_table->schema_hash(), new_olap_header)) != OLAP_SUCCESS) { + LOG(WARNING) << "save header error. [table=" << ref_olap_table->full_name() + << "tablet_id=" << ref_olap_table->full_name() + << ", schema_hash=" << ref_olap_table->schema_hash(); + break; + } + + // save new header to snapshot header path + res = new_olap_header->save(header_path); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to save header. [res=%d tablet_id=%ld, schema_hash=%d, headerpath=%s]", + res, ref_olap_table->tablet_id(), ref_olap_table->schema_hash(), header_path.c_str()); break; } - res = _link_index_and_data_files(header_path, ref_olap_table, shortest_versions); + res = _link_index_and_data_files(schema_full_path, ref_olap_table, shortest_versions); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create hard link. [path=%s]", snapshot_id_path.c_str()); + LOG(WARNING) << "fail to create hard link. [path=" << snapshot_id_path << "]"; break; } @@ -383,7 +403,7 @@ OLAPStatus OLAPSnapshot::_create_snapshot_files( for (const VersionEntity& entity : shortest_versions) { if (entity.version.second == request.version) { if (entity.version.first != request.version) { - res = _append_single_delta(request, header_path); + res = _append_single_delta(request, store); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to append single delta. [res=%d]", res); } @@ -411,7 +431,7 @@ OLAPStatus OLAPSnapshot::_create_snapshot_files( snapshot_id_path.c_str()); if (check_dir_existed(snapshot_id_path)) { - OLAP_LOG_DEBUG("remove snapshot path. [path=%s]", snapshot_id_path.c_str()); + VLOG(3) << "remove snapshot path. [path=" << snapshot_id_path << "]"; remove_all_dir(snapshot_id_path); } } else { @@ -421,26 +441,149 @@ OLAPStatus OLAPSnapshot::_create_snapshot_files( return res; } -OLAPStatus OLAPSnapshot::_append_single_delta(const TSnapshotRequest& request, const string& header_path) { +OLAPStatus OLAPEngine::_create_incremental_snapshot_files( + const OLAPTablePtr& ref_olap_table, + const TSnapshotRequest& request, + string* snapshot_path) { + OLAP_LOG_INFO("begin to create incremental snapshot files. 
[table=%ld schema_hash=%d]", + request.tablet_id, request.schema_hash); OLAPStatus res = OLAP_SUCCESS; - OLAPTable* tablet = OLAPTable::create_from_header_file( - request.tablet_id, request.schema_hash, header_path); + if (snapshot_path == nullptr) { + OLAP_LOG_WARNING("output parameter cannot be NULL"); + return OLAP_ERR_INPUT_PARAMETER_ERROR; + } + + string snapshot_id_path; + res = _calc_snapshot_id_path(ref_olap_table, &snapshot_id_path); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("failed to calc snapshot_id_path, [ref table=%s]", + ref_olap_table->storage_root_path_name().c_str()); + return res; + } + + string schema_full_path = _get_schema_hash_full_path(ref_olap_table, snapshot_id_path); + if (check_dir_existed(schema_full_path)) { + OLAP_LOG_TRACE("remove the old schema_full_path."); + remove_all_dir(schema_full_path); + } + create_dirs(schema_full_path); + + path boost_path(snapshot_id_path); + string snapshot_id = canonical(boost_path).string(); + + ref_olap_table->obtain_header_rdlock(); + + do { + // save header to snapshot path + OLAPHeader olap_header; + res = OlapHeaderManager::get_header(ref_olap_table->store(), + ref_olap_table->tablet_id(), ref_olap_table->schema_hash(), &olap_header); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to load header. res=" << res << "tablet_id=" + << ref_olap_table->tablet_id() << ", schema_hash=" << ref_olap_table->schema_hash(); + break; + } + string header_path = _get_header_full_path(ref_olap_table, schema_full_path); + res = olap_header.save(header_path); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to save header to path:" << header_path; + remove_dir(header_path); + break; + } + + for (int64_t missing_version : request.missing_version) { + + // find missing version + const PDelta* incremental_delta = + ref_olap_table->get_incremental_delta(Version(missing_version, missing_version)); + if (incremental_delta != nullptr) { + OLAP_LOG_DEBUG("success to find missing version when snapshot, " + "begin to link files. [table=%ld schema_hash=%d version=%ld]", + request.tablet_id, request.schema_hash, missing_version); + // link files + for (uint32_t i = 0; i < incremental_delta->rowset(0).num_segments(); i++) { + int32_t rowset_id = incremental_delta->rowset(0).rowset_id(); + string from = ref_olap_table->construct_incremental_index_file_path( + Version(missing_version, missing_version), + incremental_delta->version_hash(), rowset_id, i); + string to = schema_full_path + '/' + basename(from.c_str()); + if ((res = _create_hard_link(from, to)) != OLAP_SUCCESS) { + break; + } + + from = ref_olap_table->construct_incremental_data_file_path( + Version(missing_version, missing_version), + incremental_delta->version_hash(), rowset_id, i); + to = schema_full_path + '/' + basename(from.c_str()); + if ((res = _create_hard_link(from, to)) != OLAP_SUCCESS) { + break; + } + } + + if (res != OLAP_SUCCESS) { + break; + } + + } else { + OLAP_LOG_WARNING("failed to find missing version when snapshot. " + "[table=%ld schema_hash=%d version=%ld]", + request.tablet_id, request.schema_hash, missing_version); + res = OLAP_ERR_VERSION_NOT_EXIST; + break; + } + } + + } while (0); + + ref_olap_table->release_header_lock(); + + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("failed to make incremental snapshot, try to delete the snapshot path. " + "[path=%s]", snapshot_id_path.c_str()); + + if (check_dir_existed(snapshot_id_path)) { + VLOG(3) << "remove snapshot path. 
[path=" << snapshot_id_path << "]"; + remove_all_dir(snapshot_id_path); + } + } else { + *snapshot_path = snapshot_id; + } + + return res; +} + +OLAPStatus OLAPEngine::_append_single_delta( + const TSnapshotRequest& request, OlapStore* store) { + OLAPStatus res = OLAP_SUCCESS; + string root_path = store->path(); + OLAPHeader* new_olap_header = new(nothrow) OLAPHeader(); + if (new_olap_header == NULL) { + OLAP_LOG_WARNING("fail to malloc OLAPHeader."); + return OLAP_ERR_MALLOC_ERROR; + } + + res = OlapHeaderManager::get_header(store, request.tablet_id, request.schema_hash, new_olap_header); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to create tablet from header file. [tablet_id=%ld, schema_hash=%d]", + request.tablet_id, request.schema_hash); + return res; + } + auto tablet = OLAPTable::create_from_header(new_olap_header, store); if (tablet == NULL) { - OLAP_LOG_WARNING("fail to create tablet from header file. [header_path='%s']", - header_path.c_str()); + OLAP_LOG_WARNING("fail to load tablet. [res=%d tablet_id='%ld, schema_hash=%d']", + res, request.tablet_id, request.schema_hash); return OLAP_ERR_INPUT_PARAMETER_ERROR; } res = tablet->load(); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to load tablet. [res=%d header_path='%s']", - res, header_path.c_str()); + LOG(WARNING) << "fail to load tablet. [res=" << res << " header_path=" << store->path(); return res; } - const FileVersionMessage* latest_version = tablet->latest_version(); - if (latest_version->start_version() != request.version) { + const PDelta* lastest_version = tablet->lastest_version(); + if (lastest_version->start_version() != request.version) { TPushReq empty_push; empty_push.tablet_id = request.tablet_id; empty_push.schema_hash = request.schema_hash; @@ -448,8 +591,7 @@ OLAPStatus OLAPSnapshot::_append_single_delta(const TSnapshotRequest& request, c empty_push.version_hash = 0; PushHandler handler; - SmartOLAPTable smart_tablet(tablet); - res = handler.process(smart_tablet, empty_push, PUSH_NORMAL, NULL); + res = handler.process(tablet, empty_push, PUSH_NORMAL, NULL); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to push empty version. 
[res=%d version=%d]", res, empty_push.version); @@ -460,23 +602,23 @@ OLAPStatus OLAPSnapshot::_append_single_delta(const TSnapshotRequest& request, c return res; } -string OLAPSnapshot::_construct_index_file_path( - const string& header_path, +string OLAPEngine::_construct_index_file_path( + const string& tablet_path_prefix, const Version& version, VersionHash version_hash, - uint32_t segment) const { - return OLAPTable::construct_file_path(header_path, version, version_hash, segment, "idx"); + int32_t rowset_id, int32_t segment) const { + return OLAPTable::construct_file_path(tablet_path_prefix, version, version_hash, rowset_id, segment, "idx"); } -string OLAPSnapshot::_construct_data_file_path( - const string& header_path, +string OLAPEngine::_construct_data_file_path( + const string& tablet_path_prefix, const Version& version, VersionHash version_hash, - uint32_t segment) const { - return OLAPTable::construct_file_path(header_path, version, version_hash, segment, "dat"); + int32_t rowset_id, int32_t segment) const { + return OLAPTable::construct_file_path(tablet_path_prefix, version, version_hash, rowset_id, segment, "dat"); } -OLAPStatus OLAPSnapshot::_create_hard_link(const string& from_path, const string& to_path) { +OLAPStatus OLAPEngine::_create_hard_link(const string& from_path, const string& to_path) { if (link(from_path.c_str(), to_path.c_str()) == 0) { OLAP_LOG_TRACE("success to create hard link from path=%s to path=%s]", from_path.c_str(), to_path.c_str()); @@ -488,11 +630,16 @@ OLAPStatus OLAPSnapshot::_create_hard_link(const string& from_path, const string } } -OLAPStatus OLAPSnapshot::storage_medium_migrate( +OLAPStatus OLAPEngine::storage_medium_migrate( TTabletId tablet_id, TSchemaHash schema_hash, TStorageMedium::type storage_medium) { + OLAP_LOG_INFO("begin to process storage media migrate. " + "[tablet_id=%ld schema_hash=%d dest_storage_medium=%d]", + tablet_id, schema_hash, storage_medium); + PaloMetrics::storage_migrate_requests_total.increment(1); + OLAPStatus res = OLAP_SUCCESS; - SmartOLAPTable tablet = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); + OLAPTablePtr tablet = get_table(tablet_id, schema_hash); if (tablet.get() == NULL) { OLAP_LOG_WARNING("can't find olap table. [tablet_id=%ld schema_hash=%d]", tablet_id, schema_hash); @@ -500,18 +647,14 @@ OLAPStatus OLAPSnapshot::storage_medium_migrate( } // judge case when no need to migrate - uint32_t count = OLAPRootPath::get_instance()->available_storage_medium_type_count(); + uint32_t count = available_storage_medium_type_count(); if (count <= 1) { OLAP_LOG_INFO("available storage medium type count is less than 1, " "no need to migrate. [count=%u]", count); return OLAP_SUCCESS; } - TStorageMedium::type src_storage_medium = TStorageMedium::HDD; - if (OLAPRootPath::is_ssd_disk(tablet->storage_root_path_name())) { - src_storage_medium = TStorageMedium::SSD; - } - + TStorageMedium::type src_storage_medium = tablet->store()->storage_medium(); if (src_storage_medium == storage_medium) { OLAP_LOG_INFO("tablet is already on specified storage medium. 
" "[storage_medium='%d']", storage_medium); @@ -524,15 +667,15 @@ OLAPStatus OLAPSnapshot::storage_medium_migrate( do { // get all versions to be migrate tablet->obtain_header_rdlock(); - const FileVersionMessage* latest_version = tablet->latest_version(); - if (latest_version == NULL) { + const PDelta* lastest_version = tablet->lastest_version(); + if (lastest_version == NULL) { tablet->release_header_lock(); res = OLAP_ERR_VERSION_NOT_EXIST; OLAP_LOG_WARNING("tablet has not any version."); break; } - int32_t end_version = latest_version->end_version(); + int32_t end_version = lastest_version->end_version(); tablet->acquire_data_sources(Version(0, end_version), &olap_data_sources); if (olap_data_sources.size() == 0) { tablet->release_header_lock(); @@ -547,23 +690,22 @@ OLAPStatus OLAPSnapshot::storage_medium_migrate( tablet->release_header_lock(); // generate schema hash path where files will be migrated - vector root_path_vec; - OLAPRootPath::get_instance()->get_root_path_for_create_table(storage_medium, &root_path_vec); - if (root_path_vec.size() == 0) { + auto stores = get_stores_for_create_table(storage_medium); + if (stores.empty()) { res = OLAP_ERR_INVALID_ROOT_PATH; OLAP_LOG_WARNING("fail to get root path for create tablet."); break; } uint64_t shard = 0; - res = OLAPRootPath::get_instance()->get_root_path_shard(root_path_vec[0], &shard); + res = stores[0]->get_shard(&shard); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to get root path shard. [res=%d]", res); break; } stringstream root_path_stream; - root_path_stream << root_path_vec[0] << DATA_PREFIX << "/" << shard; + root_path_stream << stores[0]->path() << DATA_PREFIX << "/" << shard; string schema_hash_path = _get_schema_hash_full_path(tablet, root_path_stream.str()); if (check_dir_existed(schema_hash_path)) { OLAP_LOG_DEBUG("schema hash path already exist, remove it. [schema_hash_path='%s']", @@ -573,23 +715,32 @@ OLAPStatus OLAPSnapshot::storage_medium_migrate( create_dirs(schema_hash_path); // migrate all index and data files but header file - string new_header_path = _get_header_full_path(tablet, schema_hash_path); - res = _copy_index_and_data_files(new_header_path, tablet, version_entity_vec); + res = _copy_index_and_data_files(schema_hash_path, tablet, version_entity_vec); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to copy index and data files when migrate. [res=%d]", res); break; } // generate new header file from the old - res = _generate_new_header(tablet, new_header_path, version_entity_vec); + OLAPHeader* new_olap_header = new(std::nothrow) OLAPHeader(); + if (new_olap_header == NULL) { + OLAP_LOG_WARNING("new olap header failed"); + return OLAP_ERR_BUFFER_OVERFLOW; + } + res = _generate_new_header(stores[0], shard, tablet, version_entity_vec, new_olap_header); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to generate new header file from the old. [res=%d]", res); break; } // load the new tablet into OLAPEngine - res = OLAPEngine::get_instance()->load_one_tablet( - tablet_id, schema_hash, schema_hash_path); + auto olap_table = OLAPTable::create_from_header(new_olap_header, stores[0]); + if (olap_table == NULL) { + OLAP_LOG_WARNING("failed to create from header"); + res = OLAP_ERR_TABLE_CREATE_FROM_HEADER_ERROR; + break; + } + res = add_table(tablet_id, schema_hash, olap_table); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to add tablet to OLAPEngine. 
[res=%d]", res); break; @@ -597,15 +748,14 @@ OLAPStatus OLAPSnapshot::storage_medium_migrate( // if old table finished schema change, then the schema change status of the new table is DONE // else the schema change status of the new table is FAILED - SmartOLAPTable new_tablet = - OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); + OLAPTablePtr new_tablet = get_table(tablet_id, schema_hash); if (new_tablet.get() == NULL) { OLAP_LOG_WARNING("get null olap table. [tablet_id=%ld schema_hash=%d]", tablet_id, schema_hash); return OLAP_ERR_TABLE_NOT_FOUND; } SchemaChangeStatus tablet_status = tablet->schema_change_status(); - if (tablet->schema_change_status().status == AlterTableStatus::ALTER_TABLE_DONE) { + if (tablet->schema_change_status().status == AlterTableStatus::ALTER_TABLE_FINISHED) { new_tablet->set_schema_change_status(tablet_status.status, tablet_status.schema_hash, tablet_status.version); @@ -622,39 +772,37 @@ OLAPStatus OLAPSnapshot::storage_medium_migrate( return res; } -OLAPStatus OLAPSnapshot::_generate_new_header( - const SmartOLAPTable& tablet, - const string& new_header_path, - const vector& version_entity_vec) { - OLAPStatus res = OLAP_SUCCESS; - OLAPHeader* new_olap_header = NULL; - - { - AutoRWLock auto_lock(tablet->get_header_lock_ptr(), true); - new_olap_header = new(nothrow) OLAPHeader(tablet->header_file_name()); - if (new_olap_header == NULL) { - OLAP_LOG_WARNING("fail to malloc OLAPHeader. [size=%d]", sizeof(OLAPHeader)); - return OLAP_ERR_MALLOC_ERROR; - } - - res = new_olap_header->load(); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to load header. [res=%d header_file=%s]", - res, tablet->header_file_name().c_str()); - SAFE_DELETE(new_olap_header); - return res; - } +OLAPStatus OLAPEngine::_generate_new_header( + OlapStore* store, + const uint64_t new_shard, + const OLAPTablePtr& tablet, + const vector& version_entity_vec, OLAPHeader* new_olap_header) { + if (store == nullptr) { + LOG(WARNING) << "fail to generate new header for store is null"; + return OLAP_ERR_HEADER_INIT_FAILED; } + OLAPStatus res = OLAP_SUCCESS; + + OlapStore* ref_store = + OLAPEngine::get_instance()->get_store(tablet->storage_root_path_name()); + OlapHeaderManager::get_header(ref_store, tablet->tablet_id(), tablet->schema_hash(), new_olap_header); + _update_header_file_info(version_entity_vec, new_olap_header); + new_olap_header->set_shard(new_shard); _update_header_file_info(version_entity_vec, new_olap_header); - res = new_olap_header->save(new_header_path); + res = OlapHeaderManager::save(store, tablet->tablet_id(), tablet->schema_hash(), new_olap_header); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to save olap header to new path. [res=%d new_header_path='%s']", - res, new_header_path.c_str()); + OLAP_LOG_WARNING("fail to save olap header to new db. [res=%d]", res); + return res; } - SAFE_DELETE(new_olap_header); + // delete old header + // TODO: make sure atomic update + OlapHeaderManager::remove(ref_store, tablet->tablet_id(), tablet->schema_hash()); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to delete olap header to old db. res=" << res; + } return res; } diff --git a/be/src/olap/olap_snapshot.h b/be/src/olap/olap_snapshot.h deleted file mode 100644 index 6c959f2a98..0000000000 --- a/be/src/olap/olap_snapshot.h +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef BDG_PALO_BE_SRC_OLAP_OLAP_SNAPSHOT_H -#define BDG_PALO_BE_SRC_OLAP_OLAP_SNAPSHOT_H - -#include -#include -#include -#include -#include - -#include "olap/field.h" -#include "olap/olap_define.h" -#include "olap/olap_header.h" -#include "olap/olap_table.h" -#include "olap/utils.h" - -namespace palo { - -class OLAPSnapshot { - DECLARE_SINGLETON(OLAPSnapshot) -public: - - // @brief 创建snapshot - // @param tablet_id [in] 原表的id - // @param schema_hash [in] 原表的schema,与tablet_id傿•°åˆèµ·æ¥å”¯ä¸€ç¡®å®šä¸€å¼ è¡¨ - // @param snapshot_path [out] 新生æˆçš„snapshot的路径 - OLAPStatus make_snapshot( - const TSnapshotRequest& request, - std::string* snapshot_path); - - // @brief 释放snapshot - // @param snapshot_path [in] è¦è¢«é‡Šæ”¾çš„snapshot的路径,åªåŒ…å«åˆ°ID - OLAPStatus release_snapshot(const std::string& snapshot_path); - - // @brief è¿ç§»æ•°æ®ï¼Œä»Žä¸€ç§å­˜å‚¨ä»‹è´¨åˆ°å¦ä¸€ç§å­˜å‚¨ä»‹è´¨ - OLAPStatus storage_medium_migrate( - TTabletId tablet_id, - TSchemaHash schema_hash, - TStorageMedium::type storage_medium); - -private: - - OLAPStatus _calc_snapshot_id_path( - const SmartOLAPTable& olap_table, - std::string* out_path); - - std::string _get_schema_hash_full_path( - const SmartOLAPTable& ref_olap_table, - const std::string& location) const; - - std::string _get_header_full_path( - const SmartOLAPTable& ref_olap_table, - const std::string& schema_hash_path) const; - - void _update_header_file_info( - const std::vector& shortest_version_entity, - OLAPHeader* olap_header); - - OLAPStatus _link_index_and_data_files( - const std::string& header_path, - const SmartOLAPTable& ref_olap_table, - const std::vector& version_entity_vec); - - OLAPStatus _copy_index_and_data_files( - const std::string& header_path, - const SmartOLAPTable& ref_olap_table, - std::vector& version_entity_vec); - - OLAPStatus _create_snapshot_files( - const SmartOLAPTable& ref_olap_table, - const TSnapshotRequest& request, - std::string* snapshot_path); - - OLAPStatus _append_single_delta( - const TSnapshotRequest& request, - const std::string& header_path); - - std::string _construct_index_file_path( - const std::string& header_path, - const Version& version, - VersionHash version_hash, - uint32_t segment) const; - - std::string _construct_data_file_path( - const std::string& header_path, - const Version& version, - VersionHash version_hash, - uint32_t segment) const; - - OLAPStatus _generate_new_header( - const SmartOLAPTable& tablet, - const std::string& new_header_path, - const std::vector& version_entity_vec); - - OLAPStatus _create_hard_link(const std::string& from_path, const std::string& to_path); - - MutexLock _mutex; - uint64_t _base_id; - - DISALLOW_COPY_AND_ASSIGN(OLAPSnapshot); -}; // class OLAPSnapshot - -} // namespace palo - -#endif // BDG_PALO_BE_SRC_OLAP_OLAP_SNAPSHOT_H diff --git a/be/src/olap/olap_table.cpp b/be/src/olap/olap_table.cpp index 7b2e78466a..84cb8545f9 100644 --- a/be/src/olap/olap_table.cpp +++ b/be/src/olap/olap_table.cpp @@ -31,11 +31,17 @@ #include "olap/olap_define.h" #include "olap/olap_engine.h" #include "olap/olap_index.h" -#include "olap/olap_rootpath.h" +#include 
"olap/rowset.h" #include "olap/reader.h" +#include "olap/store.h" #include "olap/row_cursor.h" #include "util/defer_op.h" +#include "olap/olap_header_manager.h" +#include "olap/olap_engine.h" +#include "olap/utils.h" +#include "olap/writer.h" +using std::pair; using std::map; using std::nothrow; using std::set; @@ -47,53 +53,62 @@ using boost::filesystem::path; namespace palo { -OLAPTable* OLAPTable::create_from_header_file( - TTabletId tablet_id, TSchemaHash schema_hash, const string& header_file) { +OLAPTablePtr OLAPTable::create_from_header_file( + TTabletId tablet_id, TSchemaHash schema_hash, + const string& header_file, OlapStore* store) { OLAPHeader* olap_header = NULL; - OLAPTable* olap_table = NULL; - olap_header = new(nothrow) OLAPHeader(header_file); if (olap_header == NULL) { - OLAP_LOG_WARNING("fail to malloc OLAPHeader."); + LOG(WARNING) << "fail to malloc OLAPHeader."; return NULL; } - if (olap_header->load() != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to load header. [header_file=%s]", header_file.c_str()); + if (olap_header->load_and_init() != OLAP_SUCCESS) { + LOG(WARNING) << "fail to load header. header_file=" << header_file; delete olap_header; return NULL; } - if (olap_header->data_file_type() == OLAP_DATA_FILE) { - if (config::default_num_rows_per_data_block != olap_header->num_rows_per_data_block()) { - olap_header->set_num_rows_per_data_block(config::default_num_rows_per_data_block); - olap_header->save(); - } - } + // add new fields + olap_header->set_tablet_id(tablet_id); + olap_header->set_schema_hash(schema_hash); + path header_file_path(header_file); + std::string shard_path = header_file_path.parent_path().parent_path().parent_path().string(); + std::string shard_str = shard_path.substr(shard_path.find_last_of('/') + 1); + uint64_t shard = stol(shard_str); + olap_header->set_shard(shard); - olap_table = new(nothrow) OLAPTable(olap_header); + // save header info to kv db + // header key format: tablet_id + "_" + schema_hash + OLAPStatus s = OlapHeaderManager::save(store, tablet_id, schema_hash, olap_header); + if (s != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to save header to db. [header_file=%s]", header_file.c_str()); + delete olap_header; + return NULL; + } + return create_from_header(olap_header, store); +} + +OLAPTablePtr OLAPTable::create_from_header( + OLAPHeader* header, + OlapStore* store) { + auto olap_table = std::make_shared(header, store); if (olap_table == NULL) { - OLAP_LOG_WARNING("fail to validate table. [header_file=%s]", header_file.c_str()); - delete olap_header; - return NULL; + LOG(WARNING) << "fail to malloc a table."; + return nullptr; } - olap_table->_tablet_id = tablet_id; - olap_table->_schema_hash = schema_hash; - stringstream full_name_stream; - full_name_stream << tablet_id << "." << schema_hash; - olap_table->_full_name = full_name_stream.str(); - return olap_table; } -OLAPTable::OLAPTable(OLAPHeader* header) : +OLAPTable::OLAPTable(OLAPHeader* header, OlapStore* store) : _header(header), _is_dropped(false), _num_fields(0), _num_null_fields(0), _num_key_fields(0), _id(0), + _store(store), _is_loaded(false) { if (header == NULL) { return; // for convenience of mock test. 
@@ -170,28 +185,16 @@ OLAPTable::OLAPTable(OLAPHeader* header) : } } - if (header->file_version_size() > 0) { - int32_t start_version = 0; - int32_t end_version = 0; - - // èŽ·å–æ‰€æœ‰å¯ä»¥æŸ¥è¯¢çš„æœ‰æ•ˆç‰ˆæœ¬ - for (int i = 0; i < header->file_version_size(); i++) { - Version version(header->file_version(i).start_version(), - header->file_version(i).end_version()); - if (version.first == 0 && version.second > 0) { - start_version = version.second; - } else if (version.first == version.second && version.second > end_version) { - end_version = version.second; - } - } - - // 考虑一个Delta都没有的特殊情况 - if (end_version == 0) { - end_version = start_version; - } - } - - _set_storage_root_path_name(); + _num_rows_per_row_block = header->num_rows_per_data_block(); + _compress_kind = header->compress_kind(); + std::stringstream tablet_path_stream; + _tablet_id = header->tablet_id(); + _schema_hash = header->schema_hash(); + tablet_path_stream << store->path() << DATA_PREFIX << "/" << header->shard(); + tablet_path_stream << "/" << _tablet_id << "/" << _schema_hash; + _tablet_path = tablet_path_stream.str(); + _storage_root_path = store->path(); + _full_name = std::to_string(header->tablet_id()) + "." + std::to_string(header->schema_hash()); } OLAPTable::~OLAPTable() { @@ -199,34 +202,52 @@ OLAPTable::~OLAPTable() { return; // for convenience of mock test. } - // ensure that there is nobody using OLAPTable, like acquiring OLAPData(OLAPIndex) + // ensure that there is nobody using OLAPTable, like acquiring OLAPData(Rowset) obtain_header_wrlock(); - for (version_olap_index_map_t::iterator it = _data_sources.begin(); - it != _data_sources.end(); ++it) { - SAFE_DELETE(it->second); - it->second = NULL; + for (auto& it : _data_sources) { + for (Rowset* rowset : it.second) { + SAFE_DELETE(rowset); + } } _data_sources.clear(); + + // clear the transactions in memory + for (auto& it : _pending_data_sources) { + // false means can't remove the transaction from header, also prevent the loading of tablet + for (Rowset* rowset : it.second) { + OLAPEngine::get_instance()->delete_transaction( + rowset->partition_id(), rowset->transaction_id(), + _tablet_id, _schema_hash, false); + SAFE_DELETE(rowset); + } + } + _pending_data_sources.clear(); release_header_lock(); - path path_name(_header->file_name()); SAFE_DELETE(_header); // 移动数æ®ç›®å½• if (_is_dropped) { - path table_path = path_name.parent_path(); + LOG(INFO) << "drop table:" << full_name() << ", tablet path:" << _tablet_path; + path table_path(_tablet_path); + std::string header_path = _tablet_path + "/" + std::to_string(_tablet_id) + ".hdr"; + OLAPStatus s = OlapHeaderManager::dump_header(_store, _tablet_id, _schema_hash, header_path); + LOG(INFO) << "dump header to path:" << header_path << ", status:" << s; + LOG(INFO) << "start to remove tablet header:" << full_name(); + s = OlapHeaderManager::remove(_store, _tablet_id, _schema_hash); + LOG(INFO) << "finish remove tablet header:" << full_name() << ", res:" << s; if (move_to_trash(table_path, table_path) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to delete table. [table_path=%s]", table_path.c_str()); + LOG(WARNING) << "fail to delete table. 
[table_path=" << _tablet_path << "]"; } + LOG(INFO) << "finish drop table:" << full_name(); } } OLAPStatus OLAPTable::load() { OLAPStatus res = OLAP_SUCCESS; - AutoMutexLock l(&_load_lock); + MutexLock l(&_load_lock); - path header_file_path(_header->file_name()); - string one_schema_root = header_file_path.parent_path().string(); + string one_schema_root = _tablet_path; set files; set index_files; set data_files; @@ -236,15 +257,22 @@ OLAPStatus OLAPTable::load() { } res = dir_walk(one_schema_root, NULL, &files); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to walk schema root dir. [res=%d root='%s']", - res, one_schema_root.c_str()); + // Disk Failure will triggered delete file in disk. + // IOError will drop object. File only deleted upon restart. + // TODO. Tablet should has a state to report to FE, delete tablet + // request will get from FE. + if (res == OLAP_ERR_DISK_FAILURE) { + LOG(WARNING) << "fail to walk schema root dir." + << "res=" << res << ", root=" << one_schema_root; goto EXIT; + } else if (res != OLAP_SUCCESS) { + OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash(), true); + return res; } - res = load_indices(); + if (res != OLAP_SUCCESS) { - OLAP_LOG_FATAL("fail to load indices. [res=%d table='%s']", res, _full_name.c_str()); + LOG(FATAL) << "fail to load indices. [res=" << res << " table='" << _full_name << "']"; goto EXIT; } @@ -254,10 +282,10 @@ OLAPStatus OLAPTable::load() { list_data_files(&data_files); if (remove_unused_files(one_schema_root, files, - header_file_path.filename().string(), + "", index_files, data_files) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to remove unused files. [root='%s']", one_schema_root.c_str()); + LOG(WARNING) << "fail to remove unused files. [root='" << one_schema_root << "']"; } release_header_lock(); @@ -273,74 +301,64 @@ EXIT: OLAPStatus OLAPTable::load_indices() { OLAPStatus res = OLAP_SUCCESS; - obtain_header_rdlock(); + ReadLock rdlock(&_header_lock); OLAPHeader* header = _header; - OLAP_LOG_DEBUG("begin to load indices. [version_size=%d table='%s']", - header->file_version_size(), - full_name().c_str()); + VLOG(3) << "begin to load indices. table=" << full_name() << ", " + << "version_size=" << header->file_delta_size(); - for (int i = 0; i < header->file_version_size(); i++) { + for (int delta_id = 0; delta_id < header->delta_size(); ++delta_id) { + const PDelta& delta = header->delta(delta_id); Version version; - version.first = header->file_version(i).start_version(); - version.second = header->file_version(i).end_version(); - OLAPIndex* index = new(nothrow) OLAPIndex(this, - version, - header->file_version(i).version_hash(), - false, - header->file_version(i).num_segments(), - header->file_version(i).creation_time()); - if (index == NULL) { - OLAP_LOG_WARNING("fail to create olap index. [version='%d-%d' table='%s']", - version.first, - version.second, - full_name().c_str()); - release_header_lock(); - return OLAP_ERR_MALLOC_ERROR; - } + version.first = delta.start_version(); + version.second = delta.end_version(); + for (int j = 0; j < delta.rowset_size(); ++j) { + const PRowSet& prowset = delta.rowset(j); + Rowset* rowset = new Rowset(this, version, delta.version_hash(), + false, prowset.rowset_id(), prowset.num_segments()); + if (rowset == nullptr) { + LOG(WARNING) << "fail to create olap rowset. 
[version='" << version.first + << "-" << version.second << "' table='" << full_name() << "']"; + return OLAP_ERR_MALLOC_ERROR; + } - // åœ¨æ ¡éªŒå’ŒåŠ è½½ç´¢å¼•å‰æŠŠindex放到data-source,以防止加载索引失败造æˆå†…存泄露 - _data_sources[version] = index; - // 判断indexæ˜¯å¦æ­£å¸¸, 在所有版本的都检查完æˆä¹‹åŽæ‰åŠ è½½æ‰€æœ‰ç‰ˆæœ¬çš„index - if (index->validate() != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to validate index. [version='%d-%d' version_hash=%ld]", - version.first, - version.second, - header->file_version(i).version_hash()); - // 现在åªè¦ä¸€ä¸ªindex没有被正确加载,整个table加载失败 - release_header_lock(); - return OLAP_ERR_TABLE_INDEX_VALIDATE_ERROR; - } + if (prowset.has_empty()) { + rowset->set_empty(prowset.empty()); + } + // åœ¨æ ¡éªŒå’ŒåŠ è½½ç´¢å¼•å‰æŠŠrowset放到data-source,以防止加载索引失败造æˆå†…存泄露 + _data_sources[version].push_back(rowset); - if (header->file_version(i).has_delta_pruning()) { - if (_num_key_fields != header->file_version(i).delta_pruning().column_pruning_size()) { - OLAP_LOG_WARNING("column pruning size is error." - "[column_pruning_size=%d, num_key_fields=%d]", - header->file_version(i).delta_pruning().column_pruning_size(), - _num_key_fields); - release_header_lock(); + // 判断rowsetæ˜¯å¦æ­£å¸¸, 在所有版本的都检查完æˆä¹‹åŽæ‰åŠ è½½æ‰€æœ‰ç‰ˆæœ¬çš„rowset + if (rowset->validate() != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to validate rowset. [version='%d-%d' version_hash=%ld]", + version.first, + version.second, + header->delta(delta_id).version_hash()); + // 现在åªè¦ä¸€ä¸ªrowset没有被正确加载,整个table加载失败 return OLAP_ERR_TABLE_INDEX_VALIDATE_ERROR; } - std::vector > \ - column_statistics_string(_num_key_fields); - std::vector null_flags(_num_key_fields); - for (size_t j = 0; j < _num_key_fields; ++j) { - ColumnPruning column_pruning = - header->file_version(i).delta_pruning().column_pruning(j); - column_statistics_string[j].first = column_pruning.min(); - column_statistics_string[j].second = column_pruning.max(); - if (column_pruning.has_null_flag()) { - null_flags[j] = column_pruning.null_flag(); - } else { - null_flags[j] = false; + if (prowset.column_pruning_size() != 0) { + size_t column_pruning_size = prowset.column_pruning_size(); + if (_num_key_fields != column_pruning_size) { + LOG(ERROR) << "column pruning size is error." + << "column_pruning_size=" << column_pruning_size << ", " + << "num_key_fields=" << _num_key_fields; + return OLAP_ERR_TABLE_INDEX_VALIDATE_ERROR; } - } - - res = index->set_column_statistics_from_string(column_statistics_string, null_flags); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to set column statistics. [res=%d]", res); - release_header_lock(); - return OLAP_ERR_TABLE_INDEX_VALIDATE_ERROR; + std::vector > \ + column_statistic_strings(_num_key_fields); + std::vector null_vec(_num_key_fields); + for (size_t j = 0; j < _num_key_fields; ++j) { + ColumnPruning column_pruning = prowset.column_pruning(j); + column_statistic_strings[j].first = column_pruning.min(); + column_statistic_strings[j].second = column_pruning.max(); + if (column_pruning.has_null_flag()) { + null_vec[j] = column_pruning.null_flag(); + } else { + null_vec[j] = false; + } + } + RETURN_NOT_OK(rowset->add_column_statistics(column_statistic_strings, null_vec)); } } } @@ -348,40 +366,39 @@ OLAPStatus OLAPTable::load_indices() { for (version_olap_index_map_t::const_iterator it = _data_sources.begin(); it != _data_sources.end(); ++it) { Version version = it->first; - OLAPIndex* index = it->second; + for (Rowset* rowset : it->second) { + if ((res = rowset->load()) != OLAP_SUCCESS) { + LOG(WARNING) << "fail to load rowset. 
version=" << version.first << "-" << version.second << ", " + << "version_hash=" << rowset->version_hash(); + // 现在åªè¦ä¸€ä¸ªrowset没有被正确加载,整个table加载失败 + return res; + } - if ((res = index->load()) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to load index. [version='%d-%d' version_hash=%ld]", - version.first, - version.second, - index->version_hash()); - // 现在åªè¦ä¸€ä¸ªindex没有被正确加载,整个table加载失败 - release_header_lock(); - return res; + VLOG(3) << "load Rowset success. table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second << ", " + << "version_hash=" << rowset->version_hash() << ", " + << "num_segments=" << rowset->num_segments(); } - - OLAP_LOG_DEBUG("load OLAPIndex success. " - "[version='%d-%d' version_hash=%ld num_segments=%d table='%s']", - version.first, version.second, - index->version_hash(), - index->num_segments(), - full_name().c_str()); } - // check if it was doing schema change. - // TODO(zyh) - release_header_lock(); return OLAP_SUCCESS; } +OLAPStatus OLAPTable::save_header() { + OLAPStatus res = OlapHeaderManager::save(_store, _tablet_id, _schema_hash, _header); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to save header. [res=" << res << " root=" << _storage_root_path << "]"; + } + + return res; +} + OLAPStatus OLAPTable::select_versions_to_span( const Version& version, vector* span_versions) const { OLAPStatus res = _header->select_versions_to_span(version, span_versions); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to generate shortest version path. [version='%d-%d' table='%s']", - version.first, - version.second, - full_name().c_str()); + LOG(WARNING) << "fail to generate shortest version path. [version='" << version.first + << "-" << version.second << "' table='" << full_name() << "']"; } return res; } @@ -390,10 +407,8 @@ void OLAPTable::acquire_data_sources(const Version& version, vector* sou vector span_versions; if (_header->select_versions_to_span(version, &span_versions) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to generate shortest version path. [version='%d-%d' table='%s']", - version.first, - version.second, - full_name().c_str()); + LOG(WARNING) << "fail to generate shortest version path. [version='" << version.first + << "-" << version.second << "' table='" << full_name() << "']"; return; } @@ -404,8 +419,7 @@ void OLAPTable::acquire_data_sources(const Version& version, vector* sou void OLAPTable::acquire_data_sources_by_versions(const vector& version_list, vector* sources) const { if (sources == NULL) { - OLAP_LOG_WARNING("output parameter for data sources is null. [table='%s']", - full_name().c_str()); + LOG(WARNING) << "output parameter for data sources is null. table=" << full_name(); return; } @@ -417,41 +431,36 @@ void OLAPTable::acquire_data_sources_by_versions(const vector& version_ it1 != version_list.end(); ++it1) { version_olap_index_map_t::const_iterator it2 = _data_sources.find(*it1); if (it2 == _data_sources.end()) { - OLAP_LOG_WARNING("fail to find OLAPIndex for version. [version='%d-%d' table='%s']", - it1->first, - it1->second, - full_name().c_str()); + LOG(WARNING) << "fail to find Rowset for version. [version='" << it1->first + << "-" << it1->second << "' table='" << full_name() << "']"; release_data_sources(sources); return; } - OLAPIndex* olap_index = it2->second; - IData* olap_data = IData::create(olap_index); - if (olap_data == NULL) { - OLAP_LOG_WARNING("fail to malloc Data. 
[version='%d-%d' table='%s']", - it1->first, - it1->second, - full_name().c_str()); - release_data_sources(sources); - return; - } + for (Rowset* rowset : it2->second) { + IData* olap_data = IData::create(rowset); + if (olap_data == NULL) { + LOG(WARNING) << "fail to malloc Data. [version='" << it1->first + << "-" << it1->second << "' table='" << full_name() << "']"; + release_data_sources(sources); + return; + } - sources->push_back(olap_data); + sources->push_back(olap_data); - if (olap_data->init() != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to initial olap data. [version='%d-%d' table='%s']", - it1->first, - it1->second, - full_name().c_str()); - release_data_sources(sources); - return; + if (olap_data->init() != OLAP_SUCCESS) { + LOG(WARNING) << "fail to initial olap data. [version='" << it1->first + << "-" << it1->second << "' table='" << full_name() << "']"; + release_data_sources(sources); + return; + } } } } OLAPStatus OLAPTable::release_data_sources(vector* data_sources) const { if (data_sources == NULL) { - OLAP_LOG_WARNING("parameter data_sources is null. [table='%s']", full_name().c_str()); + LOG(WARNING) << "parameter data_sources is null. [table='" << full_name() << "']"; return OLAP_ERR_INPUT_PARAMETER_ERROR; } @@ -464,100 +473,923 @@ OLAPStatus OLAPTable::release_data_sources(vector* data_sources) const { return OLAP_SUCCESS; } -OLAPStatus OLAPTable::register_data_source(OLAPIndex* index) { +OLAPStatus OLAPTable::register_data_source(const std::vector& index_vec) { OLAPStatus res = OLAP_SUCCESS; - if (index == NULL) { - OLAP_LOG_WARNING("parameter index is null. [table='%s']", full_name().c_str()); + if (index_vec.empty()) { + LOG(WARNING) << "parameter rowset is null." + << "table=" << full_name(); return OLAP_ERR_INPUT_PARAMETER_ERROR; } - Version version = index->version(); - if (_data_sources.find(version) != _data_sources.end()) { - OLAP_LOG_WARNING("olap index for version exists. [version='%d-%d' table='%s']", - version.first, - version.second, - full_name().c_str()); - return OLAP_ERR_TABLE_VERSION_DUPLICATE_ERROR; + for (Rowset* rowset : index_vec) { + Version version = rowset->version(); + const std::vector* column_statistics = nullptr; + if (rowset->has_column_statistics()) { + column_statistics = &rowset->get_column_statistics(); + } + res = _header->add_version(version, rowset->version_hash(), rowset->rowset_id(), + rowset->num_segments(), rowset->index_size(), rowset->data_size(), + rowset->num_rows(), rowset->empty(), column_statistics); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to add version to olap header. table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second; + return res; + } + + // put the new rowset into _data_sources. + // 由于对headerçš„æ“作å¯èƒ½å¤±è´¥ï¼Œå› æ­¤å¯¹_data_sourcesè¦æ”¾åœ¨è¿™é‡Œ + _data_sources[version].push_back(rowset); + VLOG(3) << "succeed to register data source. 
table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second << ", " + << "version_hash=" << rowset->version_hash() << ", " + << "rowset_id=" << rowset->rowset_id() << ", " + << "num_segments=" << rowset->num_segments(); } - // add a reference to the data source in the header file - if (index->has_column_statistics()) { - res = _header->add_version( - version, index->version_hash(), index->num_segments(), index->max_timestamp(), - index->index_size(), index->data_size(), index->num_rows(), - &index->get_column_statistics()); - } else { - res = _header->add_version( - version, index->version_hash(), index->num_segments(), index->max_timestamp(), - index->index_size(), index->data_size(), index->num_rows()); - } - - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to add version to olap header. [version='%d-%d' table='%s']", - version.first, - version.second, - full_name().c_str()); - return res; - } - - // put the new index into _data_sources. - // 由于对headerçš„æ“作å¯èƒ½å¤±è´¥ï¼Œå› æ­¤å¯¹_data_sourcesè¦æ”¾åœ¨è¿™é‡Œ - _data_sources[version] = index; - - OLAP_LOG_DEBUG("succeed to register data source. " - "[version='%d-%d' version_hash=%ld num_segments=%d table='%s']", - version.first, - version.second, - index->version_hash(), - index->num_segments(), - full_name().c_str()); - return OLAP_SUCCESS; } -OLAPStatus OLAPTable::unregister_data_source(const Version& version, OLAPIndex** index) { +OLAPStatus OLAPTable::unregister_data_source(const Version& version, std::vector* index_vec) { OLAPStatus res = OLAP_SUCCESS; version_olap_index_map_t::iterator it = _data_sources.find(version); if (it == _data_sources.end()) { - OLAP_LOG_WARNING("olap index for version does not exists. [version='%d-%d' table='%s']", - version.first, - version.second, - full_name().c_str()); + LOG(WARNING) << "olap rowset for version does not exists. [version='" << version.first + << "-" << version.second << "' table='" << full_name() << "']"; return OLAP_ERR_VERSION_NOT_EXIST; } // delete a reference to the data source in the header file if ((res = _header->delete_version(version)) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to delete version from olap header. [version='%d-%d' table='%s']", - version.first, - version.second, - full_name().c_str()); + LOG(WARNING) << "fail to delete version from olap header. [version='" << version.first + << "-" << version.second << "' table='" << full_name() << "']"; return res; } - *index = it->second; + *index_vec = it->second; _data_sources.erase(it); - - OLAP_LOG_DEBUG("unregister data source success. " - "[version='%d-%d' version_hash=%ld num_segments=%d table='%s']", - version.first, - version.second, - (*index)->version_hash(), - (*index)->num_segments(), - full_name().c_str()); - return OLAP_SUCCESS; } +OLAPStatus OLAPTable::add_pending_version(int64_t partition_id, int64_t transaction_id, + const std::vector* delete_conditions) { + WriteLock wrlock(&_header_lock); + OLAPStatus res = _header->add_pending_version(partition_id, transaction_id, delete_conditions); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to add pending delta to header." + << "table=" << full_name() << ", " + << "transaction_id=" << transaction_id; + return res; + } + res = save_header(); + if (res != OLAP_SUCCESS) { + _header->delete_pending_delta(transaction_id); + LOG(FATAL) << "fail to save header when add pending rowset. 
[table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + return res; + } + return OLAP_SUCCESS; +} + +OLAPStatus OLAPTable::add_pending_rowset(Rowset* rowset) { + if (rowset == nullptr) { + LOG(WARNING) << "parameter rowset is null. [table=" << full_name() << "]"; + return OLAP_ERR_INPUT_PARAMETER_ERROR; + } + + int64_t transaction_id = rowset->transaction_id(); + obtain_header_wrlock(); + OLAPStatus res = OLAP_SUCCESS; + + // add to header + const std::vector* column_statistics = nullptr; + if (rowset->has_column_statistics()) { + column_statistics = &(rowset->get_column_statistics()); + } + res = _header->add_pending_rowset(transaction_id, rowset->num_segments(), + rowset->rowset_id(), rowset->load_id(), + rowset->empty(), column_statistics); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to add pending rowset to header. [table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + release_header_lock(); + return res; + } + + // save header + res = save_header(); + if (res != OLAP_SUCCESS) { + _header->delete_pending_delta(transaction_id); + LOG(FATAL) << "fail to save header when add pending rowset. [table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + release_header_lock(); + return res; + } + + // add to data sources + _pending_data_sources[transaction_id].push_back(rowset); + release_header_lock(); + VLOG(3) << "add pending data to tablet successfully." + << "table=" << full_name() << ", transaction_id=" << transaction_id; + + return res; +} + +int32_t OLAPTable::current_pending_rowset_id(int64_t transaction_id) { + ReadLock rdlock(&_header_lock); + int32_t rowset_id = -1; + if (_pending_data_sources.find(transaction_id) != _pending_data_sources.end()) { + for (Rowset* rowset : _pending_data_sources[transaction_id]) { + if (rowset->rowset_id() > rowset_id) { + rowset_id = rowset->rowset_id(); + } + } + } + return rowset_id; +} + +OLAPStatus OLAPTable::add_pending_data(Rowset* rowset, const std::vector* delete_conditions) { + if (rowset == nullptr) { + LOG(WARNING) << "parameter rowset is null. table=" << full_name(); + return OLAP_ERR_INPUT_PARAMETER_ERROR; + } + + obtain_header_wrlock(); + int64_t transaction_id = rowset->transaction_id(); + if (_pending_data_sources.find(transaction_id) != _pending_data_sources.end()) { + LOG(WARNING) << "find pending data existed when add to tablet. [table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + release_header_lock(); + return OLAP_ERR_PUSH_TRANSACTION_ALREADY_EXIST; + } + OLAPStatus res = OLAP_SUCCESS; + + // if push for delete, construct sub conditions + vector condition_strs; + if (delete_conditions != nullptr) { + DeleteConditionHandler del_cond_handler; + for (const TCondition& condition : *delete_conditions) { + condition_strs.push_back(del_cond_handler.construct_sub_conditions(condition)); + } + } + + if (!condition_strs.empty()) { + res = _header->add_pending_version(rowset->partition_id(), transaction_id, &condition_strs); + } else { + res = _header->add_pending_version(rowset->partition_id(), transaction_id, nullptr); + } + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to add pending delta to header." 
+ << "table=" << full_name() << ", " + << "transaction_id=" << transaction_id; + release_header_lock(); + return res; + } + + // add to header + const std::vector* column_statistics = nullptr; + if (rowset->has_column_statistics()) { + column_statistics = &(rowset->get_column_statistics()); + } + res = _header->add_pending_rowset(transaction_id, rowset->num_segments(), + rowset->rowset_id(), rowset->load_id(), + rowset->empty(), column_statistics); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to add pending rowset to header. [table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + release_header_lock(); + return res; + } + + // save header + res = save_header(); + if (res != OLAP_SUCCESS) { + _header->delete_pending_delta(transaction_id); + LOG(FATAL) << "fail to save header when add pending rowset. [table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + release_header_lock(); + return res; + } + + // add to data sources + _pending_data_sources[transaction_id].push_back(rowset); + release_header_lock(); + VLOG(3) << "add pending data to tablet successfully." + << "table=" << full_name() << ", transaction_id=" << transaction_id; + return res; + +} + +bool OLAPTable::has_pending_data(int64_t transaction_id) { + ReadLock rdlock(&_header_lock); + return _pending_data_sources.find(transaction_id) != _pending_data_sources.end(); +} + +void OLAPTable::delete_pending_data(int64_t transaction_id) { + obtain_header_wrlock(); + + auto it = _pending_data_sources.find(transaction_id); + if (it == _pending_data_sources.end()) { + release_header_lock(); + return; + } + + // delete from data sources + for (Rowset* rowset : it->second) { + rowset->release(); + OLAPEngine::get_instance()->add_unused_index(rowset); + } + _pending_data_sources.erase(it); + + // delete from header + _header->delete_pending_delta(transaction_id); + + // save header + if (save_header() != OLAP_SUCCESS) { + LOG(FATAL) << "failed to save header when delete pending data. [table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + } + + release_header_lock(); + LOG(INFO) << "delete pending data from tablet. [table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + +} + +void OLAPTable::get_expire_pending_data(vector* transaction_ids) { + time_t now = time(NULL); + ReadLock rdlock(&_header_lock); + + for (auto& it : _header->pending_delta()) { + double diff = difftime(now, it.creation_time()); + if (diff >= config::pending_data_expire_time_sec) { + transaction_ids->push_back(it.transaction_id()); + VLOG(3) << "find expire pending data. table=" << full_name() << ", " + << "transaction_id=" << it.transaction_id() << " exist_sec=" << diff; + } + } +} + +void OLAPTable::load_pending_data() { + LOG(INFO) << "begin to load pending_data. 
table=" << full_name() << ", " + << "pending_delta size=" << _header->pending_delta_size(); + MutexLock load_lock(&_load_lock); + + // if a olap rowset loads failed, delete it from header + std::set error_pending_data; + + for (const PPendingDelta& pending_delta : _header->pending_delta()) { + for (const PPendingRowSet& pending_rowset : pending_delta.pending_rowset()) { + Rowset* rowset = new Rowset(this, false, pending_rowset.pending_rowset_id(), + pending_rowset.num_segments(), true, + pending_delta.partition_id(), pending_delta.transaction_id()); + DCHECK(rowset != nullptr); + rowset->set_load_id(pending_rowset.load_id()); + if (pending_rowset.has_empty()) { + rowset->set_empty(pending_rowset.empty()); + } + _pending_data_sources[rowset->transaction_id()].push_back(rowset); + + if (rowset->validate() != OLAP_SUCCESS) { + LOG(WARNING) << "fail to validate rowset when load pending data." + << "table=" << full_name() << ", " + << "transaction_id=" << rowset->transaction_id(); + error_pending_data.insert(rowset->transaction_id()); + break; + } + + if (_num_key_fields != pending_rowset.column_pruning_size()) { + LOG(WARNING) << "column pruning size is error when load pending data." + << "column_pruning_size=" << pending_rowset.column_pruning_size() << ", " + << "num_key_fields=" << _num_key_fields; + error_pending_data.insert(rowset->transaction_id()); + break; + } + std::vector> column_statistics_string(_num_key_fields); + std::vector null_vec(_num_key_fields); + for (size_t j = 0; j < _num_key_fields; ++j) { + ColumnPruning column_pruning = pending_rowset.column_pruning(j); + column_statistics_string[j].first = column_pruning.min(); + column_statistics_string[j].second = column_pruning.max(); + if (column_pruning.has_null_flag()) { + null_vec[j] = column_pruning.null_flag(); + } else { + null_vec[j] = false; + } + } + + if (rowset->add_column_statistics(column_statistics_string, null_vec) != OLAP_SUCCESS) { + LOG(WARNING) << "fail to set column statistics when load pending data"; + error_pending_data.insert(pending_delta.transaction_id()); + break; + } + + if (rowset->load() != OLAP_SUCCESS) { + LOG(WARNING) << "fail to load rowset when load pending data." + << "table=" << full_name() << ", transaction_id=" << pending_delta.transaction_id(); + error_pending_data.insert(pending_delta.transaction_id()); + break; + } + + OLAPStatus add_status = OLAPEngine::get_instance()->add_transaction( + pending_delta.partition_id(), pending_delta.transaction_id(), + _tablet_id, _schema_hash, pending_rowset.load_id()); + + if (add_status != OLAP_SUCCESS) { + LOG(WARNING) << "find transaction exists in engine when load pending data. [table=" << full_name() + << " transaction_id=" << pending_delta.transaction_id() << "]"; + error_pending_data.insert(pending_delta.transaction_id()); + break; + } + } + + if (error_pending_data.find(pending_delta.transaction_id()) != error_pending_data.end()) { + continue; + } + + VLOG(3) << "load pending data successfully. table=" << full_name() << ", " + << "partition_id=" << pending_delta.partition_id() << ", " + << "transaction_id=" << pending_delta.transaction_id(); + } + + LOG(INFO) << "finish to load pending data. table=" << full_name() << ", " + << "error_data_size=" << error_pending_data.size(); + + for (int64_t error_data : error_pending_data) { + delete_pending_data(error_data); + } +} + +// 1. need to replace local data if same version existed +// 2. move pending data to version data +// 3. 
move pending data to incremental data, it won't be merged, so we can do incremental clone +OLAPStatus OLAPTable::publish_version(int64_t transaction_id, Version version, + VersionHash version_hash) { + WriteLock wrlock(&_header_lock); + if (_pending_data_sources.find(transaction_id) == _pending_data_sources.end()) { + LOG(WARNING) << "pending data not exists in tablet, not finished or deleted." + << "table=" << full_name() << ", " + << "transaction_id=" << transaction_id; + return OLAP_ERR_TRANSACTION_NOT_EXIST; + } + RETURN_NOT_OK(_handle_existed_version(transaction_id, version, version_hash)); + std::vector index_vec; + vector linked_files; + OLAPStatus res = OLAP_SUCCESS; + for (Rowset* rowset : _pending_data_sources[transaction_id]) { + int32_t rowset_id = rowset->rowset_id(); + for (int32_t seg_id = 0; seg_id < rowset->num_segments(); ++seg_id) { + std::string pending_index_path = rowset->construct_index_file_path(rowset_id, seg_id); + std::string index_path = construct_index_file_path(version, version_hash, rowset_id, seg_id); + res = _create_hard_link(pending_index_path, index_path, &linked_files); + if (res != OLAP_SUCCESS) { remove_files(linked_files); return res; } + + std::string pending_data_path = rowset->construct_data_file_path(rowset_id, seg_id); + std::string data_path = construct_data_file_path(version, version_hash, rowset_id, seg_id); + res = _create_hard_link(pending_data_path, data_path, &linked_files); + if (res != OLAP_SUCCESS) { remove_files(linked_files); return res; } + } + + rowset->publish_version(version, version_hash); + index_vec.push_back(rowset); + } + + res = register_data_source(index_vec); + if (res != OLAP_SUCCESS) { remove_files(linked_files); return res; } + + const PPendingDelta* pending_delta = _header->get_pending_delta(transaction_id); + if (pending_delta->has_delete_condition()) { + const DeleteConditionMessage& delete_condition = pending_delta->delete_condition(); + _header->add_delete_condition(delete_condition, version.first); + } + + // add incremental version, if failed, ignore it + res = _add_incremental_data(index_vec, transaction_id, version, version_hash); + VLOG(3) << "finish to add incremental version. res=" << res << ", " + << "table=" << full_name() << ", " + << "transaction_id=" << transaction_id << ", " + << "version=" << version.first << "-" << version.second; + + // save header + res = save_header(); + if (res != OLAP_SUCCESS) { + LOG(FATAL) << "fail to save header when publish version. res=" << res << ", " + << "table=" << full_name() << ", " + << "transaction_id=" << transaction_id; + std::vector delete_index_vec; + // if failed, clear new data + unregister_data_source(version, &delete_index_vec); + _delete_incremental_data(version, version_hash); + remove_files(linked_files); + return res; + } + + _header->delete_pending_delta(transaction_id); + res = save_header(); + if (res != OLAP_SUCCESS) { + remove_files(linked_files); + LOG(FATAL) << "fail to save header when publish version. res=" << res << ", " + << "table=" << full_name() << ", " + << "transaction_id=" << transaction_id; + return res; + } + for (Rowset* rowset : _pending_data_sources[transaction_id]) { + rowset->delete_all_files(); + rowset->set_pending_finished(); + } + _pending_data_sources.erase(transaction_id); + + return res; +} + +// 1. if version is same and version_hash different, delete local data, save header +// 2. 
if version_hash is same or version is merged, publish success, delete transaction, save header +OLAPStatus OLAPTable::_handle_existed_version(int64_t transaction_id, const Version& version, + const VersionHash& version_hash) { + const PDelta* existed_delta = nullptr; + for (int i = 0; i < file_delta_size(); ++i) { + const PDelta* delta = _header->get_delta(i); + if (version.first >= delta->start_version() + && version.second <= delta->end_version()) { + existed_delta = delta; + } + + } + + if (existed_delta == nullptr) { + return OLAP_SUCCESS; + } + + OLAPStatus res = OLAP_SUCCESS; + // if version is same and version_hash different, delete local data + if (existed_delta->start_version() == version.first + && existed_delta->end_version() == version.second + && existed_delta->version_hash() != version_hash) { + LOG(INFO) << "version_hash is different when publish version, delete local data. [table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + // remove delete condition if current type is PUSH_FOR_DELETE, + // this occurs when user cancel delete_data soon after submit it + bool push_for_delete = false; + res = is_push_for_delete(transaction_id, &push_for_delete); + if (res != OLAP_SUCCESS) { + return res; + } else if (!push_for_delete) { + DeleteConditionHandler del_cond_handler; + OLAPTablePtr olap_table_ptr = + OLAPEngine::get_instance()->get_table(_tablet_id, _schema_hash); + if (olap_table_ptr.get() != nullptr) { + del_cond_handler.delete_cond(olap_table_ptr, version.first, false); + } + } + // delete local data + //Rowset *existed_index = NULL; + std::vector existed_index_vec; + _delete_incremental_data(version, version_hash); + res = unregister_data_source(version, &existed_index_vec); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to unregister data when publish version. [table=" << full_name() + << " version=" << version.first << "-" << version.second << " res=" << res << "]"; + return res; + } + // save header + res = save_header(); + if (res != OLAP_SUCCESS) { + LOG(FATAL) << "fail to save header when unregister data. [tablet=" << full_name() + << " transaction_id=" << transaction_id << "]"; + } + // use OLAPEngine to delete this rowset + if (!existed_index_vec.empty()) { + OLAPEngine *unused_index = OLAPEngine::get_instance(); + for (Rowset* rowset : existed_index_vec) { + unused_index->add_unused_index(rowset); + } + } + // if version_hash is same or version is merged, publish success + } else { + LOG(INFO) << "version_hash is same when publish version, publish success. [table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + res = OLAP_ERR_PUSH_VERSION_ALREADY_EXIST; + } + return res; +} + +OLAPStatus OLAPTable::_add_incremental_data(std::vector& index_vec, int64_t transaction_id, + const Version& version, const VersionHash& version_hash) { + if (index_vec.empty()) { + LOG(WARNING) << "no parameter when add incremental data. table=" << full_name(); + return OLAP_ERR_INPUT_PARAMETER_ERROR; + } + + // create incremental rowset's dir + std::string dir_path = construct_incremental_delta_dir_path(); + OLAPStatus res = OLAP_SUCCESS; + if (!check_dir_existed(dir_path)) { + res = create_dirs(dir_path); + if (res != OLAP_SUCCESS && !check_dir_existed(dir_path)) { + LOG(WARNING) << "fail to create rowset dir. 
table=" << full_name() << ", " + << " transaction_id=" << transaction_id; + return res; + } + } + std::vector linked_files; + for (Rowset* rowset : index_vec) { + for (int32_t seg_id = 0; seg_id < rowset->num_segments(); ++seg_id) { + int32_t rowset_id = rowset->rowset_id(); + std::string index_path = rowset->construct_index_file_path(rowset_id, seg_id); + std::string incremental_index_path = + construct_incremental_index_file_path(version, version_hash, rowset_id, seg_id); + res = _create_hard_link(index_path, incremental_index_path, &linked_files); + if (res != OLAP_SUCCESS) { remove_files(linked_files); return res; } + + std::string data_path = rowset->construct_data_file_path(rowset_id, seg_id); + std::string incremental_data_path = + construct_incremental_data_file_path(version, version_hash, rowset_id, seg_id); + res = _create_hard_link(data_path, incremental_data_path, &linked_files); + if (res != OLAP_SUCCESS) { remove_files(linked_files); return res; } + } + + const std::vector* column_statistics = nullptr; + if (rowset->has_column_statistics()) { + column_statistics = &(rowset->get_column_statistics()); + } + res = _header->add_incremental_version( + rowset->version(), rowset->version_hash(), + rowset->rowset_id(), rowset->num_segments(), + rowset->index_size(), rowset->data_size(), + rowset->num_rows(), rowset->empty(), column_statistics); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to add incremental data. res=" << res << ", " + << "table=" << full_name() << ", " + << "transaction_id=" << transaction_id << ", " + << "version=" << version.first << "-" << version.second; + remove_files(linked_files); + return res; + } + } + + return res; +} + +void OLAPTable::delete_expire_incremental_data() { + time_t now = time(NULL); + std::vector> expire_versions; + WriteLock wrlock(&_header_lock); + for (auto& it : _header->incremental_delta()) { + double diff = difftime(now, it.creation_time()); + if (diff >= config::incremental_delta_expire_time_sec) { + Version version(it.start_version(), it.end_version()); + expire_versions.push_back(std::make_pair(version, it.version_hash())); + VLOG(3) << "find expire incremental rowset. tablet=" << full_name() << ", " + << "version=" << it.start_version() << "-" << it.end_version() << ", " + << "exist_sec=" << diff; + } + } + for (auto& it : expire_versions) { + _delete_incremental_data(it.first, it.second); + VLOG(3) << "delete expire incremental data. table=" << full_name() << ", " + << "version=" << it.first.first << "-" << it.first.second; + } + + if (save_header() != OLAP_SUCCESS) { + LOG(FATAL) << "fail to save header when delete expire incremental data." 
+ << "table=" << full_name(); + } +} + +void OLAPTable::_delete_incremental_data(const Version& version, const VersionHash& version_hash) { + const PDelta* incremental_delta = get_incremental_delta(version); + if (incremental_delta == nullptr) { return; } + + vector files_to_delete; + for (const PRowSet& prowset : incremental_delta->rowset()) { + int32_t rowset_id = prowset.rowset_id(); + for (int seg_id = 0; seg_id < prowset.num_segments(); seg_id++) { + std::string incremental_index_path = + construct_incremental_index_file_path(version, version_hash, rowset_id, seg_id); + files_to_delete.emplace_back(incremental_index_path); + + std::string incremental_data_path = + construct_incremental_data_file_path(version, version_hash, rowset_id, seg_id); + files_to_delete.emplace_back(incremental_data_path); + } + } + + remove_files(files_to_delete); + _header->delete_incremental_delta(version); + VLOG(3) << "delete incremental data. table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second; +} + +void OLAPTable::get_missing_versions_with_header_locked( + int64_t until_version, std::vector* missing_versions) const { + DCHECK(until_version > 0) << "invalid until_version: " << until_version; + std::list existing_versions; + for (int i = 0; i < _header->file_delta_size(); ++i) { + const PDelta* delta = _header->get_delta(i); + existing_versions.emplace_back(delta->start_version(), delta->end_version()); + } + + // sort the existing versions in ascending order + existing_versions.sort([](const Version& a, const Version& b) { + // simple because 2 versions are certainly not overlapping + return a.first < b.first; + }); + + // find the missing version until until_version + int64_t last_version = -1; + for (const Version& version : existing_versions) { + if (version.first > last_version + 1) { + for (int64_t i = last_version + 1; i < version.first; ++i) { + missing_versions->emplace_back(i, i); + } + } + last_version = version.second; + if (until_version <= last_version) { + break; + } + } + for (int64_t i = last_version + 1; i <= until_version; ++i) { + missing_versions->emplace_back(i, i); + } +} + +const PDelta* OLAPTable::least_complete_version( + const vector& missing_versions) const { + + const PDelta* least_delta = nullptr; + if (!missing_versions.empty()) { + Version version = missing_versions.front(); + for (int i = 0; i < _header->file_delta_size(); ++i) { + const PDelta* delta = _header->get_delta(i); + if (delta->end_version() == version.first - 1) { + LOG(INFO) << "find least complete version. table=" << full_name() << ", " + << "version=" << delta->start_version() << "-" << delta->end_version() << ", " + << "version_hash=" << delta->version_hash() << ", " + << "first_missing_version=" << version.first << "-" << version.second; + least_delta = delta; + break; + } + } + } else { + least_delta = lastest_version(); + } + + return least_delta; +} + +OLAPStatus OLAPTable::is_push_for_delete( + int64_t transaction_id, bool* is_push_for_delete) const { + + const PPendingDelta* pending_delta = _header->get_pending_delta(transaction_id); + if (pending_delta == nullptr) { + LOG(WARNING) << "pending rowset not found when check push for delete. 
[table=" << full_name() + << " transaction_id=" << transaction_id << "]"; + return OLAP_ERR_TRANSACTION_NOT_EXIST; + } + *is_push_for_delete = pending_delta->has_delete_condition(); + return OLAP_SUCCESS; +} + +Rowset* OLAPTable::_construct_index_from_version(const PDelta* delta, int32_t rowset_id) { + VLOG(3) << "begin to construct rowset from version." + << "table=" << full_name() << ", " + << "version=" << delta->start_version() << "-" << delta->end_version() << ", " + << "version_hash=" << delta->version_hash(); + Version version(delta->start_version(), delta->end_version()); + const PRowSet* prowset = nullptr; + if (rowset_id == -1) { + // Previous FileVersionMessage will be convert to PDelta and PRowset. + // In PRowset, this is rowset_id is set to minus one. + // When to get it, should used rowset + 1 as index. + prowset = &(delta->rowset().Get(rowset_id + 1)); + } else { + prowset = &(delta->rowset().Get(rowset_id)); + } + Rowset* rowset = new Rowset(this, version, delta->version_hash(), + false, rowset_id, prowset->num_segments()); + if (prowset->has_empty()) { + rowset->set_empty(prowset->empty()); + } + DCHECK(rowset != nullptr) << "malloc error when construct rowset." + << "table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second << ", " + << "version_hash=" << delta->version_hash(); + OLAPStatus res = rowset->validate(); + if (res != OLAP_SUCCESS) { + SAFE_DELETE(rowset); + return nullptr; + } + + if (_num_key_fields != prowset->column_pruning_size()) { + LOG(WARNING) << "column pruning size error, " << "table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second << ", " + << "version_hash=" << delta->version_hash() << ", " + << "column_pruning_size=" << prowset->column_pruning_size() << ", " + << "num_key_fields=" << _num_key_fields; + SAFE_DELETE(rowset); + return nullptr; + } + vector> column_statistic_strings(_num_key_fields); + std::vector null_vec(_num_key_fields); + for (size_t j = 0; j < _num_key_fields; ++j) { + ColumnPruning column_pruning = prowset->column_pruning(j); + column_statistic_strings[j].first = column_pruning.min(); + column_statistic_strings[j].second = column_pruning.max(); + if (column_pruning.has_null_flag()) { + null_vec[j] = column_pruning.null_flag(); + } else { + null_vec[j] = false; + } + } + + res = rowset->add_column_statistics(column_statistic_strings, null_vec); + if (res != OLAP_SUCCESS) { + SAFE_DELETE(rowset); + return nullptr; + } + + res = rowset->load(); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to load rowset. res=" << res << ", " + << "table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second << ", " + << "version_hash=" << delta->version_hash(); + SAFE_DELETE(rowset); + return nullptr; + } + + VLOG(3) << "finish to construct rowset from version." + << "table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second; + return rowset; +} + +OLAPStatus OLAPTable::_create_hard_link(const string& from, const string& to, + vector* linked_success_files) { + if (link(from.c_str(), to.c_str()) != 0) { + LOG(WARNING) << "fail to create hard link. from=" << from << ", " + << "to=" << to << ", " << "errno=" << Errno::no(); + return OLAP_ERR_OS_ERROR; + } + linked_success_files->push_back(to); + VLOG(3) << "success to create hard link. 
[from=" << from << " to=" << to << "]"; + return OLAP_SUCCESS; +} + +OLAPStatus OLAPTable::clone_data(const OLAPHeader& clone_header, + const vector& clone_deltas, + const vector& versions_to_delete) { + LOG(INFO) << "begin to clone data to tablet. table=" << full_name() << ", " + << "clone_versions_size=" << clone_deltas.size() << ", " + << "versions_to_delete_size=" << versions_to_delete.size(); + OLAPStatus res = OLAP_SUCCESS; + version_olap_index_map_t tmp_data_sources; + + do { + // load new local header to operate on + OLAPHeader new_local_header; + OlapHeaderManager::get_header(_store, _tablet_id, _schema_hash, &new_local_header); + + // delete versions from new local header + for (const Version& version : versions_to_delete) { + res = new_local_header.delete_version(version); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "failed to delete version from new local header. [table=" << full_name() + << " version=" << version.first << "-" << version.second << "]"; + break; + } + LOG(INFO) << "delete version from new local header when clone. [table='" << full_name() + << "', version=" << version.first << "-" << version.second << "]"; + } + + if (res != OLAP_SUCCESS) { + break; + } + + for (const PDelta* clone_delta : clone_deltas) { + Version version(clone_delta->start_version(), + clone_delta->end_version()); + + // construct new rowset + for (const PRowSet& prowset : clone_delta->rowset()) { + Rowset* tmp_index = _construct_index_from_version(clone_delta, prowset.rowset_id()); + if (tmp_index == NULL) { + LOG(WARNING) << "fail to construct rowset when clone data. table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second << ", " + << "version_hash=" << clone_delta->version_hash(); + res = OLAP_ERR_INDEX_LOAD_ERROR; + break; + } + + tmp_data_sources[version].push_back(tmp_index); + + // add version to new local header + const std::vector* column_statistics = nullptr; + if (tmp_index->has_column_statistics()) { + column_statistics = &(tmp_index->get_column_statistics()); + } + res = new_local_header.add_version(version, tmp_index->version_hash(), + tmp_index->rowset_id(), + tmp_index->num_segments(), + tmp_index->index_size(), + tmp_index->data_size(), + tmp_index->num_rows(), + tmp_index->empty(), + column_statistics); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "fail to add version to new local header when clone." + << "res=" << res << ", " + << "table=" << full_name() << ", " + << "version=" << version.first << "-" << version.second << ", " + << "version_hash=" << clone_delta->version_hash(); + break; + } + } + + if (res != OLAP_SUCCESS) { break; } + + // add delete conditions to new local header, if it exists in clone_header + if (version.first == version.second) { + for (google::protobuf::RepeatedPtrField::const_iterator it + = clone_header.delete_data_conditions().begin(); + it != clone_header.delete_data_conditions().end(); ++it) { + if (it->version() == version.first) { + // add it + new_local_header.add_delete_condition(*it, version.first); + LOG(INFO) << "add delete condition when clone. [table=" << full_name() + << " version=" << it->version() << "]"; + break; + } + } + } + } + + if (res != OLAP_SUCCESS) { + break; + } + VLOG(3) << "load indices successfully when clone. 
table=" << full_name() << ", " + << "add_versions_size=" << clone_deltas.size() << ", " + << "new_indices_size=" << tmp_data_sources.size(); + // save and reload header + res = OlapHeaderManager::save(_store, _tablet_id, _schema_hash, &new_local_header); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "failed to save new local header when clone. res:" << res; + break; + } + res = OlapHeaderManager::get_header(_store, _tablet_id, _schema_hash, _header); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "failed to reload original header when clone. [table=" << full_name() + << " res=" << res << "]"; + break; + } + + } while (0); + + // if success, update local data sources + if (res == OLAP_SUCCESS) { + + // delete local data source + for (const Version& version_to_delete : versions_to_delete) { + version_olap_index_map_t::iterator it = _data_sources.find(version_to_delete); + if (it != _data_sources.end()) { + std::vector index_to_delete_vec = it->second; + _data_sources.erase(it); + OLAPEngine* unused_index = OLAPEngine::get_instance(); + for (Rowset* rowset : index_to_delete_vec) { + unused_index->add_unused_index(rowset); + } + } + } + + // add new data source + for (auto& it : tmp_data_sources) { + for (Rowset* rowset : it.second) { + _data_sources[rowset->version()].push_back(rowset); + } + } + + // clear tmp indices if failed + } else { + for (auto& it : tmp_data_sources) { + for (Rowset* rowset : it.second) { + SAFE_DELETE(rowset); + } + } + } + + LOG(INFO) << "finish to clone data to tablet. res=" << res << ", " + << "table=" << full_name() << ", " + << "clone_versions_size=" << clone_deltas.size(); + return res; +} + OLAPStatus OLAPTable::replace_data_sources(const vector* old_versions, - const vector* new_data_sources, - vector* old_data_sources) { + const vector* new_data_sources, + vector* old_data_sources) { OLAPStatus res = OLAP_SUCCESS; if (old_versions == NULL || new_data_sources == NULL) { - OLAP_LOG_WARNING("parameter old_versions or new_data_sources is null. [table='%s']", - full_name().c_str()); + LOG(WARNING) << "parameter old_versions or new_data_sources is null. table=" << full_name(); return OLAP_ERR_INPUT_PARAMETER_ERROR; } @@ -568,20 +1400,19 @@ OLAPStatus OLAPTable::replace_data_sources(const vector* old_versions, it != old_versions->end(); ++it) { version_olap_index_map_t::iterator data_source_it = _data_sources.find(*it); if (data_source_it == _data_sources.end()) { - OLAP_LOG_WARNING("olap index for version does not exists. [version='%d-%d' table='%s']", - it->first, - it->second, - full_name().c_str()); + LOG(WARNING) << "olap rowset for version does not exists. [version='" << it->first + << "-" << it->second << "' table='" << full_name() << "']"; return OLAP_ERR_VERSION_NOT_EXIST; } } // check new versions not existed - for (vector::const_iterator it = new_data_sources->begin(); + for (vector::const_iterator it = new_data_sources->begin(); it != new_data_sources->end(); ++it) { if (_data_sources.find((*it)->version()) != _data_sources.end()) { bool to_be_deleted = false; + for (vector::const_iterator old_it = old_versions->begin(); old_it != old_versions->end(); ++old_it) { if (*old_it == (*it)->version()) { @@ -591,10 +1422,8 @@ OLAPStatus OLAPTable::replace_data_sources(const vector* old_versions, } if (!to_be_deleted) { - OLAP_LOG_WARNING("olap index for version exists. [version='%d-%d' table='%s']", - (*it)->version().first, - (*it)->version().second, - full_name().c_str()); + LOG(WARNING) << "olap rowset for version exists. 
[version='" << (*it)->version().first
+                         << "-" << (*it)->version().second << "' table='" << full_name() << "']";
                return OLAP_ERR_TABLE_VERSION_DUPLICATE_ERROR;
            }
        }
@@ -605,54 +1434,45 @@ OLAPStatus OLAPTable::replace_data_sources(const vector* old_versions,
         it != old_versions->end(); ++it) {
        version_olap_index_map_t::iterator data_source_it = _data_sources.find(*it);
        if (data_source_it != _data_sources.end()) {
-            old_data_sources->push_back(data_source_it->second);
+            for (Rowset* rowset : data_source_it->second) {
+                old_data_sources->push_back(rowset);
+            }
            _data_sources.erase(data_source_it);
        }
        // if the delete fails here, dirty data is left behind
        if ((res = _header->delete_version(*it)) != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to delete version from olap header.[version='%d-%d' table='%s']",
-                             it->first,
-                             it->second,
-                             full_name().c_str());
+            LOG(WARNING) << "fail to delete version from olap header.[version='" << it->first
+                         << "-" << it->second << "' table='" << full_name() << "']";
            return res;
        }
-        OLAP_LOG_TRACE("delete version from olap header.[version='%d-%d' table='%s']",
-                       it->first,
-                       it->second,
-                       full_name().c_str());
+        VLOG(3) << "delete version from olap header. table=" << full_name() << ", "
+                << "version=" << it->first << "-" << it->second;
    }
-    for (vector<OLAPIndex*>::const_iterator it = new_data_sources->begin();
+    for (vector<Rowset*>::const_iterator it = new_data_sources->begin();
         it != new_data_sources->end(); ++it) {
-        _data_sources[(*it)->version()] = *it;
+        _data_sources[(*it)->version()].push_back(*it);
        // if the add fails here, dirty data is left behind
+        const std::vector* column_statistics = nullptr;
        if ((*it)->has_column_statistics()) {
-            res = _header->add_version(
-                    (*it)->version(), (*it)->version_hash(), (*it)->num_segments(),
-                    (*it)->max_timestamp(), (*it)->index_size(), (*it)->data_size(),
-                    (*it)->num_rows(), &(*it)->get_column_statistics());
-        } else {
-            res = _header->add_version(
-                    (*it)->version(), (*it)->version_hash(), (*it)->num_segments(),
-                    (*it)->max_timestamp(), (*it)->index_size(), (*it)->data_size(),
-                    (*it)->num_rows());
+            column_statistics = &((*it)->get_column_statistics());
        }
-
+        res = _header->add_version((*it)->version(), (*it)->version_hash(),
+                                   (*it)->rowset_id(), (*it)->num_segments(),
+                                   (*it)->index_size(), (*it)->data_size(),
+                                   (*it)->num_rows(), (*it)->empty(), column_statistics);
+
        if (res != OLAP_SUCCESS) {
-            OLAP_LOG_WARNING("fail to add version to olap header.[version='%d-%d' table='%s']",
-                             (*it)->version().first,
-                             (*it)->version().second,
-                             full_name().c_str());
+            LOG(WARNING) << "fail to add version to olap header.[version='" << (*it)->version().first
+                         << "-" << (*it)->version().second << "' table='" << full_name() << "']";
            return res;
        }
-        OLAP_LOG_TRACE("add version to olap header.[version='%d-%d' table='%s']",
-                       (*it)->version().first,
-                       (*it)->version().second,
-                       full_name().c_str());
+        VLOG(3) << "add version to olap header. table=" << full_name() << ", "
+                << "version=" << (*it)->version().first << "-" << (*it)->version().second;
    }
    return OLAP_SUCCESS;
@@ -670,46 +1490,24 @@ OLAPStatus OLAPTable::compute_all_versions_hash(const vector& versions,
         version_index != versions.end(); ++version_index) {
        version_olap_index_map_t::const_iterator temp = _data_sources.find(*version_index);
        if (temp == _data_sources.end()) {
-            OLAP_LOG_WARNING("fail to find OLAPIndex."
+            OLAP_LOG_WARNING("fail to find Rowset."
"[start_version=%d; end_version=%d]", version_index->first, version_index->second); return OLAP_ERR_TABLE_VERSION_INDEX_MISMATCH_ERROR; } - *version_hash ^= temp->second->version_hash(); + *version_hash ^= temp->second[0]->version_hash(); } return OLAP_SUCCESS; } -OLAPStatus OLAPTable::get_selectivities(vector* selectivities) { - // num_rows and selectivities are calculated when loading and base compactioning. - if (selectivities == NULL) { - OLAP_LOG_WARNING("parameter num_rows or selectivity is null."); - return OLAP_ERR_INPUT_PARAMETER_ERROR; - } - - for (int i = 0; i < _header->selectivity_size(); ++i) { - selectivities->push_back(_header->selectivity(i)); - } - - return OLAP_SUCCESS; -} - -void OLAPTable::set_selectivities(const vector& selectivities) { - _header->clear_selectivity(); - - for (size_t i = 0; i < selectivities.size(); ++i) { - _header->add_selectivity(selectivities[i]); - } -} - OLAPStatus OLAPTable::merge_header(const OLAPHeader& hdr, int to_version) { obtain_header_wrlock(); DeferOp release_lock(std::bind(&OLAPTable::release_header_lock, this)); - const FileVersionMessage* base_version = _header->get_base_version(); + const PDelta* base_version = _header->get_base_version(); if (base_version->end_version() != to_version) { return OLAP_ERR_VERSION_NOT_EXIST; } @@ -718,61 +1516,61 @@ OLAPStatus OLAPTable::merge_header(const OLAPHeader& hdr, int to_version) { Version base = { base_version->start_version(), base_version->end_version() }; OLAPStatus st = _header->delete_version(base); if (st != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to delete version [%d-%d] from header", - base_version->start_version(), base_version->end_version()); + LOG(WARNING) << "failed to delete version from header" << ", " + << "version=" << base_version->start_version() << ", " + << base_version->end_version(); return st; } - OLAP_LOG_DEBUG("finished to delete version [%d-%d] from header", - base_version->start_version(), base_version->end_version()); + VLOG(3) << "finished to delete version from header" + << "version=" << base_version->start_version() << "-" + << base_version->end_version(); // add new versions - int version_num = hdr.file_version_size(); - for (int i = 0; i < version_num; ++i) { - const FileVersionMessage& v = hdr.file_version(i); - if (v.end_version() > to_version) { + for (int i = 0; i < hdr.file_delta_size(); ++i) { + const PDelta* delta = hdr.get_delta(i); + if (delta->end_version() > to_version) { break; } - - st = _header->add_version( - { v.start_version(), v.end_version() }, - v.version_hash(), - v.max_timestamp(), - v.num_segments(), - v.index_size(), - v.data_size(), - v.num_rows()); - - if (st != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to add version [%d-%d] to header", - v.start_version(), v.end_version()); - return st; + Version version = { delta->start_version(), delta->end_version() }; + VersionHash v_hash = delta->version_hash(); + for (int j = 0; j < delta->rowset_size(); ++j) { + const PRowSet& rowset = delta->rowset(j); + st = _header->add_version(version, v_hash, rowset.rowset_id(), + rowset.num_segments(), rowset.index_size(), rowset.data_size(), + rowset.num_rows(), rowset.empty(), nullptr); + if (st != OLAP_SUCCESS) { + LOG(WARNING) << "failed to add version to header" << ", " + << "version=" << version.first << "-" << version.second; + return st; + } } - OLAP_LOG_WARNING("finished to add version [%d-%d] to header", - v.start_version(), v.end_version()); } - st = _header->save(); if (st != OLAP_SUCCESS) { - OLAP_LOG_FATAL("failed to save header when 
merging. tablet: %d", _tablet_id); + LOG(FATAL) << "failed to save header when merging. tablet:" << _tablet_id; return st; } - OLAP_LOG_DEBUG("finished to merge header to version: %d", to_version); + VLOG(3) << "finished to merge header to version:" << to_version << "-" << to_version; return OLAP_SUCCESS; } -OLAPIndex* OLAPTable::_get_largest_index() { - OLAPIndex* largest_index = NULL; +Rowset* OLAPTable::_get_largest_index() { + Rowset* largest_index = NULL; size_t largest_index_sizes = 0; - for (version_olap_index_map_t::iterator it = _data_sources.begin(); - it != _data_sources.end(); ++it) { - // use index of base file as target index when base is not empty, - // or try to find the biggest index. - if (!it->second->empty() && it->second->index_size() > largest_index_sizes) { - largest_index = it->second; - largest_index_sizes = it->second->index_size(); + for (auto& it : _data_sources) { + // use rowset of base file as target rowset when base is not empty, + // or try to find the biggest rowset. + for (Rowset* rowset : it.second) { + if (rowset->empty() || rowset->zero_num_rows()) { + continue; + } + if (rowset->index_size() > largest_index_sizes) { + largest_index = rowset; + largest_index_sizes = rowset->index_size(); + } } } @@ -845,10 +1643,10 @@ OLAPStatus OLAPTable::split_range( end_key.build_max_key(); } - AutoRWLock auto_lock(get_header_lock_ptr(), true); - OLAPIndex* base_index = _get_largest_index(); + ReadLock rdlock(get_header_lock_ptr()); + Rowset* base_index = _get_largest_index(); - // 如果找ä¸åˆ°åˆé€‚çš„index,就直接返回startkey,endkey + // 如果找ä¸åˆ°åˆé€‚çš„rowset,就直接返回startkey,endkey if (base_index == NULL) { OLAP_LOG_DEBUG("there is no base file now, may be tablet is empty."); // it may be right if the table is empty, so we return success. @@ -945,22 +1743,26 @@ void OLAPTable::list_index_files(set* file_names) const { void OLAPTable::_list_files_with_suffix(const string& file_suffix, set* file_names) const { if (file_names == NULL) { - OLAP_LOG_WARNING("parameter filenames is null. [table='%s']", full_name().c_str()); + LOG(WARNING) << "parameter filenames is null. [table='" << full_name() << "']"; return; } file_names->clear(); - for (version_olap_index_map_t::const_iterator it = _data_sources.begin(); - it != _data_sources.end(); ++it) { + stringstream prefix_stream; + prefix_stream << _tablet_path << "/" << _tablet_id; + string tablet_path_prefix = prefix_stream.str(); + for (auto& it : _data_sources) { // every data segment has its file name. - OLAPIndex* index = it->second; - for (uint32_t i = 0; i < index->num_segments(); ++i) { - file_names->insert(basename(construct_file_path(_header->file_name(), - index->version(), - index->version_hash(), - i, - file_suffix).c_str())); + for (Rowset* rowset : it.second) { + for (int32_t seg_id = 0; seg_id < rowset->num_segments(); ++seg_id) { + file_names->insert(basename(construct_file_path(tablet_path_prefix, + rowset->version(), + rowset->version_hash(), + rowset->rowset_id(), + seg_id, + file_suffix).c_str())); + } } } } @@ -995,28 +1797,19 @@ void OLAPTable::list_version_entities(vector* version_entities) c // version_entities vector is not sorted. 
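+    // one version can now map to several Rowsets, so the loop below builds one
+    // VersionEntity per version and attaches a RowSetEntity for every Rowset in it.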
version_olap_index_map_t::const_iterator it; for (it = _data_sources.begin(); it != _data_sources.end(); ++it) { - if (it->second->has_column_statistics()) { - version_entities->push_back(VersionEntity( - it->first, - it->second->version_hash(), - it->second->num_segments(), - it->second->ref_count(), - it->second->num_rows(), - it->second->data_size(), - it->second->index_size(), - it->second->empty(), - it->second->get_column_statistics())); - } else { - version_entities->push_back(VersionEntity( - it->first, - it->second->version_hash(), - it->second->num_segments(), - it->second->ref_count(), - it->second->num_rows(), - it->second->data_size(), - it->second->index_size(), - it->second->empty())); + const std::vector& index_vec = it->second; + VersionEntity version_entity(it->first, index_vec[0]->version_hash()); + for (Rowset* rowset : index_vec) { + const std::vector* column_statistics = nullptr; + if (rowset->has_column_statistics()) { + column_statistics = &(rowset->get_column_statistics()); + } + RowSetEntity rowset_entity(rowset->rowset_id(), rowset->num_segments(), + rowset->num_rows(), rowset->data_size(), + rowset->index_size(), rowset->empty(), column_statistics); + version_entity.add_rowset_entity(rowset_entity); } + version_entities->push_back(version_entity); } } @@ -1029,76 +1822,132 @@ void OLAPTable::delete_all_files() { // remove indices and data files, release related resources. for (vector::const_iterator it = versions.begin(); it != versions.end(); ++it) { - OLAPIndex* data_source = NULL; - if (unregister_data_source(*it, &data_source) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to unregister data source for version. [start=%d end=%d]", - it->first, - it->second); + std::vector index_vec; + if (unregister_data_source(*it, &index_vec) != OLAP_SUCCESS) { + LOG(WARNING) << "fail to unregister version." + << "version=" << it->first << "-" << it->second; return; } - data_source->delete_all_files(); - delete data_source; + for (Rowset* rowset : index_vec) { + rowset->delete_all_files(); + delete rowset; + } } // remove olap header file, _header object will be delete in OLAPTable.destructor - if (remove_parent_dir(_header->file_name()) != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to delete header file and directory. [header_path='%s']", - _header->file_name().c_str()); + if (remove_parent_dir(_tablet_path) != OLAP_SUCCESS) { + LOG(WARNING) << "fail to delete header file and directory. 
header_path=" << _tablet_path; } } string OLAPTable::construct_index_file_path(const Version& version, VersionHash version_hash, - uint32_t segment) const { - return construct_file_path(_header->file_name(), version, version_hash, segment, "idx"); + int32_t rowset_id, + int32_t segment) const { + stringstream prefix_stream; + prefix_stream << _tablet_path << "/" << _tablet_id; + string tablet_path_prefix = prefix_stream.str(); + return construct_file_path(tablet_path_prefix, version, version_hash, rowset_id, segment, "idx"); } string OLAPTable::construct_data_file_path(const Version& version, VersionHash version_hash, - uint32_t segment) const { - return construct_file_path(_header->file_name(), version, version_hash, segment, "dat"); + int32_t rowset_id, + int32_t segment) const { + stringstream prefix_stream; + prefix_stream << _tablet_path << "/" << _tablet_id; + string tablet_path_prefix = prefix_stream.str(); + return construct_file_path(tablet_path_prefix, version, version_hash, rowset_id, segment, "dat"); } -string OLAPTable::construct_file_path(const string& header_path, +string OLAPTable::construct_file_path(const string& tablet_path_prefix, const Version& version, VersionHash version_hash, - uint32_t segment, + int32_t rowset_id, int32_t segment, const string& suffix) { - string file_path_prefix = header_path; - size_t header_file_name_len = header_path.length(); - - // header file must be ended with ".hdr" - if (header_file_name_len <= 4) { - OLAP_LOG_WARNING("invalid header file name. [file='%s']", header_path.c_str()); - return ""; - } - - file_path_prefix.erase(header_file_name_len - 4, 4); char file_path[OLAP_MAX_PATH_LEN]; - snprintf(file_path, - sizeof(file_path), - "%s_%d_%d_%ld_%u.%s", - file_path_prefix.c_str(), - version.first, - version.second, - version_hash, - segment, - suffix.c_str()); + if (rowset_id == -1) { + snprintf(file_path, + sizeof(file_path), + "%s_%ld_%ld_%ld_%d.%s", + tablet_path_prefix.c_str(), + version.first, + version.second, + version_hash, + segment, + suffix.c_str()); + } else { + snprintf(file_path, + sizeof(file_path), + "%s_%ld_%ld_%ld_%d_%d.%s", + tablet_path_prefix.c_str(), + version.first, + version.second, + version_hash, + rowset_id, segment, + suffix.c_str()); + } return file_path; } +string OLAPTable::construct_incremental_delta_dir_path() const { + stringstream rowset_dir_path; + rowset_dir_path << _tablet_path << INCREMENTAL_DELTA_PREFIX; + + return rowset_dir_path.str(); +} +string OLAPTable::construct_incremental_index_file_path(Version version, VersionHash version_hash, + int32_t rowset_id, int32_t segment) const { + string rowset_dir_path = construct_incremental_delta_dir_path(); + stringstream rowset_file_path; + rowset_file_path << rowset_dir_path << "/" + << construct_file_name(version, version_hash, rowset_id, segment, "idx"); + return rowset_file_path.str(); +} +string OLAPTable::construct_incremental_data_file_path(Version version, VersionHash version_hash, + int32_t rowset_id, int32_t segment) const { + string rowset_dir_path = construct_incremental_delta_dir_path(); + stringstream rowset_file_path; + rowset_file_path << rowset_dir_path << "/" + << construct_file_name(version, version_hash, rowset_id, segment, "dat"); + return rowset_file_path.str(); +} +string OLAPTable::construct_pending_data_dir_path() const { + return _tablet_path + PENDING_DELTA_PREFIX; +} +string OLAPTable::construct_pending_index_file_path(TTransactionId transaction_id, + int32_t rowset_id, int32_t segment) const { + string dir_path = 
construct_pending_data_dir_path(); + stringstream file_path; + file_path << dir_path << "/" + << transaction_id << "_" + << rowset_id << "_" << segment << ".idx"; + + return file_path.str(); +} +string OLAPTable::construct_pending_data_file_path(TTransactionId transaction_id, + int32_t rowset_id, int32_t segment) const { + string dir_path = construct_pending_data_dir_path(); + stringstream file_path; + file_path << dir_path << "/" + << transaction_id << "_" + << rowset_id << "_" << segment << ".dat"; + + return file_path.str(); +} + string OLAPTable::construct_file_name(const Version& version, VersionHash version_hash, - uint32_t segment, - const string& suffix) { + int32_t rowset_id, int32_t segment, + const string& suffix) const { char file_name[OLAP_MAX_PATH_LEN]; snprintf(file_name, sizeof(file_name), - "%ld_%d_%d_%d_%ld_%u.%s", + "%ld_%ld_%ld_%ld_%d_%d.%s", _tablet_id, - _schema_hash, version.first, version.second, version_hash, + rowset_id, segment, suffix.c_str()); @@ -1106,14 +1955,13 @@ string OLAPTable::construct_file_name(const Version& version, } string OLAPTable::construct_dir_path() const { - path path_name(_header->file_name()); - return path_name.parent_path().string(); + return _tablet_path; } int32_t OLAPTable::get_field_index(const string& field_name) const { field_index_map_t::const_iterator res_iterator = _field_index_map.find(field_name); if (res_iterator == _field_index_map.end()) { - OLAP_LOG_WARNING("invalid field name. [name='%s']", field_name.c_str()); + LOG(WARNING) << "invalid field name. [name='" << field_name << "']"; return -1; } @@ -1123,12 +1971,12 @@ int32_t OLAPTable::get_field_index(const string& field_name) const { size_t OLAPTable::get_field_size(const string& field_name) const { field_index_map_t::const_iterator res_iterator = _field_index_map.find(field_name); if (res_iterator == _field_index_map.end()) { - OLAP_LOG_WARNING("invalid field name. [name='%s']", field_name.c_str()); + LOG(WARNING) << "invalid field name. [name='" << field_name << "']"; return 0; } if (static_cast(res_iterator->second) >= _field_sizes.size()) { - OLAP_LOG_WARNING("invalid field index. [name='%s']", field_name.c_str()); + LOG(WARNING) << "invalid field rowset. [name='" << field_name << "']"; return 0; } @@ -1138,12 +1986,12 @@ size_t OLAPTable::get_field_size(const string& field_name) const { size_t OLAPTable::get_return_column_size(const string& field_name) const { field_index_map_t::const_iterator res_iterator = _field_index_map.find(field_name); if (res_iterator == _field_index_map.end()) { - OLAP_LOG_WARNING("invalid field name. [name='%s']", field_name.c_str()); + LOG(WARNING) << "invalid field name. [name='" << field_name << "']"; return 0; } if (static_cast(res_iterator->second) >= _field_sizes.size()) { - OLAP_LOG_WARNING("invalid field index. [name='%s']", field_name.c_str()); + LOG(WARNING) << "invalid field rowset. 
[name='" << field_name << "']"; return 0; } @@ -1169,8 +2017,10 @@ size_t OLAPTable::get_row_size() const { int64_t OLAPTable::get_data_size() const { int64_t total_size = 0; - for (const FileVersionMessage& version : _header->file_version()) { - total_size += version.data_size(); + for (const PDelta& delta : _header->delta()) { + for (const PRowSet& prowset : delta.rowset()) { + total_size += prowset.data_size(); + } } return total_size; @@ -1178,8 +2028,10 @@ int64_t OLAPTable::get_data_size() const { int64_t OLAPTable::get_num_rows() const { int64_t num_rows = 0; - for (const FileVersionMessage& version : _header->file_version()) { - num_rows += version.num_rows(); + for (const PDelta& delta : _header->delta()) { + for (const PRowSet& prowset : delta.rowset()) { + num_rows += prowset.num_rows(); + } } return num_rows; @@ -1187,7 +2039,7 @@ int64_t OLAPTable::get_num_rows() const { bool OLAPTable::is_load_delete_version(Version version) { version_olap_index_map_t::iterator it = _data_sources.find(version); - return it->second->delete_flag(); + return it->second[0]->delete_flag(); } bool OLAPTable::is_schema_changing() { @@ -1204,7 +2056,7 @@ bool OLAPTable::is_schema_changing() { bool OLAPTable::get_schema_change_request(TTabletId* tablet_id, SchemaHash* schema_hash, - vector* versions_to_be_changed, + vector* versions_to_changed, AlterTabletType* alter_table_type) const { if (!_header->has_schema_change_status()) { return false; @@ -1217,11 +2069,11 @@ bool OLAPTable::get_schema_change_request(TTabletId* tablet_id, (alter_table_type == NULL || (*alter_table_type = static_cast(schema_change_status.schema_change_type()))); - if (versions_to_be_changed != NULL) { - versions_to_be_changed->clear(); - for (int i = 0, len = schema_change_status.versions_to_be_changed_size(); i < len; ++i) { - const FileVersionMessage& version = schema_change_status.versions_to_be_changed(i); - versions_to_be_changed->push_back( + if (versions_to_changed != NULL) { + versions_to_changed->clear(); + for (int i = 0, len = schema_change_status.versions_to_changed_size(); i < len; ++i) { + const PDelta& version = schema_change_status.versions_to_changed(i); + versions_to_changed->push_back( Version(version.start_version(), version.end_version())); } } @@ -1231,7 +2083,7 @@ bool OLAPTable::get_schema_change_request(TTabletId* tablet_id, void OLAPTable::set_schema_change_request(TTabletId tablet_id, TSchemaHash schema_hash, - const vector& versions_to_be_changed, + const vector& versions_to_changed, const AlterTabletType alter_table_type) { clear_schema_change_request(); @@ -1240,33 +2092,32 @@ void OLAPTable::set_schema_change_request(TTabletId tablet_id, schema_change_status->set_related_schema_hash(schema_hash); vector::const_iterator it; - for (it = versions_to_be_changed.begin(); it != versions_to_be_changed.end(); ++it) { - FileVersionMessage* version = schema_change_status->add_versions_to_be_changed(); - version->set_num_segments(0); + for (it = versions_to_changed.begin(); it != versions_to_changed.end(); ++it) { + PDelta* version = schema_change_status->add_versions_to_changed(); version->set_start_version(it->first); version->set_end_version(it->second); version->set_version_hash(0); - version->set_max_timestamp(0); - version->set_index_size(0); - version->set_data_size(0); version->set_creation_time(0); + //version->set_index_size(0); + //version->set_data_size(0); + //version->set_num_segments(0); } schema_change_status->set_schema_change_type(alter_table_type); } -bool 
OLAPTable::remove_last_schema_change_version(SmartOLAPTable new_olap_table) { +bool OLAPTable::remove_last_schema_change_version(OLAPTablePtr new_olap_table) { if (!_header->has_schema_change_status()) { return false; } if (_header->has_schema_change_status()) { SchemaChangeStatusMessage* schema_change_status = _header->mutable_schema_change_status(); - ::google::protobuf::RepeatedPtrField* versions_to_be_changed - = schema_change_status->mutable_versions_to_be_changed(); + ::google::protobuf::RepeatedPtrField* versions_to_changed + = schema_change_status->mutable_versions_to_changed(); - if (versions_to_be_changed->size() > 0) { - versions_to_be_changed->RemoveLast(); + if (versions_to_changed->size() > 0) { + versions_to_changed->RemoveLast(); } } @@ -1274,7 +2125,7 @@ bool OLAPTable::remove_last_schema_change_version(SmartOLAPTable new_olap_table) } void OLAPTable::clear_schema_change_request() { - OLAP_LOG_INFO("clear schema change status. [tablet='%s']", _full_name.c_str()); + LOG(INFO) << "clear schema change status. [tablet='" << _full_name << "']"; _header->clear_schema_change_status(); } @@ -1282,51 +2133,85 @@ void OLAPTable::set_io_error() { OLAP_LOG_WARNING("io error occur.[tablet_full_name='%s', root_path_name='%s']", _full_name.c_str(), _storage_root_path.c_str()); - OLAPRootPath::get_instance()->set_root_path_used_stat(_storage_root_path, false); + OLAPEngine::get_instance()->set_store_used_flag(_storage_root_path, false); } bool OLAPTable::is_used() { - bool is_used = false; - - if (OLAPRootPath::get_instance()->get_root_path_used_stat(_storage_root_path, &is_used) - == OLAP_SUCCESS) { - return is_used; - } else { - return false; - } + return _store->is_used(); } -void OLAPTable::_set_storage_root_path_name() { - // sample: storage_root_path/DATA_PREFIX/shard/tablet_id/schema_hash/xxxx.hdr - path header_file_path(_header->file_name()); - path schema_path = header_file_path.parent_path(); - path table_path = schema_path.parent_path(); - path db_path = table_path.parent_path(); - - _storage_root_path = db_path.parent_path().parent_path().string(); +VersionEntity OLAPTable::get_version_entity_by_version(const Version& version) { + std::vector& index_vec = _data_sources[version]; + VersionEntity version_entity(version, index_vec[0]->version_hash()); + for (Rowset* rowset : index_vec) { + const std::vector* column_statistics = nullptr; + if (rowset->has_column_statistics()) { + column_statistics = &(rowset->get_column_statistics()); + } + RowSetEntity rowset_entity(rowset->rowset_id(), rowset->num_segments(), + rowset->num_rows(), rowset->data_size(), + rowset->index_size(), rowset->empty(), column_statistics); + version_entity.add_rowset_entity(rowset_entity); + } + return version_entity; } -VersionEntity OLAPTable::get_version_entity_by_version(Version version) { - if (_data_sources[version]->has_column_statistics()) { - return VersionEntity(version, - _data_sources[version]->version_hash(), - _data_sources[version]->num_segments(), - _data_sources[version]->ref_count(), - _data_sources[version]->num_rows(), - _data_sources[version]->data_size(), - _data_sources[version]->index_size(), - _data_sources[version]->empty(), - _data_sources[version]->get_column_statistics()); - } else { - return VersionEntity(version, - _data_sources[version]->version_hash(), - _data_sources[version]->num_segments(), - _data_sources[version]->ref_count(), - _data_sources[version]->num_rows(), - _data_sources[version]->data_size(), - _data_sources[version]->index_size(), - 
_data_sources[version]->empty()); +size_t OLAPTable::get_version_index_size(const Version& version) { + std::vector& index_vec = _data_sources[version]; + size_t index_size = 0; + for (Rowset* rowset : index_vec) { + index_size += rowset->index_size(); } + return index_size; +} + +size_t OLAPTable::get_version_data_size(const Version& version) { + std::vector& index_vec = _data_sources[version]; + size_t data_size = 0; + for (Rowset* rowset : index_vec) { + data_size += rowset->data_size(); + } + return data_size; +} + +OLAPStatus OLAPTable::recover_tablet_until_specfic_version( + const int64_t& until_version, const int64_t& version_hash) { + std::vector missing_versions; + { + ReadLock rdlock(&_header_lock); + get_missing_versions_with_header_locked(until_version, &missing_versions); + } + + std::vector rowset_vec; + OLAPStatus res = OLAP_SUCCESS; + for (Version& missing_version : missing_versions) { + Rowset* rowset = new Rowset(this, missing_version, version_hash, false, 0, 0); + rowset->set_empty(true); + IWriter* writer = IWriter::create(std::shared_ptr(this), rowset, true); + if (res != OLAP_SUCCESS) { break; } + + res = writer->finalize(); + if (res != OLAP_SUCCESS) { break; } + rowset_vec.push_back(rowset); + } + + if (res != OLAP_SUCCESS) { + for (Rowset* rowset : rowset_vec) { + rowset->delete_all_files(); + SAFE_DELETE(rowset); + } + } else { + for (Rowset* rowset : rowset_vec) { + rowset->load(); + } + } + + { + WriteLock wrlock(&_header_lock); + RETURN_NOT_OK(register_data_source(rowset_vec)); + RETURN_NOT_OK(save_header()); + } + return OLAP_SUCCESS; } OLAPStatus OLAPTable::test_version(const Version& version) { diff --git a/be/src/olap/olap_table.h b/be/src/olap/olap_table.h index d36ee76bc8..dd5f21a7a7 100644 --- a/be/src/olap/olap_table.h +++ b/be/src/olap/olap_table.h @@ -36,12 +36,13 @@ namespace palo { class FieldInfo; class IData; class OLAPHeader; -class OLAPIndex; +class Rowset; class OLAPTable; class RowBlockPosition; +class OlapStore; // Define OLAPTable's shared_ptr. It is used for -typedef std::shared_ptr SmartOLAPTable; +typedef std::shared_ptr OLAPTablePtr; enum BaseCompactionStage { BASE_COMPACTION_WAITING = 0, @@ -89,12 +90,19 @@ struct SchemaChangeStatus { int32_t version; }; -class OLAPTable { +class OLAPTable : public std::enable_shared_from_this { public: - static OLAPTable* create_from_header_file( + static OLAPTablePtr create_from_header_file( TTabletId tablet_id, TSchemaHash schema_hash, - const std::string& header_file); + const std::string& header_file, + OlapStore* store = nullptr); + + static OLAPTablePtr create_from_header( + OLAPHeader* header, + OlapStore* store = nullptr); + + explicit OLAPTable(OLAPHeader* header, OlapStore* store); virtual ~OLAPTable(); @@ -108,13 +116,10 @@ public: OLAPStatus load_indices(); - OLAPStatus save_header() { - OLAPStatus res = _header->save(); - if (res != OLAP_SUCCESS && is_io_error(res)) { - set_io_error(); - } + OLAPStatus save_header(); - return res; + OLAPHeader* get_header() { + return _header; } OLAPStatus select_versions_to_span(const Version& version, @@ -147,18 +152,58 @@ public: // Registers a newly created data source, making it available for // querying. Adds a reference to the data source in the header file. - OLAPStatus register_data_source(OLAPIndex* index); + OLAPStatus register_data_source(const std::vector& index_vec); // Unregisters the data source for given version, frees up resources. // resources include memory, files. - // After unregister, index will point to the associated OLAPIndex. 
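
The header no longer stores one flat FileVersionMessage per version; get_data_size(), get_num_rows(), get_version_data_size() and get_version_index_size() all fold over a PDelta that can own several rowsets. A minimal sketch of that aggregation, using simplified stand-in structs rather than the real protobuf types:

```cpp
// Sketch only, not part of the patch: PRowSet/PDelta are simplified stand-ins
// for the protobuf messages referenced by the new header format.
#include <cstdint>
#include <vector>

struct PRowSet { int64_t data_size = 0; int64_t index_size = 0; int64_t num_rows = 0; };
struct PDelta  { std::vector<PRowSet> rowsets; };

// Mirrors the shape of OLAPTable::get_data_size(): sum over every rowset of every delta.
int64_t total_data_size(const std::vector<PDelta>& deltas) {
    int64_t total = 0;
    for (const PDelta& delta : deltas) {
        for (const PRowSet& rs : delta.rowsets) {
            total += rs.data_size;
        }
    }
    return total;
}

// Mirrors get_version_index_size(): one version may now be backed by several
// rowsets, so per-version sizes are sums as well.
int64_t version_index_size(const PDelta& delta) {
    int64_t total = 0;
    for (const PRowSet& rs : delta.rowsets) {
        total += rs.index_size;
    }
    return total;
}
```
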
- OLAPStatus unregister_data_source(const Version& version, OLAPIndex** index); + // After unregister, index will point to the associated Rowset. + OLAPStatus unregister_data_source(const Version& version, std::vector* index_vec); + + // if pending data is push_for_delete, delete conditions is not null + OLAPStatus add_pending_version(int64_t partition_id, int64_t transaction_id, + const std::vector* delete_conditions); + OLAPStatus add_pending_rowset(Rowset* index); + int32_t current_pending_rowset_id(int64_t transaction_id); + + OLAPStatus add_pending_data(Rowset* index, const std::vector* delete_conditions); + + bool has_pending_data(int64_t transaction_id); + + void delete_pending_data(int64_t transaction_id); + + // check the pending data that still not publish version + void get_expire_pending_data(std::vector* transaction_ids); + + void delete_expire_incremental_data(); + + // don't need header lock, because it occurs before loading tablet + void load_pending_data(); + + OLAPStatus publish_version(int64_t transaction_id, Version version, VersionHash version_hash); + + const PDelta* get_incremental_delta(Version version) const { + return _header->get_incremental_version(version); + } + + // calculate holes of version + // need header rdlock outside + void get_missing_versions_with_header_locked( + int64_t until_version, std::vector* missing_versions) const; + + // check if pending data is push_for_delete + // need to obtain header rdlock outside + OLAPStatus is_push_for_delete(int64_t transaction_id, bool* is_push_for_delete) const; + + // need to obtain header wrlock outside + OLAPStatus clone_data(const OLAPHeader& clone_header, + const std::vector& clone_deltas, + const std::vector& versions_to_delete); // Atomically replaces one set of data sources with another. Returns // true on success. OLAPStatus replace_data_sources(const std::vector* old_versions, - const std::vector* new_data_sources, - std::vector* old_data_sources); + const std::vector* new_data_sources, + std::vector* old_data_sources); // Computes the cumulative hash for given versions. // Only use Base file and Delta files to compute for simplicity and @@ -171,16 +216,9 @@ public: OLAPStatus compute_all_versions_hash(const std::vector& versions, VersionHash* version_hash) const; - // Get OLAPHeader read lock before call get_selectivities() - // Get table row_count and selectivity vector for SHOW_TABLE_INFO command - OLAPStatus get_selectivities(std::vector* selectivities); - // used for restore, merge the (0, to_version) in 'hdr' OLAPStatus merge_header(const OLAPHeader& hdr, int to_version); - // Get OLAPHeader write lock before call get_selectivities() - void set_selectivities(const std::vector& selectivities); - // Used by monitoring OLAPTable void list_data_files(std::set* filenames) const; @@ -214,7 +252,7 @@ public: _header_lock.unlock(); } - RWLock* get_header_lock_ptr() { + RWMutex* get_header_lock_ptr() { return &_header_lock; } @@ -226,6 +264,10 @@ public: _push_lock.unlock(); } + Mutex* get_push_lock() { + return &_push_lock; + } + // Prevent base compaction operations execute concurrently. bool try_base_compaction_lock() { return _base_compaction_lock.trylock() == OLAP_SUCCESS; @@ -250,14 +292,6 @@ public: _cumulative_lock.unlock(); } - // Prevent sync operations execute concurrently. 
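
The new pending-data API declared here (add_pending_data, has_pending_data, publish_version) stages rowsets under a transaction id and only exposes them to readers once a PUBLISH_VERSION task supplies the final version. A rough sketch of that lifecycle, with simplified container types standing in for _pending_data_sources and _data_sources:

```cpp
// Sketch only, not part of the patch: Rowset is opaque here and the maps are
// simplified stand-ins for the members added to OLAPTable in this diff.
#include <cstdint>
#include <map>
#include <unordered_map>
#include <utility>
#include <vector>

struct Rowset;                                   // opaque placeholder
using Version = std::pair<int64_t, int64_t>;     // [start, end] inclusive

class PendingDataDemo {
public:
    // PUSH with a transaction id: stage rowsets under the transaction only.
    void add_pending_data(int64_t transaction_id, std::vector<Rowset*> rowsets) {
        _pending[transaction_id] = std::move(rowsets);
    }

    bool has_pending_data(int64_t transaction_id) const {
        return _pending.count(transaction_id) > 0;
    }

    // PUBLISH_VERSION: re-key the staged rowsets by their final version so
    // readers can see them; the transaction entry disappears.
    bool publish_version(int64_t transaction_id, Version version) {
        auto it = _pending.find(transaction_id);
        if (it == _pending.end()) {
            return false;  // nothing staged for this transaction
        }
        _visible[version] = std::move(it->second);
        _pending.erase(it);
        return true;
    }

private:
    std::unordered_map<int64_t, std::vector<Rowset*>> _pending;  // keyed by txn id
    std::map<Version, std::vector<Rowset*>> _visible;            // keyed by version
};
```

Publishing twice for the same transaction, or pushing a transaction whose pending data already exists, is treated as success in the patch (OLAP_ERR_PUSH_TRANSACTION_ALREADY_EXIST is mapped back to OK), which is what makes the realtime push path idempotent.
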
- bool try_sync_lock() { - return _sync_lock.trylock() == OLAP_SUCCESS; - } - void release_sync_lock() { - _sync_lock.unlock(); - } - // Construct index file path according version, version_hash and segment // We construct file path through header file name. header file name likes: // tables_root_path/db/table/index/table_index_schemaversion.hdr @@ -269,7 +303,7 @@ public: // DailyWinfoIdeaStats_PRIMARY_20120428_0_200_735382373247_1.idx std::string construct_index_file_path(const Version& version, VersionHash version_hash, - uint32_t segment) const; + int32_t rowset_id, int32_t segment) const; // Same as construct_index_file_path except that file suffix is .dat // The typical index file path is: @@ -277,23 +311,32 @@ public: // DailyWinfoIdeaStats_PRIMARY_20120428_0_200_735382373247_1.dat std::string construct_data_file_path(const Version& version, VersionHash version_hash, - uint32_t segment) const; - - // return the dir path of this tablet, include tablet id and schema hash - // eg: /path/to/data/0/100001/237480234/ - std::string construct_dir_path() const; + int32_t rowset_id, int32_t segment) const; // For index file, suffix is "idx", for data file, suffix is "dat". - static std::string construct_file_path(const std::string& header_path, + static std::string construct_file_path(const std::string& tablet_path, const Version& version, VersionHash version_hash, - uint32_t segment, + int32_t rowset_id, int32_t segment, const std::string& suffix); + std::string construct_pending_data_dir_path() const; + std::string construct_pending_index_file_path( + TTransactionId transaction_id, int32_t rowset_id, int32_t segment) const; + std::string construct_pending_data_file_path( + TTransactionId transaction_id, int32_t rowset_id, int32_t segment) const; + std::string construct_incremental_delta_dir_path() const; + std::string construct_incremental_index_file_path( + Version version, VersionHash version_hash, int32_t rowset_id, int32_t segment) const; + std::string construct_incremental_data_file_path( + Version version, VersionHash version_hash, int32_t rowset_id, int32_t segment) const; + std::string construct_file_name(const Version& version, VersionHash version_hash, - uint32_t segment, - const std::string& suffix); + int32_t rowset_id, int32_t segment, + const std::string& suffix) const; + + std::string construct_dir_path() const; // Return -1 if field name is invalid, else return field index in schema. int32_t get_field_index(const std::string& field_name) const; @@ -320,6 +363,10 @@ public: return _full_name; } + void set_full_name(std::string full_name) { + _full_name = full_name; + } + std::vector& tablet_schema() { return _tablet_schema; } @@ -353,6 +400,10 @@ public: return _tablet_id; } + void set_tablet_id(TTabletId tablet_id) { + _tablet_id = tablet_id; + } + size_t num_short_key_fields() const { return _header->num_short_key_fields(); } @@ -361,38 +412,53 @@ public: return _header->next_column_unique_id(); } - // num rows per rowBlock, typically it is 256 or 512. 
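
For reference, the on-disk naming implied by the new construct_file_name() and the pending-data path helpers above; the function names below are illustrative, not the real OLAPTable members. Published segments now embed the rowset id instead of the schema hash, and pending segments are keyed by transaction id:

```cpp
// Sketch only, not part of the patch: illustrates the file-name layout used by
// the new construct_file_name()/construct_pending_*_file_path() helpers.
#include <cstdint>
#include <sstream>
#include <string>

std::string make_version_file_name(int64_t tablet_id, int64_t start_version,
                                   int64_t end_version, int64_t version_hash,
                                   int32_t rowset_id, int32_t segment,
                                   const std::string& suffix) {
    // New layout: tablet_start_end_hash_rowset_segment.suffix
    // (the schema hash is no longer part of the segment file name).
    std::ostringstream name;
    name << tablet_id << "_" << start_version << "_" << end_version << "_"
         << version_hash << "_" << rowset_id << "_" << segment << "." << suffix;
    return name.str();
}

std::string make_pending_file_name(int64_t transaction_id, int32_t rowset_id,
                                   int32_t segment, const std::string& suffix) {
    // Pending (not yet published) data lives under the tablet's pending dir
    // and is named by transaction id rather than version.
    std::ostringstream name;
    name << transaction_id << "_" << rowset_id << "_" << segment << "." << suffix;
    return name.str();
}
```

Under these assumptions a published segment would look like 10015_0_200_735382373247_2_1.dat and a pending one like 2001_2_1.dat; suffix is "idx" for index files and "dat" for data files, as in the patch.
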
- size_t num_rows_per_row_block() const { - return _header->num_rows_per_data_block(); - } - TSchemaHash schema_hash() const { return _schema_hash; } - int file_version_size() const { - return _header->file_version_size(); + void set_schema_hash(TSchemaHash schema_hash) { + _schema_hash = schema_hash; } - const FileVersionMessage& file_version(int index) const { - return _header->file_version(index); + OlapStore* store() const { + return _store; } - const FileVersionMessage* lastest_delta() const { + int file_delta_size() const { + return _header->file_delta_size(); + } + + const PDelta& delta(int index) const { + return _header->delta(index); + } + + const PDelta* get_delta(int index) const { + return _header->get_delta(index); + } + + const PDelta* lastest_delta() const { return _header->get_lastest_delta_version(); } - const FileVersionMessage* latest_version() const { - return _header->get_latest_version(); + const PDelta* lastest_version() const { + return _header->get_lastest_version(); } - const FileVersionMessage* base_version() const { + // need to obtain header rdlock outside + const PDelta* least_complete_version( + const std::vector& missing_versions) const; + + const PDelta* base_version() const { return _header->get_base_version(); } // 在使用之å‰å¯¹headeråŠ é” - const uint32_t get_compaction_nice_estimate() const { - return _header->get_compaction_nice_estimate(); + const uint32_t get_cumulative_compaction_score() const { + return _header->get_cumulative_compaction_score(); + } + + const uint32_t get_base_compaction_score() const { + return _header->get_base_compaction_score(); } const OLAPStatus delete_version(const Version& version) { @@ -407,29 +473,34 @@ public: return _header->data_file_type(); } + // num rows per rowBlock, typically it is 256 or 512. 
+ size_t num_rows_per_row_block() const { + return _num_rows_per_row_block; + } + CompressKind compress_kind() const { - return _header->compress_kind(); + return _compress_kind; } int delete_data_conditions_size() const { return _header->delete_data_conditions_size(); } - DeleteDataConditionMessage* add_delete_data_conditions() { + DeleteConditionMessage* add_delete_data_conditions() { return _header->add_delete_data_conditions(); } - const google::protobuf::RepeatedPtrField& + const google::protobuf::RepeatedPtrField& delete_data_conditions() { return _header->delete_data_conditions(); } - google::protobuf::RepeatedPtrField* + google::protobuf::RepeatedPtrField* mutable_delete_data_conditions() { return _header->mutable_delete_data_conditions(); } - DeleteDataConditionMessage* mutable_delete_data_conditions(int index) { + DeleteConditionMessage* mutable_delete_data_conditions(int index) { return _header->mutable_delete_data_conditions(index); } @@ -454,7 +525,7 @@ public: return false; } - google::protobuf::RepeatedPtrField::const_iterator it; + google::protobuf::RepeatedPtrField::const_iterator it; it = _header->delete_data_conditions().begin(); for (; it != _header->delete_data_conditions().end(); ++it) { if (it->version() == version.first) { @@ -492,52 +563,17 @@ public: bool get_schema_change_request(TTabletId* tablet_id, SchemaHash* schema_hash, - std::vector* versions_to_be_changed, + std::vector* versions_to_changed, AlterTabletType* alter_table_type) const; void set_schema_change_request(TTabletId tablet_id, TSchemaHash schema_hash, - const std::vector& versions_to_be_changed, + const std::vector& versions_to_changed, const AlterTabletType alter_table_type); - bool remove_last_schema_change_version(SmartOLAPTable new_olap_table); + bool remove_last_schema_change_version(OLAPTablePtr new_olap_table); void clear_schema_change_request(); - // Following are get/set status functions. - // Like base-compaction, push, sync, schema-change. 
- BaseCompactionStatus base_compaction_status() { - return _base_compaction_status; - } - - void set_base_compaction_status(BaseCompactionStage status, int32_t version) { - _base_compaction_status.status = status; - if (version > -2) { - _base_compaction_status.version = version; - } - } - - PushStatus push_status() { - return _push_status; - } - - void set_push_status(PushStage status, int32_t version) { - _push_status.status = status; - if (version > -2) { - _push_status.version = version; - } - } - - SyncStatus sync_status() { - return _sync_status; - } - - void set_sync_status(SyncStage status, int32_t version) { - _sync_status.status = status; - if (version > -2) { - _sync_status.version = version; - } - } - SchemaChangeStatus schema_change_status() { return _schema_change_status; } @@ -586,6 +622,10 @@ public: return _storage_root_path; } + std::string tablet_path() { + return _tablet_path; + } + std::string get_field_name_by_index(uint32_t index) { if (index < _tablet_schema.size()) { return _tablet_schema[index].name; @@ -612,39 +652,18 @@ public: OLAPStatus test_version(const Version& version); - VersionEntity get_version_entity_by_version(Version version); + VersionEntity get_version_entity_by_version(const Version& version); + size_t get_version_index_size(const Version& version); + size_t get_version_data_size(const Version& version); bool is_dropped() { return _is_dropped; } - bool can_do_compaction() { - // 如果table正在åšschema changeï¼Œåˆ™é€šè¿‡é€‰è·¯åˆ¤æ–­æ•°æ®æ˜¯å¦è½¬æ¢å®Œæˆ - // 如果选路æˆåŠŸï¼Œåˆ™è½¬æ¢å®Œæˆï¼Œå¯ä»¥è¿›è¡ŒBE - // å¦‚æžœé€‰è·¯å¤±è´¥ï¼Œåˆ™è½¬æ¢æœªå®Œæˆï¼Œä¸èƒ½è¿›è¡ŒBE - obtain_header_rdlock(); - const FileVersionMessage* version = latest_version(); - if (version == NULL) { - release_header_lock(); - return false; - } - - if (is_schema_changing()) { - Version test_version = Version(0, version->end_version()); - std::vector path_versions; - if (OLAP_SUCCESS != select_versions_to_span(test_version, &path_versions)) { - release_header_lock(); - return false; - } - } - release_header_lock(); - - return true; - } - - + OLAPStatus recover_tablet_until_specfic_version(const int64_t& until_version, + const int64_t& version_hash); private: - // used for hash-struct of hash_map. + // used for hash-struct of hash_map. struct HashOfVersion { uint64_t operator()(const Version& version) const { uint64_t hash_value = version.first; @@ -659,52 +678,66 @@ private: } }; - typedef std::unordered_map field_index_map_t; - typedef std::unordered_map version_olap_index_map_t; - - explicit OLAPTable(OLAPHeader* header); - // List files with suffix "idx" or "dat". 
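
recover_tablet_until_specfic_version() relies on get_missing_versions_with_header_locked() to find the version holes it must back-fill with empty rowsets. The real computation lives in OLAPHeader and is not shown in this patch; the sketch below is one plausible way to derive the holes from the contiguous version ranges already present:

```cpp
// Sketch only, not part of the patch: an assumed implementation of the
// "missing versions" calculation; the actual header-side logic may differ.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Version = std::pair<int64_t, int64_t>;  // [start, end], inclusive

std::vector<Version> find_missing_versions(std::vector<Version> existing,
                                           int64_t until_version) {
    std::sort(existing.begin(), existing.end());
    std::vector<Version> missing;
    int64_t next_expected = 0;
    for (const Version& v : existing) {
        // Every version in a gap becomes one singleton delta, matching how the
        // recovery path creates one empty rowset per missing version.
        for (int64_t ver = next_expected; ver < v.first; ++ver) {
            missing.emplace_back(ver, ver);
        }
        next_expected = std::max(next_expected, v.second + 1);
    }
    for (int64_t ver = next_expected; ver <= until_version; ++ver) {
        missing.emplace_back(ver, ver);
    }
    return missing;
}
```
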
void _list_files_with_suffix(const std::string& file_suffix, std::set* file_names) const; // èŽ·å–æœ€å¤§çš„index(åªçœ‹å¤§å°ï¼‰ - OLAPIndex* _get_largest_index(); + Rowset* _get_largest_index(); - void _set_storage_root_path_name(); + Rowset* _construct_index_from_version(const PDelta* delta, int32_t rowset_id); + + // check if version is same, may delete local data + OLAPStatus _handle_existed_version(int64_t transaction_id, const Version& version, + const VersionHash& version_hash); + + // like "9-9" "10-10", for incremental cloning + OLAPStatus _add_incremental_data(std::vector& index_vec, int64_t transaction_id, + const Version& version, const VersionHash& version_hash); + + void _delete_incremental_data(const Version& version, const VersionHash& version_hash); + + OLAPStatus _create_hard_link(const std::string& from, const std::string& to, + std::vector* linked_success_files); TTabletId _tablet_id; TSchemaHash _schema_hash; OLAPHeader* _header; + size_t _num_rows_per_row_block; + CompressKind _compress_kind; // Set it true when table is dropped, table files and data structures // can be used and not deleted until table is destructed. bool _is_dropped; std::string _full_name; std::vector _tablet_schema; // field info vector is table schema. - // version -> its OLAPIndex, data source can be base file, cumulative file, - // or delta file. + + // Version mapping to Rowset. + // data source can be base delta, cumulative delta, singleton delta. + using version_olap_index_map_t = std::unordered_map, HashOfVersion>; version_olap_index_map_t _data_sources; + using transaction_olap_index_map_t = std::unordered_map>; + transaction_olap_index_map_t _pending_data_sources; + size_t _num_fields; size_t _num_null_fields; size_t _num_key_fields; // filed name -> field position in row + using field_index_map_t = std::unordered_map; field_index_map_t _field_index_map; std::vector _field_sizes; // A series of status - BaseCompactionStatus _base_compaction_status; - PushStatus _push_status; - SyncStatus _sync_status; SchemaChangeStatus _schema_change_status; // related locks to ensure that commands are executed correctly. 
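
_create_hard_link() declared here (and LinkedSchemaChange::process() further down in schema_change.cpp) reuses segment files by hard-linking them into the new location instead of copying. A small sketch of that pattern with POSIX link(2) and errno reporting; the paths are placeholders:

```cpp
// Sketch only, not part of the patch: the link-and-report-errno pattern used
// for both pending-data linking and linked schema change.
#include <cerrno>
#include <cstring>
#include <iostream>
#include <string>
#include <unistd.h>

bool create_hard_link(const std::string& from, const std::string& to) {
    if (link(from.c_str(), to.c_str()) != 0) {
        std::cerr << "fail to create hard link. from=" << from << " to=" << to
                  << " errno=" << errno << " errno_str=" << std::strerror(errno)
                  << std::endl;
        return false;
    }
    return true;
}

int main() {
    // For each segment, both the .idx and .dat files are linked, so the old
    // and new tablets share the same underlying blocks until one side unlinks.
    bool ok = create_hard_link("/data/old_tablet/10015_0_200_735_0_0.idx",
                               "/data/new_tablet/10015_0_200_735_0_0.idx")
           && create_hard_link("/data/old_tablet/10015_0_200_735_0_0.dat",
                               "/data/new_tablet/10015_0_200_735_0_0.dat");
    return ok ? 0 : 1;
}
```

Hard links keep the operation cheap and atomic per file; failure of any single link aborts the conversion, which is why the patch collects linked_success_files so they can be removed on error.
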
- RWLock _header_lock; - MutexLock _push_lock; - MutexLock _cumulative_lock; - MutexLock _base_compaction_lock; - MutexLock _sync_lock; + RWMutex _header_lock; + Mutex _push_lock; + Mutex _cumulative_lock; + Mutex _base_compaction_lock; size_t _id; // uniq id, used in cache std::string _storage_root_path; - volatile bool _is_loaded; - MutexLock _load_lock; + OlapStore* _store; + std::atomic _is_loaded; + Mutex _load_lock; + std::string _tablet_path; DISALLOW_COPY_AND_ASSIGN(OLAPTable); }; diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index b6a0a4668d..5a09ba5a74 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -43,7 +43,7 @@ namespace palo { // clear schema change info in both current table and related tables, // finally we will only push for current tables OLAPStatus PushHandler::process( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, const TPushReq& request, PushType push_type, vector* tablet_info_vec) { @@ -65,7 +65,7 @@ OLAPStatus PushHandler::process( TTabletId tablet_id; TSchemaHash schema_hash; AlterTabletType alter_table_type; - SmartOLAPTable related_olap_table; + OLAPTablePtr related_olap_table; _obtain_header_rdlock(); bool is_schema_changing = olap_table->get_schema_change_request( &tablet_id, &schema_hash, NULL, &alter_table_type); @@ -93,9 +93,8 @@ OLAPStatus PushHandler::process( // Obtain push lock to avoid simultaneously PUSH and // conflict with alter table operations. - for (SmartOLAPTable table : _olap_table_arr) { + for (OLAPTablePtr table : _olap_table_arr) { table->obtain_push_lock(); - table->set_push_status(PUSH_RUNNING, _request.version); } is_push_locked = true; @@ -251,7 +250,7 @@ OLAPStatus PushHandler::process( } } _release_header_lock(); - + // 6. Delete unused versions which include delta and commulative, // which, in fact, is added to list and deleted by background thread for (TableVars& table_var : table_infoes) { @@ -279,17 +278,16 @@ EXIT: continue; } - for (OLAPIndex* olap_index : table_var.added_indices) { - olap_index->delete_all_files(); - SAFE_DELETE(olap_index); + for (Rowset* rowset : table_var.added_indices) { + rowset->delete_all_files(); + SAFE_DELETE(rowset); } } // Release push lock if (is_push_locked) { - for (SmartOLAPTable table : _olap_table_arr) { + for (OLAPTablePtr table : _olap_table_arr) { table->release_push_lock(); - table->set_push_status(PUSH_WAITING, -1); } } _olap_table_arr.clear(); @@ -299,6 +297,215 @@ EXIT: return res; } +OLAPStatus PushHandler::process_realtime_push( + OLAPTablePtr olap_table, + const TPushReq& request, + PushType push_type, + vector* tablet_info_vec) { + OLAP_LOG_INFO("begin to realtime push. 
[table=%s transaction_id=%ld]", + olap_table->full_name().c_str(), request.transaction_id); + + OLAPStatus res = OLAP_SUCCESS; + _request = request; + vector table_infoes(1); + table_infoes[0].olap_table = olap_table; + AlterTabletType alter_table_type; + + // add transaction in engine, then check sc status + // lock, prevent sc handler checking transaction concurrently + olap_table->obtain_push_lock(); + PUniqueId load_id; + load_id.set_hi(0); + load_id.set_lo(0); + res = OLAPEngine::get_instance()->add_transaction( + request.partition_id, request.transaction_id, + olap_table->tablet_id(), olap_table->schema_hash(), load_id); + + // if transaction exists, exit + if (res == OLAP_ERR_PUSH_TRANSACTION_ALREADY_EXIST) { + + // if push finished, report success to fe + if (olap_table->has_pending_data(request.transaction_id)) { + OLAP_LOG_WARNING("pending data exists in tablet, which means push finished," + "return success. [table=%s transaction_id=%ld]", + olap_table->full_name().c_str(), request.transaction_id); + res = OLAP_SUCCESS; + } + olap_table->release_push_lock(); + goto EXIT; + } + + // only when fe sends schema_change true, should consider to push related table + if (_request.is_schema_changing) { + OLAP_LOG_DEBUG("push req specify schema changing is true. [table=%s transaction_id=%ld]", + olap_table->full_name().c_str(), request.transaction_id); + TTabletId related_tablet_id; + TSchemaHash related_schema_hash; + + olap_table->obtain_header_rdlock(); + bool is_schema_changing = olap_table->get_schema_change_request( + &related_tablet_id, &related_schema_hash, NULL, &alter_table_type); + olap_table->release_header_lock(); + + if (is_schema_changing) { + OLAP_LOG_INFO("find schema_change status when realtime push. " + "[table=%s related_tablet_id=%ld related_schema_hash=%d transaction_id=%ld]", + olap_table->full_name().c_str(), + related_tablet_id, related_schema_hash, request.transaction_id); + OLAPTablePtr related_olap_table = OLAPEngine::get_instance()->get_table( + related_tablet_id, related_schema_hash); + + // if related tablet not exists, only push current tablet + if (NULL == related_olap_table.get()) { + OLAP_LOG_WARNING("can't find related table, only push current tablet. " + "[table=%s related_tablet_id=%ld related_schema_hash=%d]", + olap_table->full_name().c_str(), + related_tablet_id, related_schema_hash); + + // if current tablet is new table, only push current tablet + } else if (olap_table->creation_time() > related_olap_table->creation_time()) { + OLAP_LOG_WARNING("current table is new, only push current tablet. " + "[table=%s related_olap_table=%s]", + olap_table->full_name().c_str(), + related_olap_table->full_name().c_str()); + + // add related transaction in engine + } else { + PUniqueId load_id; + load_id.set_hi(0); + load_id.set_lo(0); + res = OLAPEngine::get_instance()->add_transaction( + request.partition_id, request.transaction_id, + related_olap_table->tablet_id(), related_olap_table->schema_hash(), load_id); + + // if related tablet's transaction exists, only push current tablet + if (res == OLAP_ERR_PUSH_TRANSACTION_ALREADY_EXIST) { + OLAP_LOG_WARNING("related tablet's transaction exists in engine, " + "only push current tablet. 
" + "[related_table=%s transaction_id=%ld]", + related_olap_table->full_name().c_str(), + request.transaction_id); + } else { + table_infoes.push_back(TableVars()); + TableVars& new_item = table_infoes.back(); + new_item.olap_table = related_olap_table; + } + } + } + } + olap_table->release_push_lock(); + + if (table_infoes.size() == 1) { + table_infoes.resize(2); + } + + // check delete condition if push for delete + if (push_type == PUSH_FOR_DELETE) { + + for (TableVars& table_var : table_infoes) { + if (table_var.olap_table.get() == NULL) { + continue; + } + + if (request.delete_conditions.size() == 0) { + OLAP_LOG_WARNING("invalid parameters for store_cond. [condition_size=0]"); + res = OLAP_ERR_DELETE_INVALID_PARAMETERS; + goto EXIT; + } + + DeleteConditionHandler del_cond_handler; + table_var.olap_table->obtain_header_rdlock(); + for (const TCondition& cond : request.delete_conditions) { + res = del_cond_handler.check_condition_valid(table_var.olap_table, cond); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to check delete condition. [table=%s res=%d]", + table_var.olap_table->full_name().c_str(), res); + table_var.olap_table->release_header_lock(); + goto EXIT; + } + } + table_var.olap_table->release_header_lock(); + OLAP_LOG_INFO("success to check delete condition when realtime push. " + "[table=%s transaction_id=%ld]", + table_var.olap_table->full_name().c_str(), request.transaction_id); + } + } + + // write + res = _convert(table_infoes[0].olap_table, table_infoes[1].olap_table, + &(table_infoes[0].added_indices), &(table_infoes[1].added_indices), + alter_table_type); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to convert tmp file when realtime push. [res=%d]", res); + goto EXIT; + } + + // add pending data to tablet + for (TableVars& table_var : table_infoes) { + if (table_var.olap_table.get() == NULL) { + continue; + } + + for (Rowset* olap_index : table_var.added_indices) { + + res = table_var.olap_table->add_pending_data( + olap_index, push_type == PUSH_FOR_DELETE ? &request.delete_conditions : NULL); + + // if pending data exists in tablet, which means push finished + if (res == OLAP_ERR_PUSH_TRANSACTION_ALREADY_EXIST) { + SAFE_DELETE(olap_index); + res = OLAP_SUCCESS; + + } else if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to add pending data to tablet. [table=%s transaction_id=%ld]", + table_var.olap_table->full_name().c_str(), request.transaction_id); + goto EXIT; + } + } + } + +EXIT: + // if transaction existed in engine but push not finished, not report to fe + if (res == OLAP_ERR_PUSH_TRANSACTION_ALREADY_EXIST) { + OLAP_LOG_WARNING("find transaction existed when realtime push, not report. ", + "[table=%s partition_id=%ld transaction_id=%ld]", + olap_table->full_name().c_str(), + request.partition_id, request.transaction_id); + return res; + } + + if (res == OLAP_SUCCESS) { + if (tablet_info_vec != NULL) { + _get_tablet_infos(table_infoes, tablet_info_vec); + } + OLAP_LOG_INFO("process realtime push successfully. " + "[table=%s partition_id=%ld transaction_id=%ld]", + olap_table->full_name().c_str(), request.partition_id, request.transaction_id); + } else { + + // error happens, clear + OLAP_LOG_WARNING("failed to process realtime push. 
[table=%s transaction_id=%ld]", + olap_table->full_name().c_str(), request.transaction_id); + for (TableVars& table_var : table_infoes) { + if (table_var.olap_table.get() == NULL) { + continue; + } + + OLAPEngine::get_instance()->delete_transaction( + request.partition_id, request.transaction_id, + table_var.olap_table->tablet_id(), table_var.olap_table->schema_hash()); + + // actually, olap_index may has been deleted in delete_transaction() + for (Rowset* rowset : table_var.added_indices) { + rowset->release(); + OLAPEngine::get_instance()->add_unused_index(rowset); + } + } + } + + return res; +} + void PushHandler::_get_tablet_infos( const vector& table_infoes, vector* tablet_info_vec) { @@ -316,8 +523,8 @@ void PushHandler::_get_tablet_infos( } OLAPStatus PushHandler::_convert( - SmartOLAPTable curr_olap_table, - SmartOLAPTable new_olap_table, + OLAPTablePtr curr_olap_table, + OLAPTablePtr new_olap_table, Indices* curr_olap_indices, Indices* new_olap_indices, AlterTabletType alter_table_type) { @@ -326,7 +533,7 @@ OLAPStatus PushHandler::_convert( BinaryFile raw_file; IBinaryReader* reader = NULL; IWriter* writer = NULL; - OLAPIndex* delta_index = NULL; + Rowset* delta_rowset = NULL; uint32_t num_rows = 0; do { @@ -372,48 +579,59 @@ OLAPStatus PushHandler::_convert( } } - // 2. New OLAPIndex of curr_olap_table for current push - OLAP_LOG_DEBUG("init OLAPIndex."); + // 2. New Rowset of curr_olap_table for current push + OLAP_LOG_DEBUG("init Rowset."); - if (NULL == (delta_index = new(std::nothrow) OLAPIndex( + if (_request.__isset.transaction_id) { + // create pending data dir + string dir_path = curr_olap_table->construct_pending_data_dir_path(); + if (!check_dir_existed(dir_path) && (res = create_dirs(dir_path)) != OLAP_SUCCESS) { + if (!check_dir_existed(dir_path)) { + OLAP_LOG_WARNING("fail to create pending dir. [res=%d table=%s]", + res, curr_olap_table->full_name().c_str()); + break; + } + } + + delta_rowset = new(std::nothrow) Rowset( + curr_olap_table.get(), (_request.push_type == TPushType::LOAD_DELETE), + 0, 0, true, _request.partition_id, _request.transaction_id); + } else { + delta_rowset = new(std::nothrow) Rowset( curr_olap_table.get(), Version(_request.version, _request.version), _request.version_hash, (_request.push_type == TPushType::LOAD_DELETE), - 0, 0))) { - OLAP_LOG_WARNING("fail to malloc OLAPIndex. [table='%s' size=%ld]", - curr_olap_table->full_name().c_str(), sizeof(OLAPIndex)); + 0, 0); + } + + if (NULL == delta_rowset) { + OLAP_LOG_WARNING("fail to malloc Rowset. [table='%s' size=%ld]", + curr_olap_table->full_name().c_str(), sizeof(Rowset)); res = OLAP_ERR_MALLOC_ERROR; break; } - curr_olap_indices->push_back(delta_index); + curr_olap_indices->push_back(delta_rowset); - // 3. New Writer to write data into OLAPIndex + // 3. New Writer to write data into Rowset OLAP_LOG_DEBUG("init writer. [table='%s' block_row_size=%lu]", curr_olap_table->full_name().c_str(), curr_olap_table->num_rows_per_row_block()); - if (NULL == (writer = IWriter::create(curr_olap_table, delta_index, true))) { + if (NULL == (writer = IWriter::create(curr_olap_table, delta_rowset, true))) { OLAP_LOG_WARNING("fail to create writer. [table='%s']", curr_olap_table->full_name().c_str()); res = OLAP_ERR_MALLOC_ERROR; break; - } else if (OLAP_SUCCESS != (res = writer->init())) { - OLAP_LOG_WARNING( - "fail to init writer. 
[res=%d table='%s' version=%u version_hash=%lu]", - res, curr_olap_table->full_name().c_str(), - _request.version, _request.version_hash); - break; } - MemPool* mem_pool = writer->mem_pool(); // 4. Init RowCursor if (OLAP_SUCCESS != (res = row.init(curr_olap_table->tablet_schema()))) { OLAP_LOG_WARNING("fail to init rowcursor. [res=%d]", res); break; } - // 5. Read data from raw file and write into OLAPIndex of curr_olap_table + // 5. Read data from raw file and write into Rowset of curr_olap_table if (_request.__isset.http_file_path) { // Convert from raw to delta OLAP_LOG_DEBUG("start to convert row file to delta."); @@ -426,7 +644,7 @@ OLAPStatus PushHandler::_convert( break; } - res = reader->next(&row, mem_pool); + res = reader->next(&row, writer->mem_pool()); if (OLAP_SUCCESS != res) { OLAP_LOG_WARNING("read next row failed. [res=%d read_rows=%u]", res, num_rows); @@ -453,17 +671,27 @@ OLAPStatus PushHandler::_convert( OLAP_LOG_DEBUG("load the index."); - if (OLAP_SUCCESS != (res = delta_index->load())) { + if (OLAP_SUCCESS != (res = delta_rowset->load())) { OLAP_LOG_WARNING("fail to load index. [res=%d table='%s' version=%ld]", res, curr_olap_table->full_name().c_str(), _request.version); break; } - _write_bytes += delta_index->data_size(); - _write_rows += delta_index->num_rows(); + _write_bytes += delta_rowset->data_size(); + _write_rows += delta_rowset->num_rows(); // 7. Convert data for schema change tables OLAP_LOG_TRACE("load to related tables of schema_change if possible. "); if (NULL != new_olap_table.get()) { + // create related tablet's pending data dir + string dir_path = new_olap_table->construct_pending_data_dir_path(); + if (!check_dir_existed(dir_path) && (res = create_dirs(dir_path)) != OLAP_SUCCESS) { + if (!check_dir_existed(dir_path)) { + OLAP_LOG_WARNING("fail to create pending dir. [res=%d table=%s]", + res, new_olap_table->full_name().c_str()); + break; + } + } + SchemaChangeHandler schema_change; res = schema_change.schema_version_convert( curr_olap_table, @@ -489,22 +717,22 @@ OLAPStatus PushHandler::_convert( } OLAPStatus PushHandler::_validate_request( - SmartOLAPTable olap_table_for_raw, - SmartOLAPTable olap_table_for_schema_change, + OLAPTablePtr olap_table_for_raw, + OLAPTablePtr olap_table_for_schema_change, bool is_new_tablet_effective, PushType push_type) { - const FileVersionMessage* latest_delta = olap_table_for_raw->lastest_delta(); + const PDelta* latest_delta = olap_table_for_raw->lastest_delta(); if (NULL == latest_delta) { - const FileVersionMessage* latest_version = olap_table_for_raw->latest_version(); + const PDelta* lastest_version = olap_table_for_raw->lastest_version(); // PUSH the first version when the version is 0, or // tablet is in alter table status. - if (NULL == latest_version + if (NULL == lastest_version && (0 == _request.version || NULL != olap_table_for_schema_change.get())) { return OLAP_SUCCESS; - } else if (NULL != latest_version - && (latest_version->end_version() + 1 == _request.version)) { + } else if (NULL != lastest_version + && (lastest_version->end_version() + 1 == _request.version)) { return OLAP_SUCCESS; } @@ -533,8 +761,8 @@ OLAPStatus PushHandler::_validate_request( || _request.version > latest_delta->start_version() + 1) { OLAP_LOG_WARNING( "try to push a delta with incorrect version. 
" - "[new_version=%ld latest_version=%u " - "new_version_hash=%ld latest_version_hash=%lu]", + "[new_version=%ld lastest_version=%u " + "new_version_hash=%ld lastest_version_hash=%lu]", _request.version, latest_delta->start_version(), _request.version_hash, latest_delta->version_hash()); return OLAP_ERR_PUSH_VERSION_INCORRECT; @@ -542,8 +770,8 @@ OLAPStatus PushHandler::_validate_request( && _request.version_hash == latest_delta->version_hash()) { OLAP_LOG_WARNING( "try to push a already exist delta. " - "[new_version=%ld latest_version=%u " - "new_version_hash=%ld latest_version_hash=%lu]", + "[new_version=%ld lastest_version=%u " + "new_version_hash=%ld lastest_version_hash=%lu]", _request.version, latest_delta->start_version(), _request.version_hash, latest_delta->version_hash()); return OLAP_ERR_PUSH_VERSION_ALREADY_EXIST; @@ -558,21 +786,21 @@ OLAPStatus PushHandler::_validate_request( // user submit a push job and cancel it soon, but some // tablets already push success. OLAPStatus PushHandler::_get_versions_reverted( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, bool is_new_tablet, PushType push_type, Versions* unused_versions) { - const FileVersionMessage* latest_delta = olap_table->lastest_delta(); + const PDelta* latest_delta = olap_table->lastest_delta(); if (NULL == latest_delta) { - const FileVersionMessage* latest_version = olap_table->latest_version(); + const PDelta* lastest_version = olap_table->lastest_version(); // PUSH the first version, and the version is 0 - if ((NULL == latest_version + if ((NULL == lastest_version && (0 == _request.version || is_new_tablet))) { return OLAP_SUCCESS; - } else if (NULL != latest_version - && latest_version->end_version() + 1 == _request.version) { + } else if (NULL != lastest_version + && lastest_version->end_version() + 1 == _request.version) { return OLAP_SUCCESS; } @@ -610,7 +838,7 @@ OLAPStatus PushHandler::_get_versions_reverted( } OLAPStatus PushHandler::_update_header( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, Versions* unused_versions, Indices* new_indices, Indices* unused_indices) { @@ -644,7 +872,7 @@ OLAPStatus PushHandler::_update_header( void PushHandler::_delete_old_indices(Indices* unused_indices) { if (!unused_indices->empty()) { - OLAPUnusedIndex* unused_index = OLAPUnusedIndex::get_instance(); + OLAPEngine* unused_index = OLAPEngine::get_instance(); for (Indices::iterator it = unused_indices->begin(); it != unused_indices->end(); ++it) { @@ -654,8 +882,8 @@ void PushHandler::_delete_old_indices(Indices* unused_indices) { } OLAPStatus PushHandler::_clear_alter_table_info( - SmartOLAPTable tablet, - SmartOLAPTable related_tablet) { + OLAPTablePtr tablet, + OLAPTablePtr related_tablet) { OLAPStatus res = OLAP_SUCCESS; _obtain_header_wrlock(); @@ -735,7 +963,7 @@ BinaryReader::BinaryReader() } OLAPStatus BinaryReader::init( - SmartOLAPTable table, + OLAPTablePtr table, BinaryFile* file) { OLAPStatus res = OLAP_SUCCESS; @@ -865,7 +1093,7 @@ LzoBinaryReader::LzoBinaryReader() } OLAPStatus LzoBinaryReader::init( - SmartOLAPTable table, + OLAPTablePtr table, BinaryFile* file) { OLAPStatus res = OLAP_SUCCESS; diff --git a/be/src/olap/push_handler.h b/be/src/olap/push_handler.h index 2314394ee8..cd4270c01b 100644 --- a/be/src/olap/push_handler.h +++ b/be/src/olap/push_handler.h @@ -25,14 +25,14 @@ #include "olap/file_helper.h" #include "olap/merger.h" #include "olap/olap_common.h" -#include "olap/olap_index.h" +#include "olap/rowset.h" #include "olap/row_cursor.h" #include "olap/writer.h" namespace palo 
{ typedef std::vector DataSources; -typedef std::vector Indices; +typedef std::vector Indices; class BinaryFile; class BinaryReader; @@ -40,7 +40,7 @@ class ColumnMapping; class RowCursor; struct TableVars { - SmartOLAPTable olap_table; + OLAPTablePtr olap_table; Versions unused_versions; Indices unused_indices; Indices added_indices; @@ -55,17 +55,24 @@ public: // Load local data file into specified tablet. OLAPStatus process( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, const TPushReq& request, PushType push_type, std::vector* tablet_info_vec); + + OLAPStatus process_realtime_push( + OLAPTablePtr olap_table, + const TPushReq& request, + PushType push_type, + std::vector* tablet_info_vec); + int64_t write_bytes() const { return _write_bytes; } int64_t write_rows() const { return _write_rows; } private: // Validate request, mainly data version check. OLAPStatus _validate_request( - SmartOLAPTable olap_table_for_raw, - SmartOLAPTable olap_table_for_schema_change, + OLAPTablePtr olap_table_for_raw, + OLAPTablePtr olap_table_for_schema_change, bool is_rollup_new_table, PushType push_type); @@ -73,23 +80,23 @@ private: // user submit a push job and cancel it soon, but some // tablets already push success. OLAPStatus _get_versions_reverted( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, bool is_schema_change_tablet, PushType push_type, Versions* unused_versions); // Convert local data file to internal formatted delta, - // return new delta's OLAPIndex + // return new delta's Rowset OLAPStatus _convert( - SmartOLAPTable curr_olap_table, - SmartOLAPTable new_olap_table_vec, + OLAPTablePtr curr_olap_table, + OLAPTablePtr new_olap_table_vec, Indices* curr_olap_indices, Indices* new_olap_indices, AlterTabletType alter_table_type); // Update header info when new version add or dirty version removed. OLAPStatus _update_header( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, Versions* unused_versions, Indices* new_indices, Indices* unused_indices); @@ -99,15 +106,15 @@ private: // Clear schema change information. OLAPStatus _clear_alter_table_info( - SmartOLAPTable olap_table, - SmartOLAPTable related_olap_table); + OLAPTablePtr olap_table, + OLAPTablePtr related_olap_table); // Only for debug std::string _debug_version_list(const Versions& versions) const; // Lock tablet header before read header info. void _obtain_header_rdlock() { - for (std::list::iterator it = _olap_table_arr.begin(); + for (std::list::iterator it = _olap_table_arr.begin(); it != _olap_table_arr.end(); ++it) { OLAP_LOG_DEBUG("obtain all header locks rd. [table='%s']", (*it)->full_name().c_str()); @@ -119,7 +126,7 @@ private: // Locak tablet header before write header info. void _obtain_header_wrlock() { - for (std::list::iterator it = _olap_table_arr.begin(); + for (std::list::iterator it = _olap_table_arr.begin(); it != _olap_table_arr.end(); ++it) { OLAP_LOG_DEBUG( "obtain all header locks wr. [table='%s']", (*it)->full_name().c_str()); @@ -132,7 +139,7 @@ private: // Release tablet header lock. void _release_header_lock() { if (_header_locked) { - for (std::list::reverse_iterator it = _olap_table_arr.rbegin(); + for (std::list::reverse_iterator it = _olap_table_arr.rbegin(); it != _olap_table_arr.rend(); ++it) { OLAP_LOG_DEBUG( "release all header locks. 
[table='%s']", (*it)->full_name().c_str()); @@ -152,7 +159,7 @@ private: // maily contains specified tablet object // contains related tables also if in schema change, tablet split or rollup - std::list _olap_table_arr; + std::list _olap_table_arr; // lock tablet header before modify tabelt header bool _header_locked; @@ -196,7 +203,7 @@ public: static IBinaryReader* create(bool need_decompress); virtual ~IBinaryReader() {} - virtual OLAPStatus init(SmartOLAPTable table, BinaryFile* file) = 0; + virtual OLAPStatus init(OLAPTablePtr table, BinaryFile* file) = 0; virtual OLAPStatus finalize() = 0; virtual OLAPStatus next(RowCursor* row, MemPool* mem_pool) = 0; @@ -218,7 +225,7 @@ protected: } BinaryFile* _file; - SmartOLAPTable _table; + OLAPTablePtr _table; size_t _content_len; size_t _curr; uint32_t _adler_checksum; @@ -233,7 +240,7 @@ public: finalize(); } - virtual OLAPStatus init(SmartOLAPTable table, BinaryFile* file); + virtual OLAPStatus init(OLAPTablePtr table, BinaryFile* file); virtual OLAPStatus finalize(); virtual OLAPStatus next(RowCursor* row, MemPool* mem_pool); @@ -254,7 +261,7 @@ public: finalize(); } - virtual OLAPStatus init(SmartOLAPTable table, BinaryFile* file); + virtual OLAPStatus init(OLAPTablePtr table, BinaryFile* file); virtual OLAPStatus finalize(); virtual OLAPStatus next(RowCursor* row, MemPool* mem_pool); diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp index d2c3aac9ab..df9a9e9c8b 100644 --- a/be/src/olap/reader.cpp +++ b/be/src/olap/reader.cpp @@ -177,7 +177,7 @@ OLAPStatus CollectIterator::init(Reader* reader) { _reader = reader; // when aggregate is enabled or key_type is DUP_KEYS, we don't merge // multiple data to aggregate for performance in user fetch - if (_reader->_reader_type == READER_FETCH && + if (_reader->_reader_type == READER_QUERY && (_reader->_aggregation || _reader->_olap_table->keys_type() == KeysType::DUP_KEYS)) { _merge = false; @@ -285,7 +285,7 @@ Reader::Reader() : _next_key_index(0), _aggregation(false), _version_locked(false), - _reader_type(READER_FETCH), + _reader_type(READER_QUERY), _next_delete_flag(false), _next_key(NULL), _merged_rows(0) { @@ -487,10 +487,8 @@ OLAPStatus Reader::_acquire_data_sources(const ReaderParams& read_params) { _olap_table->release_header_lock(); if (_own_data_sources.size() < 1) { - OLAP_LOG_WARNING("fail to acquire data sources. [table_name='%s' version=%d-%d]", - _olap_table->full_name().c_str(), - _version.first, - _version.second); + LOG(WARNING) << "fail to acquire data sources. 
[table_name='" << _olap_table->full_name() + << "' version=" << _version.first << "-" << _version.second << "]"; return OLAP_ERR_VERSION_NOT_EXIST; } data_sources = &_own_data_sources; @@ -499,13 +497,13 @@ OLAPStatus Reader::_acquire_data_sources(const ReaderParams& read_params) { // do not use index stream cache when be/ce/alter/checksum, // to avoid bringing down lru cache hit ratio bool is_using_cache = true; - if (read_params.reader_type != READER_FETCH) { + if (read_params.reader_type != READER_QUERY) { is_using_cache = false; } for (auto i_data: *data_sources) { // skip empty version - if (i_data->empty()) { + if (i_data->empty() || i_data->zero_num_rows()) { continue; } i_data->set_delete_handler(_delete_handler); @@ -588,7 +586,7 @@ OLAPStatus Reader::_init_params(const ReaderParams& read_params) { } OLAPStatus Reader::_init_return_columns(const ReaderParams& read_params) { - if (read_params.reader_type == READER_FETCH) { + if (read_params.reader_type == READER_QUERY) { _return_columns = read_params.return_columns; if (_delete_handler.conditions_num() != 0 && read_params.aggregation) { set column_set(_return_columns.begin(), _return_columns.end()); diff --git a/be/src/olap/reader.h b/be/src/olap/reader.h index 1303cc127c..060034806a 100644 --- a/be/src/olap/reader.h +++ b/be/src/olap/reader.h @@ -46,7 +46,7 @@ class RuntimeState; // Params for Reader, // mainly include tablet, data version and fetch range. struct ReaderParams { - SmartOLAPTable olap_table; + OLAPTablePtr olap_table; ReaderType reader_type; bool aggregation; Version version; @@ -62,7 +62,7 @@ struct ReaderParams { RuntimeState* runtime_state; ReaderParams() : - reader_type(READER_FETCH), + reader_type(READER_QUERY), aggregation(true), profile(NULL), runtime_state(NULL) { @@ -196,7 +196,7 @@ private: Version _version; - SmartOLAPTable _olap_table; + OLAPTablePtr _olap_table; // _own_data_sources is data source that reader aquire from olap_table, so we need to // release these when reader closing diff --git a/be/src/olap/row_block.h b/be/src/olap/row_block.h index adb80af109..81cd9fffc9 100644 --- a/be/src/olap/row_block.h +++ b/be/src/olap/row_block.h @@ -83,6 +83,14 @@ public: cursor->attach(_mem_buf + row_index * _mem_row_bytes); } + inline void set_row(uint32_t row_index, const RowCursor& cursor) const { + memcpy(_mem_buf + row_index * _mem_row_bytes, cursor.get_buf(), _mem_row_bytes); + } + + inline void set_row(uint32_t row_index, const char* row) const { + memcpy(_mem_buf + row_index * _mem_row_bytes, row, _mem_row_bytes); + } + // called when finished fill this row_block OLAPStatus finalize(uint32_t row_num); diff --git a/be/src/olap/row_cursor.cpp b/be/src/olap/row_cursor.cpp index aedd6769fb..650d6a7e29 100644 --- a/be/src/olap/row_cursor.cpp +++ b/be/src/olap/row_cursor.cpp @@ -89,6 +89,7 @@ OLAPStatus RowCursor::_init(const std::vector& tablet_schema, } else if (type == OLAP_FIELD_TYPE_HLL) { _variable_len += HLL_COLUMN_DEFAULT_LEN + sizeof(HllContext*); } + _string_columns.push_back(cid); } _fixed_buf = new (nothrow) char[_fixed_len]; diff --git a/be/src/olap/row_cursor.h b/be/src/olap/row_cursor.h index 57ee24eb8a..43e11308d1 100644 --- a/be/src/olap/row_cursor.h +++ b/be/src/olap/row_cursor.h @@ -171,6 +171,8 @@ public: char* get_field_ptr(uint32_t cid) const { return _fixed_buf + _field_offsets[cid]; } char* get_field_content_ptr(uint32_t cid) const { return _fixed_buf + _field_offsets[cid] + 1; } + size_t get_field_offset(uint32_t cid) const { return _field_offsets[cid] + 1; } + std::vector& 
get_string_columns() { return _string_columns; } inline uint32_t hash_code(uint32_t seed) const; private: @@ -184,6 +186,7 @@ private: size_t _key_column_num; // key num in row_cursor std::vector _columns; // column_id in schema + std::vector _string_columns; // column_id in schema char* _fixed_buf = nullptr; // point to fixed buf size_t _fixed_len; char* _owned_fixed_buf = nullptr; // point to buf allocated in init function diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index c04ae7708a..b49915b3b4 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -53,12 +53,12 @@ bool version_entity_sorter(const VersionEntity& a, const VersionEntity& b) { } RowBlockChanger::RowBlockChanger(const std::vector &tablet_schema, - const SmartOLAPTable &ref_olap_table) { + const OLAPTablePtr &ref_olap_table) { _schema_mapping.resize(tablet_schema.size()); } RowBlockChanger::RowBlockChanger(const vector& tablet_schema, - const SmartOLAPTable& ref_olap_table, + const OLAPTablePtr& ref_olap_table, const DeleteHandler& delete_handler) { _schema_mapping.resize(tablet_schema.size()); _delete_handler = delete_handler; @@ -537,7 +537,7 @@ void RowBlockAllocator::release(RowBlock* row_block) { delete row_block; } -RowBlockMerger::RowBlockMerger(SmartOLAPTable olap_table) : _olap_table(olap_table) {} +RowBlockMerger::RowBlockMerger(OLAPTablePtr olap_table) : _olap_table(olap_table) {} RowBlockMerger::~RowBlockMerger() {} @@ -547,8 +547,6 @@ bool RowBlockMerger::merge( uint64_t* merged_rows) { uint64_t tmp_merged_rows = 0; RowCursor row_cursor; - MemPool* mem_pool = writer->mem_pool(); - if (row_cursor.init(_olap_table->tablet_schema()) != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to init row cursor."); goto MERGE_ERR; @@ -566,7 +564,7 @@ bool RowBlockMerger::merge( OLAP_LOG_WARNING("writer error."); goto MERGE_ERR; } - row_cursor.allocate_memory_for_string_type(_olap_table->tablet_schema(), mem_pool); + row_cursor.allocate_memory_for_string_type(_olap_table->tablet_schema(), writer->mem_pool()); row_cursor.agg_init(*(_heap.top().row_cursor)); @@ -650,12 +648,12 @@ bool RowBlockMerger::_pop_heap() { } LinkedSchemaChange::LinkedSchemaChange( - SmartOLAPTable base_olap_table, SmartOLAPTable new_olap_table) : + OLAPTablePtr base_olap_table, OLAPTablePtr new_olap_table) : _base_olap_table(base_olap_table), _new_olap_table(new_olap_table) {} SchemaChangeDirectly::SchemaChangeDirectly( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, const RowBlockChanger& row_block_changer) : _olap_table(olap_table), _row_block_changer(row_block_changer), @@ -671,7 +669,6 @@ SchemaChangeDirectly::~SchemaChangeDirectly() { } bool SchemaChangeDirectly::_write_row_block(IWriter* writer, RowBlock* row_block) { - MemPool* mem_pool = writer->mem_pool(); for (uint32_t i = 0; i < row_block->row_block_info().row_num; i++) { if (OLAP_SUCCESS != writer->attached_by(_dst_cursor)) { OLAP_LOG_WARNING("fail to attach writer"); @@ -680,58 +677,57 @@ bool SchemaChangeDirectly::_write_row_block(IWriter* writer, RowBlock* row_block row_block->get_row(i, _src_cursor); - _dst_cursor->copy(*_src_cursor, mem_pool); + _dst_cursor->copy(*_src_cursor, writer->mem_pool()); writer->next(*_dst_cursor); } return true; } -bool LinkedSchemaChange::process(IData* olap_data, OLAPIndex* new_olap_index) { +bool LinkedSchemaChange::process(IData* olap_data, Rowset* new_rowset) { for (size_t i = 0; i < olap_data->olap_index()->num_segments(); ++i) { - string index_path = _new_olap_table->construct_index_file_path( - 
new_olap_index->version(), new_olap_index->version_hash(), i); - string base_table_index_path = _base_olap_table->construct_index_file_path( - new_olap_index->version(), new_olap_index->version_hash(), i); + string index_path = new_rowset->construct_index_file_path(new_rowset->rowset_id(), i); + string base_table_index_path = olap_data->olap_index()->construct_index_file_path(olap_data->olap_index()->rowset_id(), i); if (link(base_table_index_path.c_str(), index_path.c_str()) == 0) { OLAP_LOG_DEBUG("success to create hard link. [from_path=%s to_path=%s]", base_table_index_path.c_str(), index_path.c_str()); } else { - OLAP_LOG_WARNING("fail to create hard link. [from_path=%s to_path=%s]", - base_table_index_path.c_str(), index_path.c_str()); + LOG(WARNING) << "fail to create hard link. [from_path=" << base_table_index_path.c_str() + << " to_path=" << index_path.c_str() + << " errno=" << Errno::no() << " errno_str=" << Errno::str() << "]"; return false; } - string data_path = _new_olap_table->construct_data_file_path( - new_olap_index->version(), new_olap_index->version_hash(), i); - string base_table_data_path = _base_olap_table->construct_data_file_path( - new_olap_index->version(), new_olap_index->version_hash(), i); + string data_path = new_rowset->construct_data_file_path(new_rowset->rowset_id(), i); + string base_table_data_path = olap_data->olap_index()->construct_data_file_path(olap_data->olap_index()->rowset_id(), i); if (link(base_table_data_path.c_str(), data_path.c_str()) == 0) { OLAP_LOG_DEBUG("success to create hard link. [from_path=%s to_path=%s]", base_table_data_path.c_str(), data_path.c_str()); } else { - OLAP_LOG_WARNING("fail to create hard link. [from_path=%s to_path=%s]", - base_table_data_path.c_str(), data_path.c_str()); + LOG(WARNING) << "fail to create hard link. [from_path=" << base_table_data_path.c_str() + << " to_path=" << data_path.c_str() + << " errno=" << Errno::no() << " errno_str=" << Errno::str() << "]"; return false; } } - new_olap_index->set_num_segments(olap_data->olap_index()->num_segments()); + new_rowset->set_empty(olap_data->empty()); + new_rowset->set_num_segments(olap_data->olap_index()->num_segments()); + new_rowset->add_column_statistics(olap_data->olap_index()->get_column_statistics()); - if (OLAP_SUCCESS != new_olap_index->load()) { + if (OLAP_SUCCESS != new_rowset->load()) { OLAP_LOG_WARNING("fail to reload index. 
[table='%s' version='%d-%d']", _new_olap_table->full_name().c_str(), - new_olap_index->version().first, - new_olap_index->version().second); + new_rowset->version().first, + new_rowset->version().second); return false; } return true; } -bool SchemaChangeDirectly::process(IData* olap_data, OLAPIndex* new_olap_index) { - bool result = true; - DataFileType data_file_type = new_olap_index->table()->data_file_type(); +bool SchemaChangeDirectly::process(IData* olap_data, Rowset* new_rowset) { + DataFileType data_file_type = new_rowset->table()->data_file_type(); bool null_supported = true; if (NULL == _row_block_allocator) { @@ -770,47 +766,47 @@ bool SchemaChangeDirectly::process(IData* olap_data, OLAPIndex* new_olap_index) } RowBlock* ref_row_block = NULL; - OLAPStatus res = olap_data->get_first_row_block(&ref_row_block); - if (OLAP_SUCCESS != res) { - // Create empty version when olap_data is empty - if (olap_data->eof()) { - OLAP_LOG_DEBUG("src delta is empty, create an empty version instead."); - res = create_init_version( - new_olap_index->table()->tablet_id(), - new_olap_index->table()->schema_hash(), - new_olap_index->version(), - new_olap_index->version_hash(), - new_olap_index); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create init version. [res=%d]", res); - result = false; + bool need_create_empty_version = false; + OLAPStatus res = OLAP_SUCCESS; + if (!olap_data->empty()) { + res = olap_data->get_first_row_block(&ref_row_block); + if (res != OLAP_SUCCESS) { + if (olap_data->eof()) { + need_create_empty_version = true; + } else { + LOG(WARNING) << "failed to get first row block."; + return false; } - } else { - OLAP_LOG_WARNING("failed to get first row block."); - result = false; } - - return result; + } else { + need_create_empty_version = true; } - OLAP_LOG_DEBUG("init writer. [table='%s' block_row_size=%lu]", - _olap_table->full_name().c_str(), - _olap_table->num_rows_per_row_block()); + if (need_create_empty_version) { + res = create_init_version(new_rowset->table()->tablet_id(), + new_rowset->table()->schema_hash(), + new_rowset->version(), + new_rowset->version_hash(), + new_rowset); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "create empty version for schema change failed." + << "version=" << new_rowset->version().first << "-" << new_rowset->version().second; + return false; + } + return true; + } + VLOG(3) << "init writer. table=" << _olap_table->full_name() << ", " + << "block_row_size=" << _olap_table->num_rows_per_row_block(); + bool result = true; RowBlock* new_row_block = NULL; - IWriter* writer = IWriter::create(_olap_table, new_olap_index, false); + IWriter* writer = IWriter::create(_olap_table, new_rowset, false); if (NULL == writer) { OLAP_LOG_WARNING("failed to create writer."); result = false; goto DIRECTLY_PROCESS_ERR; } - if (OLAP_SUCCESS != writer->init()) { - OLAP_LOG_WARNING("failed to init writer."); - result = false; - goto DIRECTLY_PROCESS_ERR; - } - // Reset filted_rows and merged_rows statistic reset_merged_rows(); reset_filted_rows(); @@ -864,11 +860,11 @@ bool SchemaChangeDirectly::process(IData* olap_data, OLAPIndex* new_olap_index) goto DIRECTLY_PROCESS_ERR; } - if (OLAP_SUCCESS != new_olap_index->load()) { + if (OLAP_SUCCESS != new_rowset->load()) { OLAP_LOG_WARNING("fail to reload index. 
[table='%s' version='%d-%d']", _olap_table->full_name().c_str(), - new_olap_index->version().first, - new_olap_index->version().second); + new_rowset->version().first, + new_rowset->version().second); result = false; goto DIRECTLY_PROCESS_ERR; } @@ -878,18 +874,18 @@ bool SchemaChangeDirectly::process(IData* olap_data, OLAPIndex* new_olap_index) // Check row num changes if (config::row_nums_check) { if (olap_data->olap_index()->num_rows() - != new_olap_index->num_rows() + merged_rows() + filted_rows()) { + != new_rowset->num_rows() + merged_rows() + filted_rows()) { OLAP_LOG_FATAL("fail to check row num! " "[source_rows=%lu merged_rows=%lu filted_rows=%lu new_index_rows=%lu]", olap_data->olap_index()->num_rows(), - merged_rows(), filted_rows(), new_olap_index->num_rows()); + merged_rows(), filted_rows(), new_rowset->num_rows()); result = false; } } else { OLAP_LOG_INFO("all row nums. " "[source_rows=%lu merged_rows=%lu filted_rows=%lu new_index_rows=%lu]", olap_data->olap_index()->num_rows(), - merged_rows(), filted_rows(), new_olap_index->num_rows()); + merged_rows(), filted_rows(), new_rowset->num_rows()); } DIRECTLY_PROCESS_ERR: @@ -898,7 +894,7 @@ DIRECTLY_PROCESS_ERR: return result; } -SchemaChangeWithSorting::SchemaChangeWithSorting(SmartOLAPTable olap_table, +SchemaChangeWithSorting::SchemaChangeWithSorting(OLAPTablePtr olap_table, const RowBlockChanger& row_block_changer, size_t memory_limitation) : _olap_table(olap_table), @@ -919,7 +915,7 @@ SchemaChangeWithSorting::~SchemaChangeWithSorting() { SAFE_DELETE(_row_block_allocator); } -bool SchemaChangeWithSorting::process(IData* olap_data, OLAPIndex* new_olap_index) { +bool SchemaChangeWithSorting::process(IData* olap_data, Rowset* new_rowset) { if (NULL == _row_block_allocator) { if (NULL == (_row_block_allocator = new(nothrow) RowBlockAllocator( _olap_table->tablet_schema(), _memory_limitation))) { @@ -929,34 +925,42 @@ bool SchemaChangeWithSorting::process(IData* olap_data, OLAPIndex* new_olap_inde } } - bool result = true; - DataFileType data_file_type = new_olap_index->table()->data_file_type(); + DataFileType data_file_type = new_rowset->table()->data_file_type(); bool null_supported = true; RowBlock* ref_row_block = NULL; - OLAPStatus res = olap_data->get_first_row_block(&ref_row_block); - - if (OLAP_SUCCESS != res) { - if (olap_data->eof()) { - OLAP_LOG_DEBUG("src delta is empty, create an empty version instead."); - res = create_init_version( - new_olap_index->table()->tablet_id(), - new_olap_index->table()->schema_hash(), - new_olap_index->version(), - new_olap_index->version_hash(), - new_olap_index); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to create init version. [res=%d]", res); - result = false; + bool need_create_empty_version = false; + OLAPStatus res = OLAP_SUCCESS; + if (!olap_data->empty()) { + res = olap_data->get_first_row_block(&ref_row_block); + if (res != OLAP_SUCCESS) { + if (olap_data->eof()) { + need_create_empty_version = true; + } else { + LOG(WARNING) << "failed to get first row block."; + return false; } - } else { - OLAP_LOG_WARNING("failed to get first row block."); - result = false; } - - return result; + } else { + need_create_empty_version = true; } + if (need_create_empty_version) { + res = create_init_version(new_rowset->table()->tablet_id(), + new_rowset->table()->schema_hash(), + new_rowset->version(), + new_rowset->version_hash(), + new_rowset); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "create empty version for schema change failed." 
+ << "version=" << new_rowset->version().first << "-" << new_rowset->version().second; + return false; + } + return true; + } + + + bool result = true; RowBlockSorter row_block_sorter(_row_block_allocator); // for internal sorting @@ -964,7 +968,7 @@ bool SchemaChangeWithSorting::process(IData* olap_data, OLAPIndex* new_olap_inde vector row_block_arr; // for external sorting - vector olap_index_arr; + vector olap_rowsets; _temp_delta_versions.first = _temp_delta_versions.second; @@ -990,18 +994,18 @@ bool SchemaChangeWithSorting::process(IData* olap_data, OLAPIndex* new_olap_inde } // enter here while memory limitation is reached. - OLAPIndex* olap_index = NULL; + Rowset* rowset = NULL; if (!_internal_sorting(row_block_arr, Version(_temp_delta_versions.second, _temp_delta_versions.second), - &olap_index)) { + &rowset)) { OLAP_LOG_WARNING("failed to sorting internally."); result = false; goto SORTING_PROCESS_ERR; } - olap_index_arr.push_back(olap_index); + olap_rowsets.push_back(rowset); for (vector::iterator it = row_block_arr.begin(); it != row_block_arr.end(); ++it) { @@ -1046,17 +1050,17 @@ bool SchemaChangeWithSorting::process(IData* olap_data, OLAPIndex* new_olap_inde if (!row_block_arr.empty()) { // enter here while memory limitation is reached. - OLAPIndex* olap_index = NULL; + Rowset* rowset = NULL; if (!_internal_sorting(row_block_arr, Version(_temp_delta_versions.second, _temp_delta_versions.second), - &olap_index)) { + &rowset)) { OLAP_LOG_WARNING("failed to sorting internally."); result = false; goto SORTING_PROCESS_ERR; } - olap_index_arr.push_back(olap_index); + olap_rowsets.push_back(rowset); for (vector::iterator it = row_block_arr.begin(); it != row_block_arr.end(); ++it) { @@ -1070,7 +1074,7 @@ bool SchemaChangeWithSorting::process(IData* olap_data, OLAPIndex* new_olap_inde } // TODO(zyh): 如果_temp_delta_versionsåªæœ‰ä¸€ä¸ªï¼Œä¸éœ€è¦å†å¤–排 - if (!_external_sorting(olap_index_arr, new_olap_index)) { + if (!_external_sorting(olap_rowsets, new_rowset)) { OLAP_LOG_WARNING("failed to sorting externally."); result = false; goto SORTING_PROCESS_ERR; @@ -1081,23 +1085,23 @@ bool SchemaChangeWithSorting::process(IData* olap_data, OLAPIndex* new_olap_inde // Check row num changes if (config::row_nums_check) { if (olap_data->olap_index()->num_rows() - != new_olap_index->num_rows() + merged_rows() + filted_rows()) { + != new_rowset->num_rows() + merged_rows() + filted_rows()) { OLAP_LOG_WARNING("fail to check row num! " "[source_rows=%lu merged_rows=%lu filted_rows=%lu new_index_rows=%lu]", olap_data->olap_index()->num_rows(), - merged_rows(), filted_rows(), new_olap_index->num_rows()); + merged_rows(), filted_rows(), new_rowset->num_rows()); result = false; } } else { OLAP_LOG_INFO("all row nums. 
" "[source_rows=%lu merged_rows=%lu filted_rows=%lu new_index_rows=%lu]", olap_data->olap_index()->num_rows(), - merged_rows(), filted_rows(), new_olap_index->num_rows()); + merged_rows(), filted_rows(), new_rowset->num_rows()); } SORTING_PROCESS_ERR: - for (vector::iterator it = olap_index_arr.begin(); - it != olap_index_arr.end(); ++it) { + for (vector::iterator it = olap_rowsets.begin(); + it != olap_rowsets.end(); ++it) { (*it)->delete_all_files(); SAFE_DELETE(*it); } @@ -1113,19 +1117,18 @@ SORTING_PROCESS_ERR: bool SchemaChangeWithSorting::_internal_sorting(const vector& row_block_arr, const Version& temp_delta_versions, - OLAPIndex** temp_olap_index) { + Rowset** temp_rowset) { IWriter* writer = NULL; uint64_t merged_rows = 0; RowBlockMerger merger(_olap_table); - (*temp_olap_index) = new(nothrow) OLAPIndex(_olap_table.get(), + (*temp_rowset) = new(nothrow) Rowset(_olap_table.get(), temp_delta_versions, rand(), false, - 0, - 0); - if (NULL == (*temp_olap_index)) { - OLAP_LOG_WARNING("failed to malloc OLAPIndex. [size=%ld]", sizeof(OLAPIndex)); + 0, 0); + if (NULL == (*temp_rowset)) { + OLAP_LOG_WARNING("failed to malloc Rowset. [size=%ld]", sizeof(Rowset)); goto INTERNAL_SORTING_ERR; } @@ -1133,24 +1136,19 @@ bool SchemaChangeWithSorting::_internal_sorting(const vector& row_blo _olap_table->full_name().c_str(), _olap_table->num_rows_per_row_block()); - writer = IWriter::create(_olap_table, *temp_olap_index, false); + writer = IWriter::create(_olap_table, *temp_rowset, false); if (NULL == writer) { OLAP_LOG_WARNING("failed to create writer."); goto INTERNAL_SORTING_ERR; } - if (OLAP_SUCCESS != writer->init()) { - OLAP_LOG_WARNING("failed to init writer."); - goto INTERNAL_SORTING_ERR; - } - if (!merger.merge(row_block_arr, writer, &merged_rows)) { OLAP_LOG_WARNING("failed to merge row blocks."); goto INTERNAL_SORTING_ERR; } add_merged_rows(merged_rows); - if (OLAP_SUCCESS != (*temp_olap_index)->load()) { + if (OLAP_SUCCESS != (*temp_rowset)->load()) { OLAP_LOG_WARNING("failed to reload olap index."); goto INTERNAL_SORTING_ERR; } @@ -1161,22 +1159,22 @@ bool SchemaChangeWithSorting::_internal_sorting(const vector& row_blo INTERNAL_SORTING_ERR: SAFE_DELETE(writer); - (*temp_olap_index)->delete_all_files(); - SAFE_DELETE(*temp_olap_index); + (*temp_rowset)->delete_all_files(); + SAFE_DELETE(*temp_rowset); return false; } bool SchemaChangeWithSorting::_external_sorting( - vector& src_olap_index_arr, - OLAPIndex* dest_olap_index) { - Merger merger(_olap_table, dest_olap_index, READER_ALTER_TABLE); + vector& src_rowsets, + Rowset* dest_rowset) { + Merger merger(_olap_table, dest_rowset, READER_ALTER_TABLE); uint64_t merged_rows = 0; uint64_t filted_rows = 0; vector olap_data_arr; - for (vector::iterator it = src_olap_index_arr.begin(); - it != src_olap_index_arr.end(); ++it) { + for (vector::iterator it = src_rowsets.begin(); + it != src_rowsets.end(); ++it) { IData* olap_data = IData::create(*it); if (NULL == olap_data) { OLAP_LOG_WARNING("fail to create IData."); @@ -1194,22 +1192,21 @@ bool SchemaChangeWithSorting::_external_sorting( } } - if (OLAP_SUCCESS != merger.merge( - olap_data_arr, false, &merged_rows, &filted_rows)) { + if (OLAP_SUCCESS != merger.merge(olap_data_arr, &merged_rows, &filted_rows)) { OLAP_LOG_WARNING("fail to merge deltas. 
[table='%s' version='%d-%d']", _olap_table->full_name().c_str(), - dest_olap_index->version().first, - dest_olap_index->version().second); + dest_rowset->version().first, + dest_rowset->version().second); goto EXTERNAL_SORTING_ERR; } add_merged_rows(merged_rows); add_filted_rows(filted_rows); - if (OLAP_SUCCESS != dest_olap_index->load()) { + if (OLAP_SUCCESS != dest_rowset->load()) { OLAP_LOG_WARNING("fail to reload index. [table='%s' version='%d-%d']", _olap_table->full_name().c_str(), - dest_olap_index->version().first, - dest_olap_index->version().second); + dest_rowset->version().first, + dest_rowset->version().second); goto EXTERNAL_SORTING_ERR; } @@ -1226,7 +1223,7 @@ EXTERNAL_SORTING_ERR: SAFE_DELETE(*it); } - dest_olap_index->delete_all_files(); + dest_rowset->delete_all_files(); return false; } @@ -1236,12 +1233,12 @@ OLAPStatus SchemaChangeHandler::clear_schema_change_single_info( AlterTabletType* alter_table_type, bool only_one, bool check_only) { - SmartOLAPTable olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); + OLAPTablePtr olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); return clear_schema_change_single_info(olap_table, alter_table_type, only_one, check_only); } OLAPStatus SchemaChangeHandler::clear_schema_change_single_info( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, AlterTabletType* type, bool only_one, bool check_only) { @@ -1273,7 +1270,7 @@ OLAPStatus SchemaChangeHandler::clear_schema_change_single_info( } OLAPStatus SchemaChangeHandler::_check_and_clear_schema_change_info( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, const TAlterTabletReq& request) { // check for schema change chain ( A->B) // broken old relation if a chain was found and there is no version to be changed @@ -1309,7 +1306,7 @@ OLAPStatus SchemaChangeHandler::_check_and_clear_schema_change_info( // clear schema change info of current tablet { - AutoRWLock auto_lock(olap_table->get_header_lock_ptr(), false); + WriteLock wrlock(olap_table->get_header_lock_ptr()); res = clear_schema_change_single_info( olap_table->tablet_id(), olap_table->schema_hash(), &type, true, false); if (res != OLAP_SUCCESS) { @@ -1327,7 +1324,7 @@ OLAPStatus SchemaChangeHandler::_check_and_clear_schema_change_info( } // clear schema change info of related tablet - SmartOLAPTable tablet = OLAPEngine::get_instance()->get_table( + OLAPTablePtr tablet = OLAPEngine::get_instance()->get_table( tablet_id, schema_hash); if (tablet.get() == NULL) { OLAP_LOG_WARNING("get null tablet! [tablet_id=%ld schema_hash=%d]", @@ -1336,7 +1333,7 @@ OLAPStatus SchemaChangeHandler::_check_and_clear_schema_change_info( } { - AutoRWLock auto_lock(tablet->get_header_lock_ptr(), false); + WriteLock wrlock(tablet->get_header_lock_ptr()); res = clear_schema_change_single_info( tablet_id, schema_hash, &type, true, false); if (res != OLAP_SUCCESS) { @@ -1370,7 +1367,7 @@ OLAPStatus SchemaChangeHandler::process_alter_table( } // 2. Get base table - SmartOLAPTable ref_olap_table = OLAPEngine::get_instance()->get_table( + OLAPTablePtr ref_olap_table = OLAPEngine::get_instance()->get_table( request.base_tablet_id, request.base_schema_hash); if (ref_olap_table.get() == NULL) { OLAP_LOG_WARNING("fail to find base table. [base_table=%ld base_schema_hash=%d]", @@ -1390,7 +1387,7 @@ OLAPStatus SchemaChangeHandler::process_alter_table( } // 4. return failed if new table already exist in OLAPEngine. 
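// ---------------------------------------------------------------------------
// Illustrative aside (not part of the patch): the hunks above swap the old
// AutoRWLock(lock, /*is_read=*/false) helper for the new scoped WriteLock
// guard when mutating a tablet header. Below is a minimal sketch of that RAII
// pattern, assuming only pthread; RWMutexSketch, WriteLockSketch and
// HeaderOwner are hypothetical stand-ins for RWMutex, WriteLock and OLAPTable.
#include <pthread.h>

class RWMutexSketch {
public:
    RWMutexSketch() { pthread_rwlock_init(&_lock, nullptr); }
    ~RWMutexSketch() { pthread_rwlock_destroy(&_lock); }
    void rdlock() { pthread_rwlock_rdlock(&_lock); }
    void wrlock() { pthread_rwlock_wrlock(&_lock); }
    void unlock() { pthread_rwlock_unlock(&_lock); }
private:
    pthread_rwlock_t _lock;
};

// Scoped guard: takes the write lock in the constructor and releases it in
// the destructor, so every early-return path unlocks the header correctly.
class WriteLockSketch {
public:
    explicit WriteLockSketch(RWMutexSketch* m) : _m(m) { _m->wrlock(); }
    ~WriteLockSketch() { _m->unlock(); }
private:
    RWMutexSketch* _m;
};

struct HeaderOwner {                      // hypothetical stand-in for OLAPTable
    RWMutexSketch header_lock;
    RWMutexSketch* get_header_lock_ptr() { return &header_lock; }
};

bool clear_schema_change_info_sketch(HeaderOwner* tablet) {
    WriteLockSketch wrlock(tablet->get_header_lock_ptr()); // was: AutoRWLock(..., false)
    // ... mutate the header under the lock; it is released automatically when
    //     wrlock goes out of scope, including on early returns ...
    return true;
}
// ---------------------------------------------------------------------------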
- SmartOLAPTable new_tablet = OLAPEngine::get_instance()->get_table( + OLAPTablePtr new_tablet = OLAPEngine::get_instance()->get_table( request.new_tablet_req.tablet_id, request.new_tablet_req.tablet_schema.schema_hash); if (new_tablet.get() != NULL) { res = OLAP_SUCCESS; @@ -1405,10 +1402,10 @@ OLAPStatus SchemaChangeHandler::process_alter_table( OLAPStatus SchemaChangeHandler::_do_alter_table( AlterTabletType type, - SmartOLAPTable ref_olap_table, + OLAPTablePtr ref_olap_table, const TAlterTabletReq& request) { OLAPStatus res = OLAP_SUCCESS; - SmartOLAPTable new_olap_table; + OLAPTablePtr new_olap_table; string base_root_path = ref_olap_table->storage_root_path_name(); OLAP_LOG_INFO("begin to do alter tablet job. new table[%d]", request.new_tablet_req.tablet_id); @@ -1424,11 +1421,118 @@ OLAPStatus SchemaChangeHandler::_do_alter_table( return res; } + // set schema change status temporarily, + // after waiting transactions to finish, will calculate versions again + vector tmp_versions_to_be_changed; + tmp_versions_to_be_changed.push_back(Version(-1, -1)); + ref_olap_table->obtain_push_lock(); + ref_olap_table->obtain_header_wrlock(); + new_olap_table->obtain_header_wrlock(); + res = _save_schema_change_info(type, ref_olap_table, new_olap_table, tmp_versions_to_be_changed); + new_olap_table->release_header_lock(); + ref_olap_table->release_header_lock(); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to save schema change info before waiting transactions. " + "[base=%s new=%s res=%d]", ref_olap_table->full_name().c_str(), + new_olap_table->full_name().c_str(), res); + ref_olap_table->release_push_lock(); + OLAPEngine::get_instance()->drop_table( + new_olap_table->tablet_id(), new_olap_table->schema_hash()); + return res; + } + + // get current transactions + int64_t partition_id; + std::set transaction_ids; + OLAPEngine::get_instance()-> + get_transactions_by_tablet(ref_olap_table, &partition_id, &transaction_ids); + ref_olap_table->release_push_lock(); + + // wait transactions to publish version + int num = 0; + while (!transaction_ids.empty()) { + OLAP_LOG_DEBUG("wait transactions when schema change. [tablet='%s' transaction_size=%d]", + ref_olap_table->full_name().c_str(), transaction_ids.size()); + num++; + if (num % 100 == 0) { + for (int64_t transaction_id : transaction_ids) { + LOG(INFO) << "transaction_id is waiting by schema_change: " << transaction_id; + } + } + sleep(1); + // erase finished transaction + vector finished_transactions; + for (int64_t transaction_id : transaction_ids) { + if (!OLAPEngine::get_instance()->has_transaction( + partition_id, transaction_id, + ref_olap_table->tablet_id(), ref_olap_table->schema_hash())) { + finished_transactions.push_back(transaction_id); + } + } + for (int64_t transaction_id : finished_transactions) { + transaction_ids.erase(transaction_id); + OLAP_LOG_DEBUG("transaction finished when schema change is waiting. " + "[tablet=%s transaction_id=%ld transaction_size=%d]", + ref_olap_table->full_name().c_str(), transaction_id, transaction_ids.size()); + } + } + // 2. Get version_to_be_changed and store into table header ref_olap_table->obtain_push_lock(); ref_olap_table->obtain_header_wrlock(); new_olap_table->obtain_header_wrlock(); + // before calculating version_to_be_changed, + // remove all data from new tablet, prevent to rewrite data(those double pushed when wait) + OLAP_LOG_DEBUG("begin to remove all data from new tablet to prevent rewrite. 
[new_tablet=%s]", + new_olap_table->full_name().c_str()); + // only remove the version <= base_tablet's latest version + const PDelta* lastest_file_version = ref_olap_table->lastest_version(); + if (lastest_file_version != NULL) { + OLAP_LOG_DEBUG("find the latest version of base tablet when remove all data from new. " + "[base_tablet=%s version=%d-%d]", ref_olap_table->full_name().c_str(), + lastest_file_version->start_version(), lastest_file_version->end_version()); + vector new_tablet_versions; + new_olap_table->list_versions(&new_tablet_versions); + for (vector::const_iterator it = new_tablet_versions.begin(); + it != new_tablet_versions.end(); ++it) { + if (it->second <= lastest_file_version->end_version()) { + std::vector rowsets; + res = new_olap_table->unregister_data_source(*it, &rowsets); + if (res != OLAP_SUCCESS) { + break; + } + for (Rowset* rowset : rowsets) { + rowset->delete_all_files(); + delete rowset; + } + OLAP_LOG_DEBUG("unregister data source from new tablet when schema change. " + "[new_tablet=%s version=%d-%d res=%d]", + new_olap_table->full_name().c_str(), it->first, it->second, res); + } + } + // save header + if (res == OLAP_SUCCESS) { + res = new_olap_table->save_header(); + if (res != OLAP_SUCCESS) { + OLAP_LOG_WARNING("fail to save header after unregister data source " + "when schema change. [new_tablet=%s res=%d]", + new_olap_table->full_name().c_str(), res); + } + } + // if failed, return + if (res != OLAP_SUCCESS) { + new_olap_table->release_header_lock(); + ref_olap_table->release_header_lock(); + ref_olap_table->release_push_lock(); + OLAPEngine::get_instance()->drop_table( + new_olap_table->tablet_id(), new_olap_table->schema_hash()); + OLAP_LOG_WARNING("fail to remove data from new tablet when schema_change. " + "[new_tablet=%s]", new_olap_table->full_name().c_str()); + return res; + } + } + vector versions_to_be_changed; vector olap_data_arr; // delete handlers for new olap table @@ -1533,21 +1637,21 @@ OLAPStatus SchemaChangeHandler::_do_alter_table( } OLAPStatus SchemaChangeHandler::_create_new_olap_table( - const SmartOLAPTable ref_olap_table, + const OLAPTablePtr ref_olap_table, const TCreateTabletReq& request, const string* ref_root_path, - SmartOLAPTable* out_new_olap_table) { + OLAPTablePtr* out_new_olap_table) { OLAPStatus res = OLAP_SUCCESS; OLAPTable* new_olap_table = NULL; bool is_table_added = false; // 1. Lock to ensure that all _create_new_olap_table operation execute in serial - static MutexLock create_table_lock; + static Mutex create_table_lock; create_table_lock.lock(); do { // 2. Create table with only header, no deltas - OLAPTable* new_olap_table = OLAPEngine::get_instance()->create_table( + OLAPTablePtr new_olap_table = OLAPEngine::get_instance()->create_table( request, ref_root_path, true, ref_olap_table); if (new_olap_table == NULL) { OLAP_LOG_WARNING("failed to create table. [table=%ld xml_path=%d]", @@ -1588,8 +1692,8 @@ OLAPStatus SchemaChangeHandler::_create_new_olap_table( // 4. Register table into OLAPRootPath, so that we can manage table from // the perspective of root path. // Example: unregister all tables when a bad disk found. - res = OLAPRootPath::get_instance()->register_table_into_root_path( - new_olap_table); + res = OLAPEngine::get_instance()->register_table_into_root_path( + new_olap_table.get()); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to register table into root path. 
" "[root_path='%s' table='%s']", @@ -1598,7 +1702,7 @@ OLAPStatus SchemaChangeHandler::_create_new_olap_table( break; } - SmartOLAPTable olap_table; + OLAPTablePtr olap_table; olap_table = OLAPEngine::get_instance()->get_table( request.tablet_id, request.tablet_schema.schema_hash); if (olap_table.get() == NULL) { @@ -1609,13 +1713,6 @@ OLAPStatus SchemaChangeHandler::_create_new_olap_table( break; } - // 5. Copy selectivity information - res = _copy_table_attributes(ref_olap_table, olap_table); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("failed to copy table attributes"); - break; - } - if (out_new_olap_table != NULL) { *out_new_olap_table = olap_table; } @@ -1626,13 +1723,12 @@ OLAPStatus SchemaChangeHandler::_create_new_olap_table( res = OLAPEngine::get_instance()->drop_table( request.tablet_id, request.tablet_schema.schema_hash); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to drop table when create table failed. " - "[res=%d table=%s]", - res, new_olap_table->full_name().c_str()); + LOG(WARNING) << "fail to drop table when create table failed. res=" << res + << ", tablet=" << request.tablet_id + << ":" << request.tablet_schema.schema_hash; } } else if (NULL != new_olap_table) { new_olap_table->delete_all_files(); - SAFE_DELETE(new_olap_table); } } @@ -1641,11 +1737,11 @@ OLAPStatus SchemaChangeHandler::_create_new_olap_table( } OLAPStatus SchemaChangeHandler::schema_version_convert( - SmartOLAPTable src_olap_table, - SmartOLAPTable dest_olap_table, - vector* ref_olap_indices, - vector* new_olap_indices) { - if (NULL == new_olap_indices) { + OLAPTablePtr src_olap_table, + OLAPTablePtr dest_olap_table, + vector* ref_rowsets, + vector* new_rowsets) { + if (NULL == new_rowsets) { OLAP_LOG_WARNING("new_olap_index is NULL."); return OLAP_ERR_INPUT_PARAMETER_ERROR; } @@ -1701,8 +1797,8 @@ OLAPStatus SchemaChangeHandler::schema_version_convert( // c. è½¬æ¢æ•°æ® IData* olap_data = NULL; - for (vector::iterator it = ref_olap_indices->begin(); - it != ref_olap_indices->end(); ++it) { + for (vector::iterator it = ref_rowsets->begin(); + it != ref_rowsets->end(); ++it) { IData* olap_data = IData::create(*it); if (NULL == olap_data) { OLAP_LOG_WARNING("fail to create IData."); @@ -1712,26 +1808,41 @@ OLAPStatus SchemaChangeHandler::schema_version_convert( olap_data->init(); - OLAPIndex* new_olap_index = new(nothrow) OLAPIndex( - dest_olap_table.get(), - olap_data->version(), - olap_data->version_hash(), - olap_data->delete_flag(), - 0, - olap_data->max_timestamp()); + Rowset* new_rowset = nullptr; + if ((*it)->transaction_id() == 0) { + new_rowset = new Rowset(dest_olap_table.get(), + olap_data->version(), + olap_data->version_hash(), + olap_data->delete_flag(), + (*it)->rowset_id(), 0); + } else { + new_rowset = new Rowset(dest_olap_table.get(), + olap_data->delete_flag(), + (*it)->rowset_id(), 0, + (*it)->is_pending(), + (*it)->partition_id(), + (*it)->transaction_id()); + } - if (NULL == new_olap_index) { - OLAP_LOG_FATAL("failed to malloc OLAPIndex. [size=%ld]", sizeof(OLAPIndex)); + if (NULL == new_rowset) { + OLAP_LOG_FATAL("failed to malloc Rowset. [size=%ld]", sizeof(Rowset)); res = OLAP_ERR_MALLOC_ERROR; goto SCHEMA_VERSION_CONVERT_ERR; } - new_olap_indices->push_back(new_olap_index); + new_rowsets->push_back(new_rowset); - if (!sc_procedure->process(olap_data, new_olap_index)) { - OLAP_LOG_WARNING("failed to process the version. 
[version='%d-%d']", - (*it)->version().first, - (*it)->version().second); + if (!sc_procedure->process(olap_data, new_rowset)) { + if ((*it)->is_pending()) { + OLAP_LOG_WARNING("failed to process the transaction when schema change. " + "[table='%s' transaction=%ld]", + (*it)->table()->full_name().c_str(), + (*it)->transaction_id()); + } else { + OLAP_LOG_WARNING("failed to process the version. [version='%d-%d']", + (*it)->version().first, + (*it)->version().second); + } res = OLAP_ERR_INPUT_PARAMETER_ERROR; goto SCHEMA_VERSION_CONVERT_ERR; } @@ -1745,11 +1856,11 @@ OLAPStatus SchemaChangeHandler::schema_version_convert( return res; SCHEMA_VERSION_CONVERT_ERR: - while (!new_olap_indices->empty()) { - OLAPIndex* olap_index = new_olap_indices->back(); - olap_index->delete_all_files(); - SAFE_DELETE(olap_index); - new_olap_indices->pop_back(); + while (!new_rowsets->empty()) { + Rowset* rowset = new_rowsets->back(); + rowset->delete_all_files(); + SAFE_DELETE(rowset); + new_rowsets->pop_back(); } SAFE_DELETE(sc_procedure); @@ -1758,12 +1869,12 @@ SCHEMA_VERSION_CONVERT_ERR: } OLAPStatus SchemaChangeHandler::_get_versions_to_be_changed( - SmartOLAPTable ref_olap_table, + OLAPTablePtr ref_olap_table, vector& versions_to_be_changed) { int32_t request_version = 0; - const FileVersionMessage* latest_version = ref_olap_table->latest_version(); - if (latest_version != NULL) { - request_version = latest_version->end_version() - 1; + const PDelta* lastest_version = ref_olap_table->lastest_version(); + if (lastest_version != NULL) { + request_version = lastest_version->end_version() - 1; } else { OLAP_LOG_WARNING("Table has no version. [path='%s']", ref_olap_table->full_name().c_str()); @@ -1790,45 +1901,27 @@ OLAPStatus SchemaChangeHandler::_get_versions_to_be_changed( } } versions_to_be_changed.push_back( - Version(latest_version->start_version(), latest_version->end_version())); + Version(lastest_version->start_version(), lastest_version->end_version())); return OLAP_SUCCESS; } -// 创建新olap-table时,从ref-table中拷è´selectivityä¿¡æ¯ -OLAPStatus SchemaChangeHandler::_copy_table_attributes( - SmartOLAPTable ref_olap_table, - SmartOLAPTable new_olap_table) { - OLAPStatus res = OLAP_SUCCESS; - - vector ref_selectivities; - - if (OLAP_SUCCESS != (res = ref_olap_table->get_selectivities(&ref_selectivities))) { - OLAP_LOG_WARNING("failed to get selectivities from ref_table. [table='%s']", - ref_olap_table->full_name().c_str()); - return res; - } - - // TODO(lijiao) : 这样的selectivity是正确的么? - vector new_selectivities(new_olap_table->tablet_schema().size(), 1); - - for (size_t i = 0; i < new_olap_table->tablet_schema().size(); ++i) { - new_selectivities[i] = i < ref_selectivities.size() ? - ref_selectivities[i] : new_selectivities[i - 1]; - } - - new_olap_table->set_selectivities(new_selectivities); - - return res; -} - - // 增加A->(B|C|...) çš„schema_changeä¿¡æ¯ OLAPStatus SchemaChangeHandler::_save_schema_change_info( AlterTabletType alter_table_type, - SmartOLAPTable ref_olap_table, - SmartOLAPTable new_olap_table, + OLAPTablePtr ref_olap_table, + OLAPTablePtr new_olap_table, const vector& versions_to_be_changed) { + + // check new table exists, + // prevent to set base's status after new's dropping (clear base's status) + if (OLAPEngine::get_instance()->get_table( + new_olap_table->tablet_id(), new_olap_table->schema_hash()).get() == NULL) { + OLAP_LOG_WARNING("fail to find table before saving status. 
[table='%s']", + new_olap_table->full_name().c_str()); + return OLAP_ERR_TABLE_NOT_FOUND; + } + OLAPStatus res = OLAP_SUCCESS; // 1. 在新表和旧表中添加schema change标志 @@ -1943,16 +2036,15 @@ OLAPStatus SchemaChangeHandler::_alter_table(SchemaChangeParams* sc_params) { (*it)->version().second); // we create a new delta with the same version as the IData processing currently. - OLAPIndex* new_olap_index = new(nothrow) OLAPIndex( + Rowset* new_rowset = new(nothrow) Rowset( sc_params->new_olap_table.get(), (*it)->version(), (*it)->version_hash(), (*it)->delete_flag(), - 0, - (*it)->max_timestamp()); + (*it)->olap_index()->rowset_id(), 0); - if (new_olap_index == NULL) { - OLAP_LOG_WARNING("failed to malloc OLAPIndex. [size=%ld]", sizeof(OLAPIndex)); + if (new_rowset == NULL) { + OLAP_LOG_WARNING("failed to malloc Rowset. [size=%ld]", sizeof(Rowset)); res = OLAP_ERR_MALLOC_ERROR; goto PROCESS_ALTER_EXIT; } @@ -1962,12 +2054,11 @@ OLAPStatus SchemaChangeHandler::_alter_table(SchemaChangeParams* sc_params) { if (DEL_SATISFIED == del_ret) { OLAP_LOG_DEBUG("filter delta in schema change: %d, %d", (*it)->version().first, (*it)->version().second); - res = sc_procedure->create_init_version( - new_olap_index->table()->tablet_id(), - new_olap_index->table()->schema_hash(), - new_olap_index->version(), - new_olap_index->version_hash(), - new_olap_index); + res = sc_procedure->create_init_version(new_rowset->table()->tablet_id(), + new_rowset->table()->schema_hash(), + new_rowset->version(), + new_rowset->version_hash(), + new_rowset); sc_procedure->add_filted_rows((*it)->num_rows()); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to create init version. [res=%d]", res); @@ -1984,12 +2075,12 @@ OLAPStatus SchemaChangeHandler::_alter_table(SchemaChangeParams* sc_params) { (*it)->set_delete_status(DEL_NOT_SATISFIED); } - if (DEL_SATISFIED != del_ret && !sc_procedure->process(*it, new_olap_index)) { + if (DEL_SATISFIED != del_ret && !sc_procedure->process(*it, new_rowset)) { //if del_ret is DEL_SATISFIED, the new delta version has already been created in new_olap_table OLAP_LOG_WARNING("failed to process the version. [version='%d-%d']", (*it)->version().first, (*it)->version().second); - new_olap_index->delete_all_files(); - SAFE_DELETE(new_olap_index); + new_rowset->delete_all_files(); + SAFE_DELETE(new_rowset); res = OLAP_ERR_INPUT_PARAMETER_ERROR; goto PROCESS_ALTER_EXIT; @@ -2003,14 +2094,16 @@ OLAPStatus SchemaChangeHandler::_alter_table(SchemaChangeParams* sc_params) { if (!sc_params->new_olap_table->has_version((*it)->version())) { // register version - res = sc_params->new_olap_table->register_data_source(new_olap_index); + std::vector rowset_vec; + rowset_vec.push_back(new_rowset); + res = sc_params->new_olap_table->register_data_source(rowset_vec); if (OLAP_SUCCESS != res) { OLAP_LOG_WARNING("failed to register new version. 
[table='%s' version='%d-%d']", sc_params->new_olap_table->full_name().c_str(), (*it)->version().first, (*it)->version().second); - new_olap_index->delete_all_files(); - SAFE_DELETE(new_olap_index); + new_rowset->delete_all_files(); + SAFE_DELETE(new_rowset); sc_params->new_olap_table->release_header_lock(); sc_params->ref_olap_table->release_header_lock(); @@ -2027,8 +2120,8 @@ OLAPStatus SchemaChangeHandler::_alter_table(SchemaChangeParams* sc_params) { "[table='%s' version='%d-%d']", sc_params->new_olap_table->full_name().c_str(), (*it)->version().first, (*it)->version().second); - new_olap_index->delete_all_files(); - SAFE_DELETE(new_olap_index); + new_rowset->delete_all_files(); + SAFE_DELETE(new_rowset); } // ä¿å­˜header @@ -2087,7 +2180,7 @@ PROCESS_ALTER_EXIT: if (OLAP_SUCCESS == res) { sc_params->ref_olap_table->set_schema_change_status( - ALTER_TABLE_DONE, + ALTER_TABLE_FINISHED, sc_params->new_olap_table->schema_hash(), -1); } else { @@ -2095,7 +2188,7 @@ PROCESS_ALTER_EXIT: } sc_params->new_olap_table->set_schema_change_status( - ALTER_TABLE_DONE, + ALTER_TABLE_FINISHED, sc_params->ref_olap_table->schema_hash(), -1); OLAP_LOG_DEBUG("set alter table job status. [status=%d]", @@ -2123,8 +2216,8 @@ PROCESS_ALTER_EXIT: // @static // 分æžcolumnçš„mapping以åŠfilter keyçš„mapping -OLAPStatus SchemaChangeHandler::_parse_request(SmartOLAPTable ref_olap_table, - SmartOLAPTable new_olap_table, +OLAPStatus SchemaChangeHandler::_parse_request(OLAPTablePtr ref_olap_table, + OLAPTablePtr new_olap_table, RowBlockChanger* rb_changer, bool* sc_sorting, bool* sc_directly) { @@ -2289,11 +2382,11 @@ OLAPStatus SchemaChange::create_init_version( SchemaHash schema_hash, Version version, VersionHash version_hash, - OLAPIndex* olap_index) { + Rowset* rowset) { OLAP_LOG_DEBUG("begin to create init version. [begin=%d end=%d]", version.first, version.second); - SmartOLAPTable table; + OLAPTablePtr table; IWriter* writer = NULL; OLAPStatus res = OLAP_SUCCESS; @@ -2314,29 +2407,23 @@ OLAPStatus SchemaChange::create_init_version( } // Create writer, which write nothing to table, to generate empty data file - writer = IWriter::create(table, olap_index, false); + writer = IWriter::create(table, rowset, false); if (writer == NULL) { - OLAP_LOG_WARNING("fail to create writer. [table=%s]", table->full_name().c_str()); + LOG(WARNING) << "fail to create writer. [table=" << table->full_name() << "]"; res = OLAP_ERR_MALLOC_ERROR; break; } - res = writer->init(); - if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to init writer. [table=%s]", table->full_name().c_str()); - break; - } - res = writer->finalize(); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to finalize writer. [table=%s]", table->full_name().c_str()); + LOG(WARNING) << "fail to finalize writer. [table=" << table->full_name() << "]"; break; } // Load new index and add to table - res = olap_index->load(); + res = rowset->load(); if (res != OLAP_SUCCESS) { - OLAP_LOG_WARNING("fail to load new index. [table=%s]", table->full_name().c_str()); + LOG(WARNING) << "fail to load new index. 
[table=" << table->full_name() << "]"; break; } } while (0); diff --git a/be/src/olap/schema_change.h b/be/src/olap/schema_change.h index a9dd807c7a..da678faef7 100644 --- a/be/src/olap/schema_change.h +++ b/be/src/olap/schema_change.h @@ -55,11 +55,11 @@ public: typedef std::vector SchemaMapping; RowBlockChanger(const std::vector& tablet_schema, - const SmartOLAPTable& ref_olap_table, + const OLAPTablePtr& ref_olap_table, const DeleteHandler& delete_handler); RowBlockChanger(const std::vector& tablet_schema, - const SmartOLAPTable& ref_olap_table); + const OLAPTablePtr& ref_olap_table); virtual ~RowBlockChanger(); @@ -117,7 +117,7 @@ private: class RowBlockMerger { public: - explicit RowBlockMerger(SmartOLAPTable olap_table); + explicit RowBlockMerger(OLAPTablePtr olap_table); virtual ~RowBlockMerger(); bool merge( @@ -139,7 +139,7 @@ private: bool _make_heap(const std::vector& row_block_arr); bool _pop_heap(); - SmartOLAPTable _olap_table; + OLAPTablePtr _olap_table; std::priority_queue _heap; }; @@ -148,7 +148,7 @@ public: SchemaChange() : _filted_rows(0), _merged_rows(0) {} virtual ~SchemaChange() {} - virtual bool process(IData* olap_data, OLAPIndex* new_olap_index) = 0; + virtual bool process(IData* olap_data, Rowset* new_olap_index) = 0; void add_filted_rows(uint64_t filted_rows) { _filted_rows += filted_rows; @@ -179,7 +179,7 @@ public: TSchemaHash schema_hash, Version version, VersionHash version_hash, - OLAPIndex* olap_index); + Rowset* olap_index); private: uint64_t _filted_rows; @@ -189,14 +189,14 @@ private: class LinkedSchemaChange : public SchemaChange { public: explicit LinkedSchemaChange( - SmartOLAPTable base_olap_table, - SmartOLAPTable new_olap_table); + OLAPTablePtr base_olap_table, + OLAPTablePtr new_olap_table); ~LinkedSchemaChange() {} - bool process(IData* olap_data, OLAPIndex* new_olap_index); + bool process(IData* olap_data, Rowset* new_olap_index); private: - SmartOLAPTable _base_olap_table; - SmartOLAPTable _new_olap_table; + OLAPTablePtr _base_olap_table; + OLAPTablePtr _new_olap_table; DISALLOW_COPY_AND_ASSIGN(LinkedSchemaChange); }; @@ -206,14 +206,14 @@ public: // @params olap_table the instance of table which has new schema. 
// @params row_block_changer changer to modifiy the data of RowBlock explicit SchemaChangeDirectly( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, const RowBlockChanger& row_block_changer); virtual ~SchemaChangeDirectly(); - virtual bool process(IData* olap_data, OLAPIndex* new_olap_index); + virtual bool process(IData* olap_data, Rowset* new_olap_index); private: - SmartOLAPTable _olap_table; + OLAPTablePtr _olap_table; const RowBlockChanger& _row_block_changer; RowBlockAllocator* _row_block_allocator; RowCursor* _src_cursor; @@ -228,24 +228,24 @@ private: class SchemaChangeWithSorting : public SchemaChange { public: explicit SchemaChangeWithSorting( - SmartOLAPTable olap_table, + OLAPTablePtr olap_table, const RowBlockChanger& row_block_changer, size_t memory_limitation); virtual ~SchemaChangeWithSorting(); - virtual bool process(IData* olap_data, OLAPIndex* new_olap_index); + virtual bool process(IData* olap_data, Rowset* new_olap_index); private: bool _internal_sorting( const std::vector& row_block_arr, const Version& temp_delta_versions, - OLAPIndex** temp_olap_index); + Rowset** temp_olap_index); bool _external_sorting( - std::vector& src_olap_index_arr, - OLAPIndex* olap_index); + std::vector& src_olap_index_arr, + Rowset* olap_index); - SmartOLAPTable _olap_table; + OLAPTablePtr _olap_table; const RowBlockChanger& _row_block_changer; size_t _memory_limitation; Version _temp_delta_versions; @@ -262,10 +262,10 @@ public: OLAPStatus process_alter_table(AlterTabletType alter_table_type, const TAlterTabletReq& request); - OLAPStatus schema_version_convert(SmartOLAPTable ref_olap_table, - SmartOLAPTable new_olap_table, - std::vector* ref_olap_indices, - std::vector* new_olap_indices); + OLAPStatus schema_version_convert(OLAPTablePtr ref_olap_table, + OLAPTablePtr new_olap_table, + std::vector* ref_olap_indices, + std::vector* new_olap_indices); // 清空一个table下的schema_changeä¿¡æ¯ï¼šåŒ…括split_talbe以åŠå…¶ä»–schema_changeä¿¡æ¯ // è¿™é‡Œåªæ¸…ç†è‡ªèº«çš„out链,ä¸è€ƒè™‘relatedçš„table @@ -288,7 +288,7 @@ public: bool only_one, bool check_only); - static OLAPStatus clear_schema_change_single_info(SmartOLAPTable olap_table, + static OLAPStatus clear_schema_change_single_info(OLAPTablePtr olap_table, AlterTabletType* alter_table_type, bool only_one, bool check_only); @@ -300,21 +300,21 @@ private: // Returns: // æˆåŠŸï¼šå¦‚æžœå­˜åœ¨åŽ†å²ä¿¡æ¯ï¼Œæ²¡æœ‰é—®é¢˜çš„就清空;或者没有历å²ä¿¡æ¯ // 失败:å¦åˆ™å¦‚果有历å²ä¿¡æ¯ä¸”无法清空的(有version还没有完æˆï¼‰ - OLAPStatus _check_and_clear_schema_change_info(SmartOLAPTable olap_table, + OLAPStatus _check_and_clear_schema_change_info(OLAPTablePtr olap_table, const TAlterTabletReq& request); - OLAPStatus _get_versions_to_be_changed(SmartOLAPTable ref_olap_table, + OLAPStatus _get_versions_to_be_changed(OLAPTablePtr ref_olap_table, std::vector& versions_to_be_changed); OLAPStatus _do_alter_table(AlterTabletType type, - SmartOLAPTable ref_olap_table, + OLAPTablePtr ref_olap_table, const TAlterTabletReq& request); struct SchemaChangeParams { // 为了让calc_split_key也å¯ä½¿ç”¨æ™®é€šschema_change的线程,æ‰è®¾ç½®äº†æ­¤type AlterTabletType alter_table_type; - SmartOLAPTable ref_olap_table; - SmartOLAPTable new_olap_table; + OLAPTablePtr ref_olap_table; + OLAPTablePtr new_olap_table; std::vector ref_olap_data_arr; std::string debug_message; DeleteHandler delete_handler; @@ -324,25 +324,23 @@ private: }; // æ ¹æ®ç»™å®šçš„table_desc,创建OLAPTable,并挂接到OLAPEngine中 - OLAPStatus _create_new_olap_table(const SmartOLAPTable ref_olap_table, + OLAPStatus _create_new_olap_table(const OLAPTablePtr 
ref_olap_table, const TCreateTabletReq& create_tablet_req, const std::string* ref_root_path, - SmartOLAPTable* out_new_olap_table); - - OLAPStatus _copy_table_attributes(SmartOLAPTable ref_olap_table, SmartOLAPTable new_olap_table); + OLAPTablePtr* out_new_olap_table); // 增加A->(B|C|...) çš„schema_changeä¿¡æ¯ // 在split table时,增加split-table statusç›¸å…³çš„ä¿¡æ¯ // 其他的都增加在schema-change status中 OLAPStatus _save_schema_change_info(AlterTabletType alter_table_type, - SmartOLAPTable ref_olap_table, - SmartOLAPTable new_olap_table, + OLAPTablePtr ref_olap_table, + OLAPTablePtr new_olap_table, const std::vector& versions_to_be_changed); static OLAPStatus _alter_table(SchemaChangeParams* sc_params); - static OLAPStatus _parse_request(SmartOLAPTable ref_olap_table, - SmartOLAPTable new_olap_table, + static OLAPStatus _parse_request(OLAPTablePtr ref_olap_table, + OLAPTablePtr new_olap_table, RowBlockChanger* rb_changer, bool* sc_sorting, bool* sc_directly); diff --git a/be/src/olap/store.cpp b/be/src/olap/store.cpp old mode 100644 new mode 100755 diff --git a/be/src/olap/types.h b/be/src/olap/types.h index 706512c18f..1939c6d263 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -36,11 +36,11 @@ namespace palo { class TypeInfo { public: - inline int equal(char* left, char* right) const { + inline int equal(const char* left, const char* right) const { return _equal(left, right); } - inline int cmp(char* left, char* right) const { + inline int cmp(const char* left, const char* right) const { return _cmp(left, right); } @@ -89,16 +89,6 @@ private: extern TypeInfo* get_type_info(FieldType field_type); -// TODO: NullOffset -struct NullOffset { - int byte_offset; - uint8_t bit_mask; // to extract null - - NullOffset(int byte_offset, int bit_offset) - : byte_offset(byte_offset), - bit_mask(bit_offset == -1 ? 0 : 1 << (7 - bit_offset)) {} -}; - template struct FieldTypeTraits {}; template diff --git a/be/src/olap/utils.cpp b/be/src/olap/utils.cpp index 3c472cbf82..437c598d45 100644 --- a/be/src/olap/utils.cpp +++ b/be/src/olap/utils.cpp @@ -32,6 +32,8 @@ #include #include "common/logging.h" +#include "gutil/strings/substitute.h" +#include "olap/new_status.h" #include "olap/olap_common.h" #include "olap/olap_define.h" @@ -1044,7 +1046,7 @@ OLAPStatus move_to_trash(const boost::filesystem::path& schema_hash_root, // 2. generate new file path static uint64_t delete_counter = 0; // a global counter to avoid file name duplication. - static MutexLock lock; // lock for delete_counter + static Mutex lock; // lock for delete_counter std::stringstream new_file_dir_stream; lock.lock(); // when file_path points to a schema_path, we need to save tablet info in trash_path, @@ -1077,12 +1079,12 @@ OLAPStatus move_to_trash(const boost::filesystem::path& schema_hash_root, string source_parent_dir = schema_hash_root.parent_path().string(); // tablet_id level std::set sub_dirs, sub_files; if (dir_walk(source_parent_dir, &sub_dirs, &sub_files) != OLAP_SUCCESS) { - OLAP_LOG_INFO("access dir failed. [dir=%s]", source_parent_dir.c_str()); + LOG(INFO) << "access dir failed. [dir=" << source_parent_dir << "]"; // This error is nothing serious. so we still return success. 
return OLAP_SUCCESS; } - if (sub_files.empty() && sub_files.empty()) { - OLAP_LOG_INFO("remove empty dir %s", source_parent_dir.c_str()); + if (sub_dirs.empty() && sub_files.empty()) { + LOG(INFO) << "remove empty dir " << source_parent_dir; // no need to exam return status remove_dir(source_parent_dir); } @@ -1090,20 +1092,20 @@ OLAPStatus move_to_trash(const boost::filesystem::path& schema_hash_root, return OLAP_SUCCESS; } -MutexLock::MutexLock() { +Mutex::Mutex() { PTHREAD_MUTEX_INIT_WITH_LOG(&_lock, NULL); } -MutexLock::~MutexLock() { +Mutex::~Mutex() { PTHREAD_MUTEX_DESTROY_WITH_LOG(&_lock); } -OLAPStatus MutexLock::lock() { +OLAPStatus Mutex::lock() { PTHREAD_MUTEX_LOCK_WITH_LOG(&_lock); return OLAP_SUCCESS; } -OLAPStatus MutexLock::trylock() { +OLAPStatus Mutex::trylock() { if (0 != pthread_mutex_trylock(&_lock)) { OLAP_LOG_DEBUG("failed to got the mutex lock. [err='%m']"); return OLAP_ERR_RWLOCK_ERROR; @@ -1112,25 +1114,47 @@ OLAPStatus MutexLock::trylock() { return OLAP_SUCCESS; } -OLAPStatus MutexLock::unlock() { +OLAPStatus Mutex::unlock() { PTHREAD_MUTEX_UNLOCK_WITH_LOG(&_lock); return OLAP_SUCCESS; } -RWLock::RWLock() { - PTHREAD_RWLOCK_INIT_WITH_LOG(&_lock, NULL); +RWMutex::RWMutex(Priority prio) { + int kind = PTHREAD_RWLOCK_PREFER_READER_NP; + switch (prio) { + case Priority::PREFER_READING: + kind = PTHREAD_RWLOCK_PREFER_READER_NP; + break; + case Priority::PREFER_WRITING: + kind = PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP; + break; + } + + // Initialize the new rwlock with the user's preference. + pthread_rwlockattr_t attr; + int rv = pthread_rwlockattr_init(&attr); + DCHECK_EQ(0, rv) << strerror(rv); + + rv = pthread_rwlockattr_setkind_np(&attr, kind); + DCHECK_EQ(0, rv) << strerror(rv); + + rv = pthread_rwlock_init(&_lock, &attr); + DCHECK_EQ(0, rv) << strerror(rv); + + rv = pthread_rwlockattr_destroy(&attr); + DCHECK_EQ(0, rv) << strerror(rv); } -RWLock::~RWLock() { +RWMutex::~RWMutex() { PTHREAD_RWLOCK_DESTROY_WITH_LOG(&_lock); } -OLAPStatus RWLock::rdlock() { +OLAPStatus RWMutex::rdlock() { PTHREAD_RWLOCK_RDLOCK_WITH_LOG(&_lock); return OLAP_SUCCESS; } -OLAPStatus RWLock::tryrdlock() { +OLAPStatus RWMutex::tryrdlock() { if (0 != pthread_rwlock_tryrdlock(&_lock)) { OLAP_LOG_DEBUG("failed to got the rwlock rdlock. [err='%m']"); return OLAP_ERR_RWLOCK_ERROR; @@ -1139,7 +1163,7 @@ OLAPStatus RWLock::tryrdlock() { return OLAP_SUCCESS; } -OLAPStatus RWLock::trywrlock() { +OLAPStatus RWMutex::trywrlock() { if (0 != pthread_rwlock_trywrlock(&_lock)) { OLAP_LOG_DEBUG("failed to got the rwlock rdlock. [err='%m']"); return OLAP_ERR_RWLOCK_ERROR; @@ -1148,17 +1172,17 @@ OLAPStatus RWLock::trywrlock() { return OLAP_SUCCESS; } -OLAPStatus RWLock::wrlock() { +OLAPStatus RWMutex::wrlock() { PTHREAD_RWLOCK_WRLOCK_WITH_LOG(&_lock); return OLAP_SUCCESS; } -OLAPStatus RWLock::unlock() { +OLAPStatus RWMutex::unlock() { PTHREAD_RWLOCK_UNLOCK_WITH_LOG(&_lock); return OLAP_SUCCESS; } -Condition::Condition(MutexLock& mutex) : _mutex(mutex) { +Condition::Condition(Mutex& mutex) : _mutex(mutex) { PTHREAD_COND_INIT_WITH_LOG(&_cond, NULL); } @@ -1204,14 +1228,18 @@ OLAPStatus copy_file(const string& src, const string& dest) { src_fd = ::open(src.c_str(), O_RDONLY); if (src_fd < 0) { - OLAP_LOG_WARNING("failed to open file. [err=%m, file_name=%s]", src.c_str()); + char errmsg[64]; + LOG(WARNING) << "failed to open file. 
[err='" << strerror_r(errno, errmsg, 64) + << "' file_name=" << src << "]"; res = OLAP_ERR_FILE_NOT_EXIST; goto COPY_EXIT; } dest_fd = ::open(dest.c_str(), O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR); if (dest_fd < 0) { - OLAP_LOG_WARNING("failed to open file to write. [err=%m, file_name=%s]", dest.c_str()); + char errmsg[64]; + LOG(WARNING) << "failed to open file to write. [err='" << strerror_r(errno, errmsg, 64) + << "' file_name=" << dest << "]"; res = OLAP_ERR_FILE_NOT_EXIST; goto COPY_EXIT; } @@ -1250,7 +1278,7 @@ COPY_EXIT: ::close(dest_fd); } - OLAP_LOG_TRACE("copy file success. [src=%s dest=%s]", src.c_str(), dest.c_str()); + VLOG(3) << "copy file success. [src=" << src << " dest=" << dest << "]"; return res; } @@ -1260,17 +1288,17 @@ bool check_dir_existed(const string& path) { try { if (boost::filesystem::exists(p)) { - OLAP_LOG_TRACE("dir already existed. [path='%s']", path.c_str()); + VLOG(3) << "dir already existed. [path='" << path << "']"; return true; } else { - OLAP_LOG_TRACE("dir does not existed. [path='%s']", path.c_str()); + VLOG(3) << "dir does not existed. [path='" << path << "']"; return false; } } catch (...) { // do nothing } - OLAP_LOG_WARNING("boost exception when check exist and return false. [path=%s]", path.c_str()); + LOG(WARNING) << "boost exception when check exist and return false. [path=" << path << "]"; return false; } @@ -1280,19 +1308,19 @@ OLAPStatus create_dirs(const string& path) { try { if (boost::filesystem::create_directories(p)) { - OLAP_LOG_TRACE("create dir success. [path='%s']", path.c_str()); + VLOG(3) << "create dir success. [path='" << path << "']"; return OLAP_SUCCESS; } } catch (const boost::filesystem::filesystem_error& e) { - OLAP_LOG_WARNING("error message: [err_msg='%s']", e.code().message().c_str()); + LOG(WARNING) << "error message: [err_msg='" << e.code().message() << "']"; } catch (std::exception& e) { - OLAP_LOG_WARNING("error message: [exception='%s']", e.what()); + LOG(WARNING) << "error message: [exception='" << e.what() << "']"; } catch (...) { // do nothing OLAP_LOG_WARNING("unknown exception."); } - OLAP_LOG_WARNING("fail to create dir. [path='%s']", path.c_str()); + LOG(WARNING) << "fail to create dir. [path='" << path << "']"; return OLAP_ERR_CANNOT_CREATE_DIR; } @@ -1302,14 +1330,14 @@ OLAPStatus create_dir(const string& path) { try { if (boost::filesystem::create_directory(p)) { - OLAP_LOG_TRACE("create dir success. [path='%s']", path.c_str()); + VLOG(3) << "create dir success. [path='" << path << "']"; return OLAP_SUCCESS; } } catch (...) { // do nothing } - OLAP_LOG_WARNING("fail to create dir. [path='%s']", path.c_str()); + LOG(WARNING) << "fail to create dir. [path='" << path << "']"; return OLAP_ERR_CANNOT_CREATE_DIR; } @@ -1327,13 +1355,13 @@ OLAPStatus copy_dir(const string &src_dir, const string &dst_dir) { } if (boost::filesystem::exists(dst_path)) { - OLAP_LOG_WARNING("Dst dir already exists.[dst_path=%s]", dst_path.string().c_str()); + LOG(WARNING) << "Dst dir already exists.[dst_path=" << dst_path.string() << "]"; return OLAP_ERR_CREATE_FILE_ERROR; } // Create the destination directory if (!boost::filesystem::create_directory(dst_path)) { - OLAP_LOG_WARNING("Unable to create dst dir.[dst_path=%s]", dst_path.string().c_str()); + LOG(WARNING) << "Unable to create dst dir.[dst_path=" << dst_path.string() << "]"; return OLAP_ERR_CREATE_FILE_ERROR; } } catch (...) 
{ @@ -1373,20 +1401,37 @@ OLAPStatus copy_dir(const string &src_dir, const string &dst_dir) { return OLAP_SUCCESS; } +void remove_files(const vector& files) { + for (const string& file : files) { + boost::filesystem::path file_path(file); + + try { + if (boost::filesystem::remove(file_path)) { + VLOG(3) << "remove file. [file=" << file << "]"; + } else { + OLAP_LOG_WARNING("failed to remove file. [file=%s errno=%d]", + file.c_str(), Errno::no()); + } + } catch (...) { + // do nothing + } + } +} + // failed when there are files or dirs under thr dir OLAPStatus remove_dir(const string& path) { boost::filesystem::path p(path.c_str()); try { if (boost::filesystem::remove(p)) { - OLAP_LOG_TRACE("success to del dir. [path='%s']", path.c_str()); + VLOG(3) << "success to del dir. [path='" << path << "']"; return OLAP_SUCCESS; } } catch (...) { // do nothing } - OLAP_LOG_WARNING("fail to del dir. [path='%s']", path.c_str()); + LOG(WARNING) << "fail to del dir. [path='" << path << "' errno=" << Errno::no() << "]"; return OLAP_ERR_CANNOT_CREATE_DIR; } @@ -1402,7 +1447,7 @@ OLAPStatus remove_parent_dir(const string& path) { boost::filesystem::remove(parent_path); } } catch (...) { - OLAP_LOG_WARNING("fail to del parent path. [chile path='%s']", path.c_str()); + LOG(WARNING) << "fail to del parent path. [chile path='" << path << "']"; res = OLAP_ERR_STL_ERROR; } @@ -1415,14 +1460,14 @@ OLAPStatus remove_all_dir(const string& path) { try { if (boost::filesystem::remove_all(p)) { - OLAP_LOG_TRACE("success to del all dir. [path='%s']", path.c_str()); + VLOG(3) << "success to del all dir. [path='" << path << "']"; return OLAP_SUCCESS; } } catch (...) { // do nothing } - OLAP_LOG_WARNING("fail to del all dir. [path='%s']", path.c_str()); + LOG(WARNING) << "fail to del all dir. [path='" << path << "' errno=" << Errno::no() << "]"; return OLAP_ERR_CANNOT_CREATE_DIR; } @@ -1453,9 +1498,14 @@ OLAPStatus dir_walk(const string& root, struct stat stat_data; struct dirent* direntp = NULL; dirp = opendir(root.c_str()); - if (NULL == dirp) { - OLAP_LOG_WARNING("can't find root dir. [dir='%s']", root.c_str()); - return OLAP_ERR_INIT_FAILED; + if (dirp == nullptr) { + NewStatus status = IOError(strings::Substitute("opendir $0 failed", root), errno); + LOG(WARNING) << status.ToString(); + if (status.IsDiskFailure()) { + return OLAP_ERR_DISK_FAILURE; + } else { + return OLAP_ERR_INIT_FAILED; + } } while ((direntp = readdir(dirp)) != NULL) { @@ -1513,7 +1563,7 @@ OLAPStatus remove_unused_files(const string& schema_hash_root, // é历所有没有使用的文件 for (set::const_iterator it = different_set.begin(); it != different_set.end(); ++it) { if (ENDSWITH(*it, ".hdr") || ENDSWITH(*it, ".idx") || ENDSWITH(*it, ".dat")) { - OLAP_LOG_INFO("delete unused file. [file='%s']", it->c_str()); + LOG(INFO) << "delete unused file. [file='" << schema_hash_root + "/" + *it << "']"; move_to_trash(boost::filesystem::path(schema_hash_root), boost::filesystem::path(schema_hash_root + "/" + *it)); } else { @@ -1566,7 +1616,7 @@ bool valid_decimal(const string &value_str, const uint32_t precision, const uint boost::regex e(decimal_pattern); boost::smatch what; if (!boost::regex_match(value_str, what, e) || what[0].str().size() != value_str.size()) { - OLAP_LOG_WARNING("invalid decimal value. [value=%s]", value_str.c_str()); + LOG(WARNING) << "invalid decimal value. 
[value=" << value_str << "]"; return false; } diff --git a/be/src/olap/utils.h b/be/src/olap/utils.h index 7cf7f8aebe..8b07c90188 100644 --- a/be/src/olap/utils.h +++ b/be/src/olap/utils.h @@ -195,10 +195,10 @@ OLAPStatus move_to_trash(const boost::filesystem::path& schema_hash_root, const boost::filesystem::path& file_path); // encapsulation of pthread_mutex to lock the critical sources. -class MutexLock { +class Mutex { public: - MutexLock(); - ~MutexLock(); + Mutex(); + ~Mutex(); // wait until obtain the lock OLAPStatus lock(); @@ -215,80 +215,47 @@ public: private: pthread_mutex_t _lock; + DISALLOW_COPY_AND_ASSIGN(Mutex); }; -// encapsulation of pthread_mutex to lock the critical sources. -class AutoMutexLock { +// Helper class than locks a mutex on construction +// and unlocks the mutex on descontruction. +class MutexLock { public: // wait until obtain the lock - explicit AutoMutexLock(MutexLock* mutex_lock) : _mutex_lock(mutex_lock) { - _mutex_lock->lock(); + explicit MutexLock(Mutex* mutex) : _mutex(mutex) { + _mutex->lock(); } // unlock is called after - ~AutoMutexLock() { - _mutex_lock->unlock(); + ~MutexLock() { + _mutex->unlock(); } private: - MutexLock* _mutex_lock; + Mutex* _mutex; - DISALLOW_COPY_AND_ASSIGN(AutoMutexLock); -}; - -// encapsulation of pthread_mutex to lock the critical sources. -class Condition { -public: - explicit Condition(MutexLock& mutex); - - ~Condition(); - - void wait(); - - void wait_for_seconds(uint32_t seconds); - - void notify(); - - void notify_all(); - -private: - MutexLock& _mutex; - pthread_cond_t _cond; -}; - -enum LockTypeEnum { - READER_LOCK = 0, - WRITER_LOCK = 1 -}; - -// 监控读写é”的状æ€ä¿¡æ¯ -struct RWLockInfo { - // TODO(guping) æœªæ¥æ ¹æ®æƒ…况扩展信æ¯å­—段,å¯ä»¥è€ƒè™‘文件å和代ç è¡Œæ•° - //const char* file_name; - //int32_t line_num; - RWLockInfo() : tid(0), lock_type(READER_LOCK) {} - RWLockInfo(pthread_t in_tid) : tid(in_tid), lock_type(READER_LOCK) {} - RWLockInfo(pthread_t in_tid, LockTypeEnum in_lock_type) : - tid(in_tid), - lock_type(in_lock_type) {} - - void clear() { - tid = 0; - lock_type = READER_LOCK; - } - - bool operator==(const RWLockInfo& other) const { - return tid == other.tid; - } - - pthread_t tid; - LockTypeEnum lock_type; + DISALLOW_COPY_AND_ASSIGN(MutexLock); }; // pthread_read/write_lock -class RWLock { +class RWMutex { public: - RWLock(); - ~RWLock(); + // Possible fairness policies for the RWMutex. + enum class Priority { + // The lock will prioritize readers at the expense of writers. + PREFER_READING, + + // The lock will prioritize writers at the expense of readers. + // + // Care should be taken when using this fairness policy, as it can lead to + // unexpected deadlocks (e.g. a writer waiting on the lock will prevent + // additional readers from acquiring it). + PREFER_WRITING, + }; + + // Create an RWMutex that prioritized readers by default. + RWMutex(Priority prio = Priority::PREFER_READING); + ~RWMutex(); // wait until obtaining the read lock OLAPStatus rdlock(); // try obtaining the read lock @@ -304,28 +271,40 @@ private: pthread_rwlock_t _lock; }; -// encapsulation of pthread_rwlock_t to lock the critical sources. -class AutoRWLock { +// +// Acquire a ReadLock on the specified RWMutex. +// The Lock will be automatically released then the +// object goes out of scope. 
+// +class ReadLock { public: - // wait until obtain the lock - explicit AutoRWLock(RWLock* lock, bool is_read) - : _lock(lock) { - if (is_read) { - _lock->rdlock(); - } else { - _lock->wrlock(); - } - } - - // unlock is called after - ~AutoRWLock() { - _lock->unlock(); + explicit ReadLock(RWMutex* mutex) + : _mutex(mutex) { + this->_mutex->rdlock(); } + ~ReadLock() { this->_mutex->unlock(); } private: - RWLock* _lock; + RWMutex* _mutex; + DISALLOW_COPY_AND_ASSIGN(ReadLock); +}; - DISALLOW_COPY_AND_ASSIGN(AutoRWLock); +// +// Acquire a WriteLock on the specified RWMutex. +// The Lock will be automatically released then the +// object goes out of scope. +// +class WriteLock { +public: + explicit WriteLock(RWMutex* mutex) + : _mutex(mutex) { + this->_mutex->wrlock(); + } + ~WriteLock() { this->_mutex->unlock(); } + +private: + RWMutex* _mutex; + DISALLOW_COPY_AND_ASSIGN(WriteLock); }; enum ComparatorEnum { @@ -333,6 +312,26 @@ enum ComparatorEnum { COMPARATOR_LARGER = 1, }; +// encapsulation of pthread_mutex to lock the critical sources. +class Condition { +public: + explicit Condition(Mutex& mutex); + + ~Condition(); + + void wait(); + + void wait_for_seconds(uint32_t seconds); + + void notify(); + + void notify_all(); + +private: + Mutex& _mutex; + pthread_cond_t _cond; +}; + // 处ç†comparator functor处ç†è¿‡ç¨‹ä¸­å‡ºçŽ°çš„é”™è¯¯ class ComparatorException : public std::exception { public: @@ -390,6 +389,8 @@ OLAPStatus create_dirs(const std::string& path); OLAPStatus copy_dir(const std::string &src_dir, const std::string &dst_dir); +void remove_files(const std::vector& files); + OLAPStatus remove_dir(const std::string& path); OLAPStatus remove_parent_dir(const std::string& path); diff --git a/be/src/olap/wrapper_field.h b/be/src/olap/wrapper_field.h index dde7777ea4..e56f72fa55 100644 --- a/be/src/olap/wrapper_field.h +++ b/be/src/olap/wrapper_field.h @@ -43,8 +43,7 @@ public: // 从传入的字符串ååºåˆ—化field的值 // 傿•°å¿…须是一个\0结尾的字符串 OLAPStatus from_string(const std::string& value_string) { - _rep->from_string(_buf, value_string); - return OLAP_SUCCESS; + return _rep->from_string(_buf, value_string); } // attach到一段buf @@ -116,6 +115,10 @@ public: return _rep->cmp(_field_buf, right); } + int cmp(bool r_null, char* right) const { + return _rep->cmp(_field_buf, r_null, right); + } + void copy(const WrapperField* field) { _rep->copy_without_pool(_field_buf, field->field_ptr()); } @@ -124,6 +127,10 @@ public: _rep->copy_without_pool(_field_buf, src); } + void copy(bool is_null, char* src) { + _rep->copy_without_pool(_field_buf, is_null, src); + } + uint32_t hash_code() const { uint32_t hash_code = 0; return _rep->hash_code(_buf + _rep->get_offset(), hash_code); diff --git a/be/src/olap/writer.cpp b/be/src/olap/writer.cpp index ceaa26e70d..f666200afc 100644 --- a/be/src/olap/writer.cpp +++ b/be/src/olap/writer.cpp @@ -24,7 +24,7 @@ namespace palo { -IWriter* IWriter::create(SmartOLAPTable table, OLAPIndex *index, bool is_push_write) { +IWriter* IWriter::create(OLAPTablePtr table, Rowset *index, bool is_push_write) { IWriter* writer = NULL; switch (table->data_file_type()) { @@ -43,7 +43,7 @@ IWriter* IWriter::create(SmartOLAPTable table, OLAPIndex *index, bool is_push_wr return writer; } -OLAPDataWriter::OLAPDataWriter(SmartOLAPTable table, OLAPIndex* index, bool is_push_write) : +OLAPDataWriter::OLAPDataWriter(OLAPTablePtr table, Rowset* index, bool is_push_write) : IWriter(is_push_write, table), _index(index), _data(NULL), @@ -77,7 +77,7 @@ OLAPStatus OLAPDataWriter::init(uint32_t num_rows_per_row_block) { 
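// ---------------------------------------------------------------------------
// Illustrative aside (not part of the patch): the RWMutex(Priority) constructor
// added in be/src/olap/utils.cpp above chooses the rwlock fairness policy via
// the glibc extension pthread_rwlockattr_setkind_np. A standalone sketch of
// building a writer-preferring rwlock the same way; init_writer_preferring_rwlock
// is a hypothetical helper, not an API introduced by this patch.
#include <pthread.h>
#include <cassert>

void init_writer_preferring_rwlock(pthread_rwlock_t* lock) {
    pthread_rwlockattr_t attr;
    int rv = pthread_rwlockattr_init(&attr);
    assert(rv == 0);

    // Prefer writers so a waiting writer is not starved by a steady stream of
    // readers. Note the caveat from the PREFER_WRITING comment above: while a
    // writer waits, new readers are blocked, which can surprise callers that
    // re-acquire the read lock recursively.
    rv = pthread_rwlockattr_setkind_np(&attr,
            PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
    assert(rv == 0);

    rv = pthread_rwlock_init(lock, &attr);
    assert(rv == 0);

    pthread_rwlockattr_destroy(&attr);
}
// ---------------------------------------------------------------------------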
_data = new (std::nothrow) OLAPData(_index); if (NULL == _data) { - OLAP_LOG_WARNING("fail to new OLAPData. [table='%s']", _table->full_name().c_str()); + LOG(WARNING) << "fail to new OLAPData. [table='" << _table->full_name() << "']"; return OLAP_ERR_MALLOC_ERROR; } @@ -90,7 +90,7 @@ OLAPStatus OLAPDataWriter::init(uint32_t num_rows_per_row_block) { _row_block = new (std::nothrow) RowBlock(_table->tablet_schema()); if (NULL == _row_block) { - OLAP_LOG_WARNING("fail to new RowBlock. [table='%s']", _table->full_name().c_str()); + LOG(WARNING) << "fail to new RowBlock. [table='" << _table->full_name() << "']"; return OLAP_ERR_MALLOC_ERROR; } @@ -129,17 +129,30 @@ OLAPStatus OLAPDataWriter::init(uint32_t num_rows_per_row_block) { OLAPStatus OLAPDataWriter::attached_by(RowCursor* row_cursor) { if (_row_index >= _table->num_rows_per_row_block()) { - if (OLAP_SUCCESS != flush()) { + if (OLAP_SUCCESS != _flush_row_block()) { OLAP_LOG_WARNING("failed to flush data while attaching row cursor."); return OLAP_ERR_OTHER_ERROR; } + RETURN_NOT_OK(_flush_segment_with_verfication()); } // Row points to the memory that needs to write in _row_block. _row_block->get_row(_row_index, row_cursor); return OLAP_SUCCESS; } -OLAPStatus OLAPDataWriter::flush() { +OLAPStatus OLAPDataWriter::write(const char* row) { + if (_row_index >= _table->num_rows_per_row_block()) { + if (OLAP_SUCCESS != _flush_row_block()) { + OLAP_LOG_WARNING("failed to flush data while attaching row cursor."); + return OLAP_ERR_OTHER_ERROR; + } + RETURN_NOT_OK(_flush_segment_with_verfication()); + } + _row_block->set_row(_row_index, row); + return OLAP_SUCCESS; +} + +OLAPStatus OLAPDataWriter::_flush_row_block() { if (_row_index < 1) { return OLAP_SUCCESS; } @@ -150,93 +163,67 @@ OLAPStatus OLAPDataWriter::flush() { } // Write a ready row block into OLAPData. - // Add one index item into OLAPIndex. - if (OLAP_SUCCESS != write_row_block(_row_block)) { - OLAP_LOG_WARNING("fail to write row block. [row_num=%u]", _row_index); + // Add one index item into Rowset. + // Add row block into olap data. + uint32_t start_offset; + uint32_t end_offset; + if (OLAP_SUCCESS != _data->add_row_block(_row_block, + &start_offset, + &end_offset)) { + OLAP_LOG_WARNING("fail to write data."); return OLAP_ERR_WRITER_DATA_WRITE_ERROR; } + // Add the corresponding index item into olap index. 
+ if (OLAP_SUCCESS != _index->add_row_block(*_row_block, start_offset)) { + OLAP_LOG_WARNING("fail to update index."); + return OLAP_ERR_WRITER_INDEX_WRITE_ERROR; + } + + _current_segment_size = end_offset; + _num_rows += _row_block->row_block_info().row_num; + + // In order to reuse row_block, clear the row_block after finalize + _row_block->clear(); _row_index = 0U; return OLAP_SUCCESS; } +OLAPStatus OLAPDataWriter::_flush_segment_with_verfication() { + if (UNLIKELY(_current_segment_size < _max_segment_size)) { + return OLAP_SUCCESS; + } + uint32_t data_segment_size; + if (OLAP_SUCCESS != _data->finalize_segment(&data_segment_size)) { + OLAP_LOG_WARNING("fail to finish segment from olap_data."); + return OLAP_ERR_WRITER_DATA_WRITE_ERROR; + } + + if (OLAP_SUCCESS != _index->finalize_segment(data_segment_size, _num_rows)) { + OLAP_LOG_WARNING("fail to finish segment from olap_index."); + return OLAP_ERR_WRITER_INDEX_WRITE_ERROR; + } + + if (OLAP_SUCCESS != _data->add_segment() + || OLAP_SUCCESS != _index->add_segment()) { + OLAP_LOG_WARNING("fail to add data or index segment."); + return OLAP_ERR_OTHER_ERROR; + } + + _num_rows = 0; + _current_segment_size = 0U; + return OLAP_SUCCESS; +} + void OLAPDataWriter::sync() { _data->sync(); _index->sync(); } -OLAPStatus OLAPDataWriter::write_row_block(RowBlock* row_block) { - if (NULL == row_block || row_block->row_block_info().row_num == 0) { - return OLAP_SUCCESS; - } - - // If _current_segment_size plus row_block size without compressing data - // exceeds the max data segment size, finalize the current data/index - // segment, and add new data/index segment. - if (static_cast(_current_segment_size) + static_cast(row_block->buf_len()) - > static_cast(_max_segment_size)) { - // Finalize data and index segment. - uint32_t data_segment_size; - if (OLAP_SUCCESS != _data->finalize_segment(&data_segment_size)) { - OLAP_LOG_WARNING("fail to finish segment from olap_data."); - return OLAP_ERR_WRITER_DATA_WRITE_ERROR; - } - - if (OLAP_SUCCESS != _index->finalize_segment(data_segment_size, _num_rows)) { - OLAP_LOG_WARNING("fail to finish segment from olap_index."); - return OLAP_ERR_WRITER_INDEX_WRITE_ERROR; - } - - if (OLAP_SUCCESS != _data->add_segment() - || OLAP_SUCCESS != _index->add_segment()) { - OLAP_LOG_WARNING("fail to add data or index segment."); - return OLAP_ERR_OTHER_ERROR; - } - - _num_rows = 0; - _current_segment_size = 0U; - } - - // Add row block into olap data. - uint32_t start_offset; - uint32_t end_offset; - if (OLAP_SUCCESS != _data->add_row_block(row_block, - &start_offset, - &end_offset)) { - OLAP_LOG_WARNING("fail to write data."); - return OLAP_ERR_WRITER_DATA_WRITE_ERROR; - } - - // Add the corresponding index item into olap index. - if (OLAP_SUCCESS != _index->add_row_block(*row_block, start_offset)) { - OLAP_LOG_WARNING("fail to update index."); - return OLAP_ERR_WRITER_INDEX_WRITE_ERROR; - } - - _current_segment_size = end_offset; - _num_rows += row_block->row_block_info().row_num; - - if (_write_mbytes_per_sec > 0) { - uint64_t delta_time_us = _speed_limit_watch.get_elapse_time_us(); - int64_t sleep_time = - _current_segment_size / _write_mbytes_per_sec - delta_time_us; - if (sleep_time > 0) { - OLAP_LOG_DEBUG("sleep to limit merge speed. [time=%lu bytes=%lu]", - sleep_time, _current_segment_size); - usleep(sleep_time); - } - } - - // In order to reuse row_block, clear the row_block after finalize - row_block->clear(); - - return OLAP_SUCCESS; -} - // Finalize may be success in spite of write() failure. 
OLAPStatus OLAPDataWriter::finalize() { // Write the last row block into OLAPData - if (OLAP_SUCCESS != flush()) { + if (OLAP_SUCCESS != _flush_row_block()) { OLAP_LOG_WARNING("fail to flush row block."); return OLAP_ERR_WRITER_DATA_WRITE_ERROR; } @@ -253,13 +240,14 @@ OLAPStatus OLAPDataWriter::finalize() { return OLAP_ERR_WRITER_INDEX_WRITE_ERROR; } - OLAPStatus res = _index->set_column_statistics(_column_statistics); + OLAPStatus res = _index->add_column_statistics(_column_statistics); if (res != OLAP_SUCCESS) { OLAP_LOG_WARNING("Fail to set delta pruning![res=%d]", res); return res; } - + _num_rows = 0; + _current_segment_size = 0U; return OLAP_SUCCESS; } diff --git a/be/src/olap/writer.h b/be/src/olap/writer.h index 10677767dd..9f8f2874d5 100644 --- a/be/src/olap/writer.h +++ b/be/src/olap/writer.h @@ -17,11 +17,12 @@ #define BDG_PALO_BE_SRC_OLAP_WRITER_H #include "olap/olap_table.h" +#include "olap/schema.h" #include "olap/wrapper_field.h" namespace palo { class OLAPData; -class OLAPIndex; +class Rowset; class OLAPTable; class RowBlock; class RowCursor; @@ -31,7 +32,7 @@ class RowCursor; // Exposing internal pointers via attach() and then filling in the data is not ideal, but it is kept for compatibility. class IWriter { public: - IWriter(bool is_push_write, SmartOLAPTable table) : + IWriter(bool is_push_write, OLAPTablePtr table) : _is_push_write(is_push_write), _table(table), _column_statistics( @@ -44,24 +45,17 @@ public: } } virtual OLAPStatus init() { - OLAPStatus res = OLAP_SUCCESS; for (size_t i = 0; i < _column_statistics.size(); ++i) { _column_statistics[i].first = WrapperField::create(_table->tablet_schema()[i]); - if (_column_statistics[i].first == NULL) { - OLAP_LOG_FATAL("fail to create column statistics field. [field_id=%lu]", i); - return OLAP_ERR_MALLOC_ERROR; - } + DCHECK(_column_statistics[i].first != nullptr) << "fail to create column statistics field."; _column_statistics[i].first->set_to_max(); _column_statistics[i].second = WrapperField::create(_table->tablet_schema()[i]); - if (_column_statistics[i].second == NULL) { - OLAP_LOG_FATAL("fail to create column statistics field.
[field_id=%lu]", i); - return OLAP_ERR_MALLOC_ERROR; - } + DCHECK(_column_statistics[i].second != nullptr) << "fail to create column statistics field."; _column_statistics[i].second->set_null(); _column_statistics[i].second->set_to_min(); } - return res; + return OLAP_SUCCESS; } virtual OLAPStatus attached_by(RowCursor* row_cursor) = 0; void next(const RowCursor& row_cursor) { @@ -78,32 +72,45 @@ public: ++_row_index; } + void next(const char* row, const Schema* schema) { + for (size_t i = 0; i < _table->num_key_fields(); ++i) { + char* right = const_cast(row + schema->get_col_offset(i)); + if (_column_statistics[i].first->cmp(right) > 0) { + _column_statistics[i].first->copy(right); + } + + if (_column_statistics[i].second->cmp(right) < 0) { + _column_statistics[i].second->copy(right); + } + } + + ++_row_index; + } + virtual OLAPStatus write(const char* row) = 0; virtual OLAPStatus finalize() = 0; - virtual OLAPStatus write_row_block(RowBlock* row_block) = 0; virtual uint64_t written_bytes() = 0; virtual MemPool* mem_pool() = 0; // Factory function // The caller owns the newly created object and is responsible for deleting it - static IWriter* create(SmartOLAPTable table, OLAPIndex* index, bool is_push_write); - + static IWriter* create(OLAPTablePtr table, Rowset* index, bool is_push_write); protected: bool _is_push_write; - SmartOLAPTable _table; + OLAPTablePtr _table; // first is min, second is max std::vector> _column_statistics; uint32_t _row_index; }; // OLAPDataWriter writes rows into a new version, including data and indexes files. -// OLAPDataWriter does not take OLAPIndex ownership. +// OLAPDataWriter does not take Rowset ownership. // Common usage is: -// 1. index = new OLAPIndex(table, new_version...) +// 1. index = new Rowset(table, new_version...) // 2. OLAPDataWriter writer(table, index) // 3. writer.init() // =========================================== // 4. loop: // make row block... -// writer.write_row_block(row_block) +// writer.flush_row_block(row_block) // OR---------------------------------------- // 4. RowCursor* cursor // loop: @@ -112,7 +119,7 @@ protected: // writer.next() // =========================================== // 5. writer.finalize() -// 6. if errors happen in write_row_block() or finalize() +// 6. if errors happen in flush_row_block() or finalize() // index->delete_all_files() // delete index // @@ -120,7 +127,7 @@ protected: // 8. we use index now ... class OLAPDataWriter : public IWriter { public: - OLAPDataWriter(SmartOLAPTable table, OLAPIndex* index, bool is_push_write = false); + OLAPDataWriter(OLAPTablePtr table, Rowset* index, bool is_push_write = false); virtual ~OLAPDataWriter(); @@ -129,18 +136,12 @@ public: // Init with custom num rows of row block OLAPStatus init(uint32_t num_rows_per_row_block); - // Write one row_block into OLAPData and OLAPIndex, if the segment size - // exceeds max segment size, finalize the last segment, and add new segment. - OLAPStatus write_row_block(RowBlock* row_block); - // In order to avoid memory copy while reading and writing, attach the // row_cursor to the row block being written. // If the number of rows reached maximum, the row_block will be added into - // OLAPData, and one index item will be added into OLAPIndex. + // OLAPData, and one index item will be added into Rowset.
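A condensed sketch of the cursor-based flow from the usage comment above (not part of the patch; has_more_rows() and fill_row() are placeholder helpers, and RowCursor/Rowset setup is elided):

OLAPStatus write_new_version(OLAPTablePtr table, Rowset* index) {
    // The caller owns the writer created by the factory and must delete it.
    IWriter* writer = IWriter::create(table, index, false);
    if (writer == NULL) {
        return OLAP_ERR_MALLOC_ERROR;
    }
    OLAPStatus res = writer->init();
    RowCursor cursor;                              // schema-specific initialization elided
    while (res == OLAP_SUCCESS && has_more_rows()) {
        res = writer->attached_by(&cursor);        // cursor now points into the row block
        if (res == OLAP_SUCCESS) {
            fill_row(&cursor);                     // copy column values through the cursor
            writer->next(cursor);                  // update min/max statistics, advance row index
        }
    }
    if (res == OLAP_SUCCESS) {
        res = writer->finalize();                  // flush the last row block/segment and statistics
    }
    delete writer;
    // On failure the caller also calls index->delete_all_files() and deletes the Rowset.
    return res;
}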
virtual OLAPStatus attached_by(RowCursor* row_cursor); - - // Flush the row block written before to OLAPIndex - OLAPStatus flush(); + virtual OLAPStatus write(const char* row); // sync data to disk, ignore error void sync(); @@ -150,9 +151,12 @@ public: virtual uint64_t written_bytes(); virtual MemPool* mem_pool(); - private: - OLAPIndex* _index; + // Flush the row block written before to Rowset + OLAPStatus _flush_row_block(); + OLAPStatus _flush_segment_with_verfication(); + + Rowset* _index; OLAPData* _data; // current OLAPData Segment size, it is used to prevent OLAPData Segment // size exceeding _max_segment_size(OLAP_MAX_SEGMENT_FILE_SIZE) diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt index 8b9c0b7df1..ab1851130c 100644 --- a/be/src/runtime/CMakeLists.txt +++ b/be/src/runtime/CMakeLists.txt @@ -85,6 +85,7 @@ add_library(Runtime STATIC buffered_tuple_stream3.cc # export_task_mgr.cpp export_sink.cpp + tablet_writer_mgr.cpp bufferpool/buffer_allocator.cc bufferpool/buffer_pool.cc bufferpool/reservation_tracker.cc @@ -93,6 +94,7 @@ add_library(Runtime STATIC bufferpool/system_allocator.cc initial_reservations.cc snapshot_loader.cpp + kafka_consumer_pipe.cpp ) # This test runs forever so should not be part of 'make test' diff --git a/be/src/runtime/data_stream_sender.cpp b/be/src/runtime/data_stream_sender.cpp index 2afa6f9bbb..30c4139de1 100644 --- a/be/src/runtime/data_stream_sender.cpp +++ b/be/src/runtime/data_stream_sender.cpp @@ -52,31 +52,10 @@ #include "util/thrift_util.h" #include "util/brpc_stub_cache.h" +#include "util/ref_count_closure.h" namespace palo { -class TransmitDataClosure : public google::protobuf::Closure { -public: - TransmitDataClosure() : _refs(0) { } - ~TransmitDataClosure() { } - - void ref() { _refs.fetch_add(1, std::memory_order_relaxed); } - - // If unref() returns true, this object should be delete - bool unref() { return _refs.fetch_sub(1, std::memory_order_relaxed) == 1; } - - void Run() override { - if (unref()) { - delete this; - } - } - - brpc::Controller cntl; - PTransmitDataResult result; -private: - std::atomic _refs; -}; - // A channel sends data asynchronously via calls to transmit_data // to a single destination ipaddress/node. // It has a fixed-capacity buffer and allows the caller either to add rows to @@ -184,7 +163,7 @@ private: PRowBatch _pb_batch; PTransmitDataParams _brpc_request; PInternalService_Stub* _brpc_stub = nullptr; - TransmitDataClosure* _closure = nullptr; + RefCountClosure* _closure = nullptr; int32_t _brpc_timeout_ms = 500; }; @@ -218,7 +197,7 @@ Status DataStreamSender::Channel::init(RuntimeState* state) { Status DataStreamSender::Channel::send_batch(PRowBatch* batch, bool eos) { if (_closure == nullptr) { - _closure = new TransmitDataClosure(); + _closure = new RefCountClosure(); _closure->ref(); } else { RETURN_IF_ERROR(_wait_last_brpc()); @@ -288,7 +267,7 @@ Status DataStreamSender::Channel::close_internal() { } VLOG_RPC << "Channel::close() instance_id=" << _fragment_instance_id << " dest_node=" << _dest_node_id - << " #rows= " << _batch->num_rows(); + << " #rows= " << ((_batch == nullptr) ? 
0 : _batch->num_rows()); if (_batch != NULL && _batch->num_rows() > 0) { RETURN_IF_ERROR(send_current_batch(true)); } else { diff --git a/be/src/runtime/decimal_value.h b/be/src/runtime/decimal_value.h index f1dc86c668..cf1cc13a3b 100755 --- a/be/src/runtime/decimal_value.h +++ b/be/src/runtime/decimal_value.h @@ -314,6 +314,10 @@ public: // to - decimal where where the result will be stored // to->buf and to->len must be set. void to_max_decimal(int precision, int frac); + void to_min_decimal(int precision, int frac) { + to_max_decimal(precision, frac); + _sign = -1; + } // The maximum of fraction part is "scale". // If the length of fraction part is less than "scale", '0' will be filled. diff --git a/be/src/runtime/descriptors.cpp b/be/src/runtime/descriptors.cpp index 5caaa3127a..973317fc85 100644 --- a/be/src/runtime/descriptors.cpp +++ b/be/src/runtime/descriptors.cpp @@ -30,6 +30,7 @@ #include "codegen/llvm_codegen.h" #include "common/object_pool.h" #include "gen_cpp/Descriptors_types.h" +#include "gen_cpp/descriptors.pb.h" #include "gen_cpp/PlanNodes_types.h" #include "exprs/expr.h" @@ -66,6 +67,36 @@ SlotDescriptor::SlotDescriptor(const TSlotDescriptor& tdesc) _set_null_fn(NULL) { } +SlotDescriptor::SlotDescriptor(const PSlotDescriptor& pdesc) + : _id(pdesc.id()), + _type(TypeDescriptor::from_protobuf(pdesc.slot_type())), + _parent(pdesc.parent()), + _col_pos(pdesc.column_pos()), + _tuple_offset(pdesc.byte_offset()), + _null_indicator_offset(pdesc.null_indicator_byte(), pdesc.null_indicator_bit()), + _col_name(pdesc.col_name()), + _slot_idx(pdesc.slot_idx()), + _slot_size(_type.get_slot_size()), + _field_idx(-1), + _is_materialized(pdesc.is_materialized()), + _is_null_fn(NULL), + _set_not_null_fn(NULL), + _set_null_fn(NULL) { +} + +void SlotDescriptor::to_protobuf(PSlotDescriptor* pslot) const { + pslot->set_id(_id); + pslot->set_parent(_parent); + _type.to_protobuf(pslot->mutable_slot_type()); + pslot->set_column_pos(_col_pos); + pslot->set_byte_offset(_tuple_offset); + pslot->set_null_indicator_byte(_null_indicator_offset.byte_offset); + pslot->set_null_indicator_bit(_null_indicator_offset.bit_offset); + pslot->set_col_name(_col_name); + pslot->set_slot_idx(_slot_idx); + pslot->set_is_materialized(_is_materialized); +} + std::string SlotDescriptor::debug_string() const { std::stringstream out; out << "Slot(id=" << _id << " type=" << _type @@ -177,6 +208,23 @@ TupleDescriptor::TupleDescriptor(const TTupleDescriptor& tdesc) : } } +TupleDescriptor::TupleDescriptor(const PTupleDescriptor& pdesc) + : _id(pdesc.id()), + _table_desc(NULL), + _byte_size(pdesc.byte_size()), + _num_null_bytes(pdesc.num_null_bytes()), + _num_materialized_slots(0), + _slots(), + _has_varlen_slots(false), + _llvm_struct(NULL) { + if (!pdesc.has_num_null_slots()) { + //be compatible for existing tables with no NULL value + _num_null_slots = 0; + } else { + _num_null_slots = pdesc.num_null_slots(); + } +} + void TupleDescriptor::add_slot(SlotDescriptor* slot) { _slots.push_back(slot); @@ -212,6 +260,15 @@ bool TupleDescriptor::layout_equals(const TupleDescriptor& other_desc) const { return true; } +void TupleDescriptor::to_protobuf(PTupleDescriptor* ptuple) const { + ptuple->Clear(); + ptuple->set_id(_id); + ptuple->set_byte_size(_byte_size); + ptuple->set_num_null_bytes(_num_null_bytes); + ptuple->set_table_id(-1); + ptuple->set_num_null_slots(_num_null_slots); +} + std::string TupleDescriptor::debug_string() const { std::stringstream out; out << "Tuple(id=" << _id << " size=" << _byte_size; @@ -511,7 +568,7 
@@ void DescriptorTbl::get_tuple_descs(std::vector* descs) const } bool SlotDescriptor::layout_equals(const SlotDescriptor& other_desc) const { - if (type() != other_desc.type()) return false; + if (type().type != other_desc.type().type) return false; if (is_nullable() != other_desc.is_nullable()) return false; if (slot_size() != other_desc.slot_size()) return false; if (tuple_offset() != other_desc.tuple_offset()) return false; diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h index f2f778da3f..6c3e9c6e71 100644 --- a/be/src/runtime/descriptors.h +++ b/be/src/runtime/descriptors.h @@ -54,6 +54,9 @@ class TTupleDescriptor; class Expr; class RuntimeState; class SchemaScanner; +class OlapTableSchemaParam; +class PTupleDescriptor; +class PSlotDescriptor; struct LlvmTupleStruct { llvm::StructType* tuple_struct; @@ -69,10 +72,12 @@ struct LlvmTupleStruct { struct NullIndicatorOffset { int byte_offset; uint8_t bit_mask; // to extract null indicator + uint8_t bit_offset; // only used to serialize, from 1 to 8 - NullIndicatorOffset(int byte_offset, int bit_offset) + NullIndicatorOffset(int byte_offset, int bit_offset_) : byte_offset(byte_offset), - bit_mask(bit_offset == -1 ? 0 : 1 << (7 - bit_offset)) { + bit_mask(bit_offset_ == -1 ? 0 : 1 << (7 - bit_offset_)), + bit_offset(bit_offset_) { } bool equals(const NullIndicatorOffset& o) const { @@ -130,6 +135,8 @@ public: /// of other_desc, but not necessarily ids. bool layout_equals(const SlotDescriptor& other_desc) const; + void to_protobuf(PSlotDescriptor* pslot) const; + std::string debug_string() const; // Codegen for: bool IsNull(Tuple* tuple) @@ -144,6 +151,7 @@ private: friend class DescriptorTbl; friend class TupleDescriptor; friend class SchemaScanner; + friend class OlapTableSchemaParam; const SlotId _id; const TypeDescriptor _type; @@ -173,6 +181,7 @@ private: llvm::Function* _set_null_fn; SlotDescriptor(const TSlotDescriptor& tdesc); + SlotDescriptor(const PSlotDescriptor& pdesc); }; // Base class for table descriptors. @@ -331,6 +340,8 @@ public: std::string debug_string() const; + void to_protobuf(PTupleDescriptor* ptuple) const; + // Creates a typed struct description for llvm. The layout of the struct is computed // by the FE which includes the order of the fields in the resulting struct. // Returns the struct type or NULL if the type could not be created. @@ -347,6 +358,7 @@ public: private: friend class DescriptorTbl; friend class SchemaScanner; + friend class OlapTableSchemaParam; const TupleId _id; TableDescriptor* _table_desc; @@ -366,6 +378,7 @@ private: llvm::StructType* _llvm_struct; // cache for the llvm struct type for this tuple desc TupleDescriptor(const TTupleDescriptor& tdesc); + TupleDescriptor(const PTupleDescriptor& tdesc); void add_slot(SlotDescriptor* slot); /// Returns slots in their physical order. diff --git a/be/src/runtime/disk_io_mgr.h b/be/src/runtime/disk_io_mgr.h index 7f1ac975ab..e487ec762e 100644 --- a/be/src/runtime/disk_io_mgr.h +++ b/be/src/runtime/disk_io_mgr.h @@ -171,7 +171,7 @@ class MemTracker; // the cached buffer is returned (BufferDescriptor::Return()). // // Remote filesystem support (e.g. S3): -// Remote filesystems are modeled as "remote disks". That is, there is a separate disk +// Remote filesystems are modeled as "remote disks". That is, there is a seperate disk // queue for each supported remote filesystem type. In order to maximize throughput, // multiple connections are opened in parallel by having multiple threads running per // queue. 
Also note that reading from a remote filesystem service can be more CPU diff --git a/be/src/runtime/dpp_sink.cpp b/be/src/runtime/dpp_sink.cpp index 6a2461b3ba..efbc2c2039 100644 --- a/be/src/runtime/dpp_sink.cpp +++ b/be/src/runtime/dpp_sink.cpp @@ -643,13 +643,13 @@ void HllDppSinkMerge::update_hll_set(TupleRow* agg_row, TupleRow* row, agg_row_resolver.init(agg_row_sv->ptr, agg_row_sv->len); agg_row_resolver.parse(); if (agg_row_resolver.get_hll_data_type() == HLL_DATA_EXPLICIT) { - value->hash_set.insert(agg_row_resolver.get_expliclit_value(0)); + value->hash_set.insert(agg_row_resolver.get_explicit_value(0)); } if (row_resolver.get_hll_data_type() == HLL_DATA_EXPLICIT) { - value->hash_set.insert(row_resolver.get_expliclit_value(0)); + value->hash_set.insert(row_resolver.get_explicit_value(0)); } } else if (value->type == HLL_DATA_EXPLICIT) { - value->hash_set.insert(row_resolver.get_expliclit_value(0)); + value->hash_set.insert(row_resolver.get_explicit_value(0)); if (value->hash_set.size() > HLL_EXPLICLIT_INT64_NUM) { value->type = HLL_DATA_SPRASE; for (std::set::iterator iter = value->hash_set.begin(); iter != value->hash_set.end(); iter++) { @@ -664,7 +664,7 @@ void HllDppSinkMerge::update_hll_set(TupleRow* agg_row, TupleRow* row, } } } else if (value->type == HLL_DATA_SPRASE) { - uint64_t hash = row_resolver.get_expliclit_value(0); + uint64_t hash = row_resolver.get_explicit_value(0); int idx = hash % REGISTERS_SIZE; uint8_t first_one_bit = __builtin_ctzl(hash >> HLL_COLUMN_PRECISION) + 1; if (value->index_to_value.find(idx) != value->index_to_value.end()) { @@ -688,15 +688,15 @@ void HllDppSinkMerge::finalize_one_merge(TupleRow* agg_row, MemPool* pool, StringValue* agg_row_sv = static_cast(SlotRef::get_value(ctx->root(), agg_row)); if (rollup_schema.value_ops()[i] == TAggregationType::HLL_UNION) { HllMergeValue* value = _hll_last_row[index++]; - // expliclit set + // explicit set if (value->type == HLL_DATA_EXPLICIT) { int set_len = 1 + 1 + value->hash_set.size() * 8; char *result = (char*)pool->allocate(set_len); memset(result, 0, set_len); - HllSetHelper::set_expliclit(result, value->hash_set, set_len); + HllSetHelper::set_explicit(result, value->hash_set, set_len); agg_row_sv->replace(result, set_len); } else if (value->type == HLL_DATA_SPRASE) { - // full expliclit set + // full explicit set if (value->index_to_value.size() * (sizeof(HllSetResolver::SparseIndexType) + sizeof(HllSetResolver::SparseValueType)) + sizeof(HllSetResolver::SparseLengthValueType) > REGISTERS_SIZE) { @@ -706,7 +706,7 @@ void HllDppSinkMerge::finalize_one_merge(TupleRow* agg_row, MemPool* pool, HllSetHelper::set_full(result, value->index_to_value, REGISTERS_SIZE, set_len); agg_row_sv->replace(result, set_len); } else { - // sparse expliclit set + // sparse explicit set int set_len = 1 + sizeof(HllSetResolver::SparseLengthValueType) + 3 * value->index_to_value.size(); char *result = (char*)pool->allocate(set_len); memset(result, 0, set_len); diff --git a/be/src/runtime/dpp_writer.h b/be/src/runtime/dpp_writer.h index 74bf070c88..370adafb6e 100644 --- a/be/src/runtime/dpp_writer.h +++ b/be/src/runtime/dpp_writer.h @@ -22,7 +22,6 @@ namespace palo { -class FileHandlerWithBuf; class ExprContext; class TupleRow; class RowBatch; diff --git a/be/src/runtime/exec_env.cpp b/be/src/runtime/exec_env.cpp index 594f2057de..9bb293f9d3 100644 --- a/be/src/runtime/exec_env.cpp +++ b/be/src/runtime/exec_env.cpp @@ -34,6 +34,7 @@ #include "runtime/mem_tracker.h" #include "runtime/thread_resource_mgr.h" #include 
"runtime/fragment_mgr.h" +#include "runtime/tablet_writer_mgr.h" #include "runtime/tmp_file_mgr.h" #include "runtime/bufferpool/reservation_tracker.h" #include "util/metrics.h" @@ -46,20 +47,23 @@ #include "http/ev_http_server.h" #include "http/action/mini_load.h" #include "http/action/checksum_action.h" -#include "http/action/compaction_action.h" #include "http/action/health_action.h" #include "http/action/reload_tablet_action.h" +#include "http/action/restore_tablet_action.h" #include "http/action/snapshot_action.h" #include "http/action/pprof_actions.h" #include "http/action/metrics_action.h" +#include "http/action/meta_action.h" +#include "http/action/stream_load.h" #include "http/download_action.h" #include "http/monitor_action.h" #include "http/http_method.h" -#include "olap/olap_rootpath.h" +#include "olap/olap_engine.h" #include "util/network_util.h" #include "util/bfd_parser.h" #include "runtime/etl_job_mgr.h" #include "runtime/load_path_mgr.h" +#include "runtime/load_stream_mgr.h" #include "runtime/pull_load_task_mgr.h" #include "runtime/snapshot_loader.h" #include "util/pretty_printer.h" @@ -74,7 +78,15 @@ namespace palo { ExecEnv* ExecEnv::_exec_env = nullptr; -ExecEnv::ExecEnv() : +ExecEnv::ExecEnv() + : _thread_mgr(new ThreadResourceMgr), + _master_info(new TMasterInfo()), + _load_stream_mgr(new LoadStreamMgr()), + _brpc_stub_cache(new BrpcStubCache()) { +} + +ExecEnv::ExecEnv(const std::vector& paths) : + _store_paths(paths), _stream_mgr(new DataStreamMgr()), _result_mgr(new ResultBufferMgr()), _client_cache(new BackendServiceClientCache()), @@ -95,12 +107,14 @@ ExecEnv::ExecEnv() : _fragment_mgr(new FragmentMgr(this)), _master_info(new TMasterInfo()), _etl_job_mgr(new EtlJobMgr(this)), - _load_path_mgr(new LoadPathMgr()), + _load_path_mgr(new LoadPathMgr(this)), _disk_io_mgr(new DiskIoMgr()), - _tmp_file_mgr(new TmpFileMgr), + _tmp_file_mgr(new TmpFileMgr(this)), _bfd_parser(BfdParser::create()), _pull_load_task_mgr(new PullLoadTaskMgr(config::pull_load_task_dir)), _broker_mgr(new BrokerMgr(this)), + _tablet_writer_mgr(new TabletWriterMgr(this)), + _load_stream_mgr(new LoadStreamMgr()), _snapshot_loader(new SnapshotLoader(this)), _brpc_stub_cache(new BrpcStubCache()), _enable_webserver(true), @@ -204,9 +218,14 @@ Status ExecEnv::start_webserver() { _ev_http_server->register_handler(HttpMethod::PUT, "/api/{db}/{table}/_load", new MiniLoadAction(this)); + _ev_http_server->register_handler(HttpMethod::PUT, + "/api/{db}/{table}/_stream_load", + new StreamLoadAction(this)); std::vector allow_paths; - OLAPRootPath::get_instance()->get_all_available_root_path(&allow_paths); + for (auto& path : _store_paths) { + allow_paths.emplace_back(path.path); + } DownloadAction* download_action = new DownloadAction(this, allow_paths); // = new DownloadAction(this, config::mini_load_download_path); _ev_http_server->register_handler(HttpMethod::GET, "/api/_download_load", download_action); @@ -245,22 +264,24 @@ Status ExecEnv::start_webserver() { _ev_http_server->register_handler(HttpMethod::GET, "/metrics", action); } + MetaAction* meta_action = new MetaAction(HEADER); + _ev_http_server->register_handler(HttpMethod::GET, "/api/meta/header/{tablet_id}/{schema_hash}", meta_action); + #ifndef BE_TEST // Register BE checksum action ChecksumAction* checksum_action = new ChecksumAction(this); - _ev_http_server->register_handler(HttpMethod::POST, "/api/checksum", checksum_action); + _ev_http_server->register_handler(HttpMethod::GET, "/api/checksum", checksum_action); // Register BE reload tablet 
action ReloadTabletAction* reload_tablet_action = new ReloadTabletAction(this); - _ev_http_server->register_handler(HttpMethod::POST, "/api/reload_tablet", reload_tablet_action); + _ev_http_server->register_handler(HttpMethod::GET, "/api/reload_tablet", reload_tablet_action); + + RestoreTabletAction* restore_tablet_action = new RestoreTabletAction(this); + _ev_http_server->register_handler(HttpMethod::POST, "/api/restore_tablet", restore_tablet_action); // Register BE snapshot action SnapshotAction* snapshot_action = new SnapshotAction(this); - _ev_http_server->register_handler(HttpMethod::POST, "/api/snapshot", snapshot_action); - - // Register BE compaction action - CompactionAction* compaction_action = new CompactionAction(); - _ev_http_server->register_handler(HttpMethod::POST, "/api/compaction", compaction_action); + _ev_http_server->register_handler(HttpMethod::GET, "/api/snapshot", snapshot_action); #endif RETURN_IF_ERROR(_ev_http_server->start()); @@ -268,7 +289,7 @@ Status ExecEnv::start_webserver() { } uint32_t ExecEnv::cluster_id() { - return OLAPRootPath::get_instance()->effective_cluster_id(); + return OLAPEngine::get_instance()->effective_cluster_id(); } void ExecEnv::init_buffer_pool(int64_t min_page_size, int64_t capacity, int64_t clean_pages_limit) { diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index f831b95f29..f5d3e06043 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -34,6 +34,7 @@ #include "util/thread_pool.hpp" #include "util/priority_thread_pool.hpp" #include "util/thread_pool.hpp" +#include "olap/options.h" namespace palo { @@ -57,9 +58,12 @@ class BrokerMgr; class MetricRegistry; class BufferPool; class ReservationTracker; +class TabletWriterMgr; +class LoadStreamMgr; class ConnectionManager; class SnapshotLoader; class BrpcStubCache; +class OLAPEngine; // Execution environment for queries/plan fragments. // Contains all required global structures, and handles to @@ -67,6 +71,9 @@ class BrpcStubCache; // once to properly initialise service state. class ExecEnv { public: + ExecEnv(const std::vector& store_paths); + + // only used for test ExecEnv(); /// Returns the first created exec env instance. 
In a normal impalad, this is @@ -177,8 +184,29 @@ public: return _buffer_pool.get(); } + TabletWriterMgr* tablet_writer_mgr() { + return _tablet_writer_mgr.get(); + } + + LoadStreamMgr* load_stream_mgr() { + return _load_stream_mgr.get(); + } + + const std::vector& store_paths() const { + return _store_paths; + } + + void set_store_paths(const std::vector& paths) { + _store_paths = paths; + } + + OLAPEngine* olap_engine() { return _olap_engine; } + + void set_olap_engine(OLAPEngine* olap_engine) { _olap_engine = olap_engine; } + private: Status start_webserver(); + std::vector _store_paths; // Leave protected so that subclasses can override boost::scoped_ptr _stream_mgr; boost::scoped_ptr _result_mgr; @@ -203,6 +231,8 @@ private: std::unique_ptr _bfd_parser; std::unique_ptr _pull_load_task_mgr; std::unique_ptr _broker_mgr; + std::unique_ptr _tablet_writer_mgr; + std::unique_ptr _load_stream_mgr; std::unique_ptr _snapshot_loader; std::unique_ptr _brpc_stub_cache; bool _enable_webserver; @@ -210,6 +240,8 @@ private: boost::scoped_ptr _buffer_reservation; boost::scoped_ptr _buffer_pool; + OLAPEngine* _olap_engine = nullptr; + ObjectPool _object_pool; private: static ExecEnv* _exec_env; diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 413a989dc4..80ea7c7495 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -45,6 +45,14 @@ namespace palo { +std::string to_load_error_http_path(const std::string& file_name) { + std::stringstream url; + url << "http://" << BackendOptions::get_localhost() << ":" << config::webserver_port + << "/api/_load_error_log?" + << "file=" << file_name; + return url.str(); +} + using apache::thrift::TException; using apache::thrift::TProcessor; using apache::thrift::transport::TTransportException; @@ -68,7 +76,6 @@ public: void callback(const Status& status, RuntimeProfile* profile, bool done); std::string to_http_path(const std::string& file_name); - std::string to_load_error_http_path(const std::string& file_name); Status execute(); @@ -220,14 +227,6 @@ std::string FragmentExecState::to_http_path(const std::string& file_name) { return url.str(); } -std::string FragmentExecState::to_load_error_http_path(const std::string& file_name) { - std::stringstream url; - url << "http://" << BackendOptions::get_localhost() << ":" << config::webserver_port - << "/api/_load_error_log?" - << "file=" << file_name; - return url.str(); -} - // There can only be one of these callbacks in-flight at any moment, because // it is only invoked from the executor's reporting thread. 
// Also, the reported status will always reflect the most recent execution status, @@ -286,6 +285,13 @@ void FragmentExecState::coordinator_callback( params.__isset.export_files = true; params.export_files = runtime_state->export_output_files(); } + if (!runtime_state->tablet_commit_infos().empty()) { + params.__isset.commitInfos = true; + params.commitInfos.reserve(runtime_state->tablet_commit_infos().size()); + for (auto& info : runtime_state->tablet_commit_infos()) { + params.commitInfos.push_back(info); + } + } DCHECK(runtime_state != NULL); // Send new errors to coordinator @@ -438,11 +444,18 @@ Status FragmentMgr::exec_plan_fragment( } } else { pthread_t id; - pthread_create(&id, + int ret = pthread_create(&id, nullptr, fragment_executor, new ThreadPool::WorkFunction( std::bind(&FragmentMgr::exec_actual, this, exec_state, cb))); + if (ret != 0) { + std::string err_msg("Could not create thread."); + err_msg.append(strerror(ret)); + err_msg.append(","); + err_msg.append(std::to_string(ret)); + return Status(err_msg); + } pthread_detach(id); } diff --git a/be/src/runtime/fragment_mgr.h b/be/src/runtime/fragment_mgr.h index 36bf9c6a6b..4852b99e24 100644 --- a/be/src/runtime/fragment_mgr.h +++ b/be/src/runtime/fragment_mgr.h @@ -41,6 +41,8 @@ class FragmentExecState; class TExecPlanFragmentParams; class PlanFragmentExecutor; +std::string to_load_error_http_path(const std::string& file_name); + // This class used to manage all the fragment execute in this instance class FragmentMgr : public RestMonitorIface { public: diff --git a/be/src/runtime/load_path_mgr.cpp b/be/src/runtime/load_path_mgr.cpp index 18e22ebcdb..ec9b8e4fa1 100644 --- a/be/src/runtime/load_path_mgr.cpp +++ b/be/src/runtime/load_path_mgr.cpp @@ -23,28 +23,28 @@ #include #include "olap/olap_define.h" -#include "olap/olap_rootpath.h" +#include "olap/olap_engine.h" #include "util/file_utils.h" #include "gen_cpp/Types_types.h" +#include "runtime/exec_env.h" namespace palo { static const uint32_t MAX_SHARD_NUM = 1024; static const std::string SHARD_PREFIX = "__shard_"; -LoadPathMgr::LoadPathMgr() : _idx(0), _next_shard(0) { } +LoadPathMgr::LoadPathMgr(ExecEnv* exec_env) : _exec_env(exec_env), + _idx(0), _next_shard(0), _error_path_next_shard(0) { } Status LoadPathMgr::init() { - OLAPRootPath::RootPathVec all_available_root_path; - OLAPRootPath::get_instance()->get_all_available_root_path(&all_available_root_path); _path_vec.clear(); - for (auto& one_path : all_available_root_path) { - _path_vec.push_back(one_path + MINI_PREFIX); + for (auto& path : _exec_env->store_paths()) { + _path_vec.push_back(path.path + MINI_PREFIX); } LOG(INFO) << "Load path configured to [" << boost::join(_path_vec, ",") << "]"; // error log is saved in first root path - _error_log_dir = all_available_root_path[0] + ERROR_LOG_PREFIX; + _error_log_dir = _exec_env->store_paths()[0].path + ERROR_LOG_PREFIX; // check and make dir RETURN_IF_ERROR(FileUtils::create_dir(_error_log_dir)); @@ -126,31 +126,43 @@ Status LoadPathMgr::get_load_error_file_name( const TUniqueId& fragment_instance_id, std::string* error_path) { std::stringstream ss; - ss << ERROR_FILE_NAME << "_" << db << "_" << label + std::string shard = ""; + { + std::lock_guard l(_lock); + shard = SHARD_PREFIX + std::to_string(_error_path_next_shard++ % MAX_SHARD_NUM); + } + std::string shard_path = _error_log_dir + "/" + shard; + // check and create shard path + Status status = FileUtils::create_dir(shard_path); + if (!status.ok()) { + LOG(WARNING) << "create error sub path failed. 
path=" << shard_path; + } + // add shard sub dir to file path + ss << shard << "/" << ERROR_FILE_NAME << "_" << db << "_" << label << "_" << std::hex << fragment_instance_id.hi << "_" << fragment_instance_id.lo; *error_path = ss.str(); return Status::OK; } -std::string LoadPathMgr::get_load_error_absolute_path(const std::string& file_name) { +std::string LoadPathMgr::get_load_error_absolute_path(const std::string& file_path) { std::string path; path.append(_error_log_dir); path.append("/"); - path.append(file_name); + path.append(file_path); return path; } -void LoadPathMgr::process_label_dir(time_t now, const std::string& label_dir) { - if (!is_too_old(now, label_dir)) { +void LoadPathMgr::process_path(time_t now, const std::string& path) { + if (!is_too_old(now, path)) { return; } - LOG(INFO) << "Going to remove load directory. path=" << label_dir; - Status status = FileUtils::remove_all(label_dir); + LOG(INFO) << "Going to remove path. path=" << path; + Status status = FileUtils::remove_all(path); if (status.ok()) { - LOG(INFO) << "Remove load directory success. path=" << label_dir; + LOG(INFO) << "Remove path success. path=" << path; } else { - LOG(WARNING) << "Remove load directory failed. path=" << label_dir; + LOG(WARNING) << "Remove path failed. path=" << path; } } @@ -181,16 +193,16 @@ void LoadPathMgr::clean_one_path(const std::string& path) { std::vector labels; Status status = FileUtils::scan_dir(sub_path, &labels); if (!status.ok()) { - LOG(WARNING) << "scan one path to delete directory failed. path=" << path; + LOG(WARNING) << "scan one path to delete directory failed. path=" << sub_path; continue; } for (auto& label : labels) { std::string label_dir = sub_path + "/" + label; - process_label_dir(now, label_dir); + process_path(now, label_dir); } } else { // process label dir - process_label_dir(now, sub_path); + process_path(now, sub_path); } } } @@ -205,24 +217,32 @@ void LoadPathMgr::clean() { void LoadPathMgr::clean_error_log() { time_t now = time(nullptr); - std::vector error_logs; - Status status = FileUtils::scan_dir(_error_log_dir, &error_logs); + std::vector sub_dirs; + Status status = FileUtils::scan_dir(_error_log_dir, &sub_dirs); if (!status.ok()) { LOG(WARNING) << "scan error_log dir failed. dir=" << _error_log_dir; return; } - for (auto& error_log : error_logs) { - std::string log_path = _error_log_dir + "/" + error_log; - if (!is_too_old(now, log_path)) { - continue; - } - LOG(INFO) << "Going to remove error log file. path=" << log_path; - status = FileUtils::remove_all(log_path); - if (status.ok()) { - LOG(INFO) << "Remove load directory success. path=" << log_path; + for (auto& sub_dir : sub_dirs) { + std::string sub_path = _error_log_dir + "/" + sub_dir; + // for compatible + if (sub_dir.find(SHARD_PREFIX) == 0) { + // sub_dir starts with SHARD_PREFIX + // process shard sub dir + std::vector error_log_files; + Status status = FileUtils::scan_dir(sub_path, &error_log_files); + if (!status.ok()) { + LOG(WARNING) << "scan one path to delete directory failed. path=" << sub_path; + continue; + } + for (auto& error_log : error_log_files) { + std::string error_log_path = sub_path + "/" + error_log; + process_path(now, error_log_path); + } } else { - LOG(WARNING) << "Remove load directory failed. 
path=" << log_path; + // process error log file + process_path(now, sub_path); } } } diff --git a/be/src/runtime/load_path_mgr.h b/be/src/runtime/load_path_mgr.h index c67ca738d8..6ceab38fa9 100644 --- a/be/src/runtime/load_path_mgr.h +++ b/be/src/runtime/load_path_mgr.h @@ -25,12 +25,13 @@ namespace palo { class TUniqueId; +class ExecEnv; // In every directory, '.trash' directory is used to save data need to delete // daemon thread is check no used directory to delete class LoadPathMgr { public: - LoadPathMgr(); + LoadPathMgr(ExecEnv* env); ~LoadPathMgr() { } @@ -46,7 +47,7 @@ public: const std::string&label, const TUniqueId& fragment_instance_id, std::string* error_path); - std::string get_load_error_absolute_path(const std::string& file_name); + std::string get_load_error_absolute_path(const std::string& file_path); const std::string& get_load_error_file_dir() const { return _error_log_dir; } @@ -56,10 +57,11 @@ private: void clean_one_path(const std::string& path); void clean_error_log(); void clean(); - void process_label_dir(time_t now, const std::string& label_dir); + void process_path(time_t now, const std::string& path); static void* cleaner(void* param); + ExecEnv* _exec_env; std::mutex _lock; std::vector _path_vec; int _idx; @@ -67,6 +69,7 @@ private: pthread_t _cleaner_id; std::string _error_log_dir; uint32_t _next_shard; + uint32_t _error_path_next_shard; }; } diff --git a/be/src/runtime/mem_tracker.cpp b/be/src/runtime/mem_tracker.cpp index f8007dab0a..2309299244 100644 --- a/be/src/runtime/mem_tracker.cpp +++ b/be/src/runtime/mem_tracker.cpp @@ -310,17 +310,18 @@ Status MemTracker::MemLimitExceeded(RuntimeState* state, const std::string& deta // Choose which tracker to log the usage of. Default to the process tracker so we can // get the full view of memory consumption. - MemTracker* tracker_to_log = process_tracker; - if (state != nullptr && state->query_mem_tracker()->has_limit()) { - MemTracker* query_tracker = state->query_mem_tracker(); - const int64_t query_capacity = query_tracker->limit() - query_tracker->consumption(); - ss << "Memory left in query limit: " - << PrettyPrinter::print(query_capacity, TUnit::BYTES) << std::endl; - // Log the query tracker only if the query limit was closer to being exceeded. - if (query_capacity < process_capacity) tracker_to_log = query_tracker; - } - ss << tracker_to_log->LogUsage(); - //Status status = Status::MemLimitExceeded(ss.str()); + // FIXME(cmy): call LogUsage() lead to crash here, fix it later + // MemTracker* tracker_to_log = process_tracker; + // if (state != nullptr && state->query_mem_tracker()->has_limit()) { + // MemTracker* query_tracker = state->query_mem_tracker(); + // const int64_t query_capacity = query_tracker->limit() - query_tracker->consumption(); + // ss << "Memory left in query limit: " + // << PrettyPrinter::print(query_capacity, TUnit::BYTES) << std::endl; + // // Log the query tracker only if the query limit was closer to being exceeded. 
+ // if (query_capacity < process_capacity) tracker_to_log = query_tracker; + // } + // ss << tracker_to_log->LogUsage(); + // Status status = Status::MemLimitExceeded(ss.str()); Status status = Status::MEM_LIMIT_EXCEEDED; if (state != nullptr) state->log_error(status.get_error_msg()); return status; diff --git a/be/src/runtime/plan_fragment_executor.cpp b/be/src/runtime/plan_fragment_executor.cpp index c5d80fc1a3..4f10eb0083 100644 --- a/be/src/runtime/plan_fragment_executor.cpp +++ b/be/src/runtime/plan_fragment_executor.cpp @@ -185,6 +185,8 @@ Status PlanFragmentExecutor::prepare(const TExecPlanFragmentParams& request) { print_volume_ids(params.per_node_scan_ranges); _runtime_state->set_per_fragment_instance_idx(params.sender_id); + _runtime_state->set_num_per_fragment_instances(params.num_senders); + // set up sink, if required if (request.fragment.__isset.output_sink) { RETURN_IF_ERROR(DataSink::create_data_sink(obj_pool(), @@ -533,14 +535,18 @@ void PlanFragmentExecutor::close() { } if (_sink.get() != NULL) { - _sink->close(runtime_state(), _status); + if (_prepared) { + _sink->close(runtime_state(), _status); + } else { + _sink->close(runtime_state(), Status("prepare failed")); + } } _exec_env->thread_mgr()->unregister_pool(_runtime_state->resource_pool()); { std::stringstream ss; - profile()->pretty_print(&ss); + _runtime_state->runtime_profile()->pretty_print(&ss); LOG(INFO) << ss.str(); } } diff --git a/be/src/runtime/primitive_type.h b/be/src/runtime/primitive_type.h index 9d575a8cec..11070f2919 100644 --- a/be/src/runtime/primitive_type.h +++ b/be/src/runtime/primitive_type.h @@ -215,6 +215,13 @@ inline int get_slot_size(PrimitiveType type) { return 0; } +inline bool is_type_compatible(PrimitiveType lhs, PrimitiveType rhs) { + if (lhs == TYPE_CHAR || lhs == TYPE_VARCHAR || lhs == TYPE_HLL) { + return rhs == TYPE_CHAR || rhs == TYPE_VARCHAR || rhs == TYPE_HLL; + } + return lhs == rhs; +} + TExprOpcode::type to_in_opcode(PrimitiveType t); PrimitiveType thrift_to_type(TPrimitiveType::type ttype); TPrimitiveType::type to_thrift(PrimitiveType ptype); diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp index 39b878cd87..3a3e745c91 100644 --- a/be/src/runtime/row_batch.cpp +++ b/be/src/runtime/row_batch.cpp @@ -31,6 +31,7 @@ //#include "runtime/mem_tracker.h" #include "gen_cpp/Data_types.h" #include "gen_cpp/data.pb.h" +#include "util/debug_util.h" using std::vector; @@ -143,17 +144,14 @@ RowBatch::RowBatch(const RowDescriptor& row_desc, if ((*desc)->string_slots().empty()) { continue; } - Tuple* tuple = row->get_tuple(j); - if (tuple == NULL) { + if (tuple == nullptr) { continue; } - vector::const_iterator slot = (*desc)->string_slots().begin(); - for (; slot != (*desc)->string_slots().end(); ++slot) { - DCHECK((*slot)->type().is_string_type()); - StringValue* string_val = tuple->get_string_slot((*slot)->tuple_offset()); - + for (auto slot : (*desc)->string_slots()) { + DCHECK(slot->type().is_string_type()); + StringValue* string_val = tuple->get_string_slot(slot->tuple_offset()); int offset = reinterpret_cast(string_val->ptr); string_val->ptr = reinterpret_cast(tuple_data + offset); } @@ -380,14 +378,12 @@ int RowBatch::serialize(PRowBatch* output_batch) { TupleRow* row = get_row(i); const vector& tuple_descs = _row_desc.tuple_descriptors(); vector::const_iterator desc = tuple_descs.begin(); - for (int j = 0; desc != tuple_descs.end(); ++desc, ++j) { - if (row->get_tuple(j) == NULL) { + if (row->get_tuple(j) == nullptr) { // NULLs are encoded as -1 
output_batch->mutable_tuple_offsets()->Add(-1); continue; } - // Record offset before creating copy (which increments offset and tuple_data) output_batch->mutable_tuple_offsets()->Add(offset); row->get_tuple(j)->deep_copy(**desc, &tuple_data, &offset, /* convert_ptrs */ true); diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index 2c0ebbd91a..b343d9a859 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -63,6 +63,7 @@ RuntimeState::RuntimeState( _root_node_id(-1), _num_rows_load_success(0), _num_rows_load_filtered(0), + _num_print_error_rows(0), _normal_row_number(0), _error_row_number(0), _error_log_file(nullptr), @@ -134,6 +135,7 @@ RuntimeState::~RuntimeState() { _buffer_reservation->Close(); } +#ifndef BE_TEST // _query_mem_tracker must be valid as long as _instance_mem_tracker is so // delete _instance_mem_tracker first. // LogUsage() walks the MemTracker tree top-down when the memory limit is exceeded. @@ -142,15 +144,16 @@ RuntimeState::~RuntimeState() { if (_instance_mem_tracker.get() != NULL) { // May be NULL if InitMemTrackers() is not called, for example from tests. _instance_mem_tracker->unregister_from_parent(); + _instance_mem_tracker->close(); } - _instance_mem_tracker->close(); +#endif _instance_mem_tracker.reset(); if (_query_mem_tracker.get() != NULL) { _query_mem_tracker->unregister_from_parent(); + _query_mem_tracker->close(); } - _query_mem_tracker->close(); _query_mem_tracker.reset(); } @@ -431,6 +434,10 @@ void RuntimeState::append_error_msg_to_file( } } + if (_num_print_error_rows.fetch_add(1, std::memory_order_relaxed) > MAX_ERROR_NUM) { + return; + } + std::stringstream out; if (line.empty()) { out << "Summary: "; diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 80c2e90c70..43030c2c02 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -437,6 +437,14 @@ public: return _per_fragment_instance_idx; } + void set_num_per_fragment_instances(int num_instances) { + _num_per_fragment_instances = num_instances; + } + + int num_per_fragment_instances() const { + return _num_per_fragment_instances; + } + ReservationTracker* instance_buffer_reservation() { return _instance_buffer_reservation.get(); } @@ -462,6 +470,14 @@ public: return _buffer_reservation; } + const std::vector& tablet_commit_infos() const { + return _tablet_commit_infos; + } + + std::vector& tablet_commit_infos() { + return _tablet_commit_infos; + } + /// Helper to call QueryState::StartSpilling(). 
Status StartSpilling(MemTracker* mem_tracker); @@ -551,6 +567,7 @@ private: bool _is_cancelled; int _per_fragment_instance_idx; + int _num_per_fragment_instances = 0; // used as send id int _be_number; @@ -581,6 +598,7 @@ private: std::vector _output_files; std::atomic _num_rows_load_success; std::atomic _num_rows_load_filtered; + std::atomic _num_print_error_rows; std::vector _export_output_files; @@ -596,6 +614,7 @@ private: std::string _error_log_file_path; std::ofstream* _error_log_file; // error file path, absolute path std::unique_ptr _error_hub; + std::vector _tablet_commit_infos; // state of execution volatile bool _is_running; diff --git a/be/src/runtime/snapshot_loader.cpp b/be/src/runtime/snapshot_loader.cpp index aaea0214ad..33a209b69a 100644 --- a/be/src/runtime/snapshot_loader.cpp +++ b/be/src/runtime/snapshot_loader.cpp @@ -446,11 +446,13 @@ Status SnapshotLoader::download( Status SnapshotLoader::move( const std::string& snapshot_path, const std::string& tablet_path, + const std::string& store_path, int64_t job_id, bool overwrite) { LOG(INFO) << "begin to move snapshot files. from: " - << snapshot_path << ", to: " << tablet_path << ", job: " << job_id; + << snapshot_path << ", to: " << tablet_path + << ", store: " << store_path << ", job: " << job_id; Status status = Status::OK; @@ -552,7 +554,7 @@ Status SnapshotLoader::move( // than we merge the 2 .hdr file before reloading it. // load header in tablet dir to get the base vesion - SmartOLAPTable tablet = OLAPEngine::get_instance()->get_table( + OLAPTablePtr tablet = OLAPEngine::get_instance()->get_table( tablet_id, schema_hash); if (tablet.get() == NULL) { std::stringstream ss; @@ -563,7 +565,7 @@ Status SnapshotLoader::move( } // get base version tablet->obtain_header_rdlock(); - const FileVersionMessage* base_version = tablet->base_version(); + const PDelta* base_version = tablet->base_version(); tablet->release_header_lock(); if (base_version == nullptr) { std::stringstream ss; @@ -580,7 +582,7 @@ Status SnapshotLoader::move( std::string snapshot_header_file = hdr.str(); OLAPHeader snapshot_header(snapshot_header_file); - OLAPStatus ost = snapshot_header.load(); + OLAPStatus ost = snapshot_header.load_and_init(); if (ost != OLAP_SUCCESS) { LOG(WARNING) << "failed to load snapshot header: " << snapshot_header_file; return Status("failed to load snapshot header: " + snapshot_header_file); @@ -643,7 +645,7 @@ Status SnapshotLoader::move( LOG(WARNING) << ss.str(); return Status(ss.str()); } - + // merge 2 headers ost = tablet->merge_header(snapshot_header, end_version); if (ost != OLAP_SUCCESS) { @@ -654,9 +656,17 @@ Status SnapshotLoader::move( } } + // fixme: there is no header now and can not call load_one_tablet here // reload header + OlapStore* store = OLAPEngine::get_instance()->get_store(store_path); + if (store == nullptr) { + std::stringstream ss; + ss << "failed to get store by path: " << store_path; + LOG(WARNING) << ss.str(); + return Status(ss.str()); + } OLAPStatus ost = OLAPEngine::get_instance()->load_one_tablet( - tablet_id, schema_hash, tablet_path, true); + store, tablet_id, schema_hash, tablet_path, true); if (ost != OLAP_SUCCESS) { std::stringstream ss; ss << "failed to reload header of tablet: " << tablet_id; diff --git a/be/src/runtime/snapshot_loader.h b/be/src/runtime/snapshot_loader.h index e4bc7321c3..5f751dae8a 100644 --- a/be/src/runtime/snapshot_loader.h +++ b/be/src/runtime/snapshot_loader.h @@ -80,6 +80,7 @@ public: Status move( const std::string& snapshot_path, const std::string& 
tablet_path, + const std::string& store_path, int64_t job_id, bool overwrite); diff --git a/be/src/runtime/test_env.cc b/be/src/runtime/test_env.cc index 791f382b1f..32b5920b89 100644 --- a/be/src/runtime/test_env.cc +++ b/be/src/runtime/test_env.cc @@ -33,13 +33,13 @@ TestEnv::TestEnv() { _s_static_metrics.reset(new MetricRegistry("test_env")); // PaloMetrics::create_metrics(_s_static_metrics.get()); } - _exec_env.reset(new ExecEnv); + _exec_env.reset(new ExecEnv()); _exec_env->init_for_tests(); _io_mgr_tracker.reset(new MemTracker(-1)); _block_mgr_parent_tracker.reset(new MemTracker(-1)); _exec_env->disk_io_mgr()->init(_io_mgr_tracker.get()); init_metrics(); - _tmp_file_mgr.reset(new TmpFileMgr); + _tmp_file_mgr.reset(new TmpFileMgr()); _tmp_file_mgr->init(_metrics.get()); } @@ -51,7 +51,7 @@ void TestEnv::init_tmp_file_mgr(const std::vector& tmp_dirs, bool one_dir_per_device) { // Need to recreate metrics to avoid error when registering metric twice. init_metrics(); - _tmp_file_mgr.reset(new TmpFileMgr); + _tmp_file_mgr.reset(new TmpFileMgr()); _tmp_file_mgr->init_custom(tmp_dirs, one_dir_per_device, _metrics.get()); } diff --git a/be/src/runtime/tmp_file_mgr.cc b/be/src/runtime/tmp_file_mgr.cc index c39d1d8fe2..17e134e64a 100644 --- a/be/src/runtime/tmp_file_mgr.cc +++ b/be/src/runtime/tmp_file_mgr.cc @@ -30,10 +30,11 @@ // #include // #include -#include "olap/olap_rootpath.h" +#include "olap/olap_engine.h" #include "util/debug_util.h" #include "util/disk_info.h" #include "util/filesystem_util.h" +#include "runtime/exec_env.h" using boost::algorithm::is_any_of; using boost::algorithm::join; @@ -55,16 +56,16 @@ const uint64_t _s_available_space_threshold_mb = 1024; const std::string TMP_FILE_MGR_ACTIVE_SCRATCH_DIRS = "tmp_file_mgr.active_scratch_dirs"; const std::string TMP_FILE_MGR_ACTIVE_SCRATCH_DIRS_LIST = "tmp_file_mgr.active_scratch_dirs.list"; -TmpFileMgr::TmpFileMgr() : - _initialized(false), _dir_status_lock(), _tmp_dirs() { } +TmpFileMgr::TmpFileMgr(ExecEnv* exec_env) : + _exec_env(exec_env), _initialized(false), _dir_status_lock(), _tmp_dirs() { } // _num_active_scratch_dirs_metric(NULL), _active_scratch_dirs_metric(NULL) {} Status TmpFileMgr::init(MetricRegistry* metrics) { std::string tmp_dirs_spec = config::storage_root_path; vector all_tmp_dirs; - - // we already paser the config::storage_root_path in OLAPRootPath, use it. - OLAPRootPath::get_instance()->get_all_available_root_path(&all_tmp_dirs); + for (auto& path : _exec_env->store_paths()) { + all_tmp_dirs.emplace_back(path.path); + } return init_custom(all_tmp_dirs, true, metrics); } diff --git a/be/src/runtime/tmp_file_mgr.h b/be/src/runtime/tmp_file_mgr.h index 69fe4479a8..de9072dba0 100644 --- a/be/src/runtime/tmp_file_mgr.h +++ b/be/src/runtime/tmp_file_mgr.h @@ -30,6 +30,7 @@ namespace palo { class MetricRegistry; +class ExecEnv; // TmpFileMgr creates and manages temporary files and directories on the local // filesystem. It can manage multiple temporary directories across multiple devices. @@ -116,7 +117,8 @@ public: bool _blacklisted; }; - TmpFileMgr(); + TmpFileMgr(ExecEnv* exec_env); + TmpFileMgr() { } ~TmpFileMgr(){ // do nothing. @@ -187,6 +189,7 @@ private: bool is_blacklisted(DeviceId device_id); + ExecEnv* _exec_env; bool _initialized; // Protects the status of tmp dirs (i.e. whether they're blacklisted). 
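The LoadPathMgr, DownloadAction whitelist, and TmpFileMgr hunks above all follow the same pattern: per-disk directories are now derived from ExecEnv::store_paths() instead of being fetched from OLAPRootPath. A minimal sketch of that pattern (not from the patch; MyMgr and SUB_DIR_SUFFIX are illustrative names):

Status MyMgr::init() {
    std::vector<std::string> dirs;
    for (auto& path : _exec_env->store_paths()) {
        // Each store path's 'path' member is a storage root; append a manager-specific sub-directory.
        dirs.push_back(path.path + SUB_DIR_SUFFIX);
    }
    for (auto& dir : dirs) {
        RETURN_IF_ERROR(FileUtils::create_dir(dir));   // same helper LoadPathMgr uses above
    }
    return Status::OK;
}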
diff --git a/be/src/runtime/tuple.cpp b/be/src/runtime/tuple.cpp index b3074f0015..74c83e4b38 100644 --- a/be/src/runtime/tuple.cpp +++ b/be/src/runtime/tuple.cpp @@ -127,19 +127,17 @@ int64_t Tuple::release_string(const TupleDescriptor& desc) { return bytes; } -void Tuple::deep_copy(const TupleDescriptor& desc, char** data, int* offset, - bool convert_ptrs) { +void Tuple::deep_copy( + const TupleDescriptor& desc, char** data, int* offset, bool convert_ptrs) { Tuple* dst = reinterpret_cast(*data); memory_copy(dst, this, desc.byte_size()); *data += desc.byte_size(); *offset += desc.byte_size(); - for (std::vector::const_iterator i = desc.string_slots().begin(); - i != desc.string_slots().end(); ++i) { - DCHECK((*i)->type().is_string_type()); - - if (!dst->is_null((*i)->null_indicator_offset())) { - StringValue* string_v = dst->get_string_slot((*i)->tuple_offset()); + for (auto slot_desc : desc.string_slots()) { + DCHECK(slot_desc->type().is_string_type()); + if (!dst->is_null(slot_desc->null_indicator_offset())) { + StringValue* string_v = dst->get_string_slot(slot_desc->tuple_offset()); memory_copy(*data, string_v->ptr, string_v->len); string_v->ptr = (convert_ptrs ? reinterpret_cast(*offset) : *data); *data += string_v->len; diff --git a/be/src/runtime/tuple.h b/be/src/runtime/tuple.h index 88193968c3..f8a81fbaa0 100644 --- a/be/src/runtime/tuple.h +++ b/be/src/runtime/tuple.h @@ -175,6 +175,7 @@ public: // For C++/IR interop, we need to be able to look up types by name. static const char* _s_llvm_class_name; + void* get_data() { return this; } private: void* _data; }; diff --git a/be/src/runtime/types.cpp b/be/src/runtime/types.cpp index f3f8575a9c..8cfcd73895 100644 --- a/be/src/runtime/types.cpp +++ b/be/src/runtime/types.cpp @@ -119,6 +119,51 @@ void TypeDescriptor::to_thrift(TTypeDesc* thrift_type) const { } } +void TypeDescriptor::to_protobuf(PTypeDesc* ptype) const { + DCHECK(!is_complex_type()) << "Don't support complex type now, type=" << type; + auto node = ptype->add_types(); + node->set_type(TTypeNodeType::SCALAR); + auto scalar_type = node->mutable_scalar_type(); + scalar_type->set_type(palo::to_thrift(type)); + if (type == TYPE_CHAR || type == TYPE_VARCHAR || type == TYPE_HLL) { + scalar_type->set_len(len); + } else if (type == TYPE_DECIMAL) { + DCHECK_NE(precision, -1); + DCHECK_NE(scale, -1); + scalar_type->set_precision(precision); + scalar_type->set_scale(scale); + } +} + +TypeDescriptor::TypeDescriptor( + const google::protobuf::RepeatedPtrField& types, + int* idx) + : len(-1), precision(-1), scale(-1) { + DCHECK_GE(*idx, 0); + DCHECK_LT(*idx, types.size()); + + const PTypeNode& node = types.Get(*idx); + switch (node.type()) { + case TTypeNodeType::SCALAR: { + DCHECK(node.has_scalar_type()); + const PScalarType& scalar_type = node.scalar_type(); + type = thrift_to_type((TPrimitiveType::type)scalar_type.type()); + if (type == TYPE_CHAR || type == TYPE_VARCHAR || type == TYPE_HLL) { + DCHECK(scalar_type.has_len()); + len = scalar_type.len(); + } else if (type == TYPE_DECIMAL) { + DCHECK(scalar_type.has_precision()); + DCHECK(scalar_type.has_scale()); + precision = scalar_type.precision(); + scale = scalar_type.scale(); + } + break; + } + default: + DCHECK(false) << node.type(); + } +} + std::string TypeDescriptor::debug_string() const { std::stringstream ss; switch (type) { diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h index d4c7087dd7..a347115bde 100644 --- a/be/src/runtime/types.h +++ b/be/src/runtime/types.h @@ -25,6 +25,7 @@ #include #include 
"gen_cpp/Types_types.h" // for TPrimitiveType +#include "gen_cpp/types.pb.h" // for PTypeDesc #include "runtime/primitive_type.h" #include "thrift/protocol/TDebugProtocol.h" #include "common/config.h" @@ -129,6 +130,13 @@ struct TypeDescriptor { return result; } + static TypeDescriptor from_protobuf(const PTypeDesc& ptype) { + int idx = 0; + TypeDescriptor result(ptype.types(), &idx); + DCHECK_EQ(idx, ptype.types_size() - 1); + return result; + } + bool operator==(const TypeDescriptor& o) const { if (type != o.type) { return false; @@ -155,6 +163,8 @@ struct TypeDescriptor { return thrift_type; } + void to_protobuf(PTypeDesc* ptype) const; + inline bool is_string_type() const { return type == TYPE_VARCHAR || type == TYPE_CHAR || type == TYPE_HLL; } @@ -168,8 +178,7 @@ struct TypeDescriptor { } inline bool is_var_len_string_type() const { - return type == TYPE_VARCHAR || type == TYPE_HLL - || (type == TYPE_CHAR && len > MAX_CHAR_INLINE_LENGTH); + return type == TYPE_VARCHAR || type == TYPE_HLL || type == TYPE_CHAR; } inline bool is_complex_type() const { @@ -287,6 +296,7 @@ private: /// 'types' being constructed, and is set to the index of the next type in 'types' that /// needs to be processed (or the size 'types' if all nodes have been processed). TypeDescriptor(const std::vector& types, int* idx); + TypeDescriptor(const google::protobuf::RepeatedPtrField& types, int* idx); /// Recursive implementation of ToThrift() that populates 'thrift_type' with the /// TTypeNodes for this type and its children. diff --git a/be/src/service/backend_service.cpp b/be/src/service/backend_service.cpp index 245f4a3337..a0dc6180df 100644 --- a/be/src/service/backend_service.cpp +++ b/be/src/service/backend_service.cpp @@ -25,6 +25,7 @@ #include #include +#include "olap/olap_engine.h" #include "service/backend_options.h" #include "util/network_util.h" #include "util/thrift_util.h" @@ -226,4 +227,8 @@ void BackendService::erase_export_task(TStatus& t_status, const TUniqueId& task_ // status.to_thrift(&t_status); } +void BackendService::get_tablet_stat(TTabletStatResult& result) { + OLAPEngine::get_instance()->get_tablet_stat(result); +} + } // namespace palo diff --git a/be/src/service/backend_service.h b/be/src/service/backend_service.h index a84305b4a7..d1c3a48ff5 100644 --- a/be/src/service/backend_service.h +++ b/be/src/service/backend_service.h @@ -148,6 +148,8 @@ public: void erase_export_task(TStatus& t_status, const TUniqueId& task_id) override; + virtual void get_tablet_stat(TTabletStatResult& result) override; + private: Status start_plan_fragment_execution(const TExecPlanFragmentParams& exec_params); diff --git a/be/src/service/brpc.h b/be/src/service/brpc.h index ffe0fcd793..bc80b87ac1 100644 --- a/be/src/service/brpc.h +++ b/be/src/service/brpc.h @@ -53,3 +53,4 @@ #include #include #include +#include diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 9501818424..5f3ae6f260 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -15,6 +15,8 @@ #include "service/internal_service.h" +#include "common/config.h" +#include "runtime/tablet_writer_mgr.h" #include "gen_cpp/BackendService.h" #include "runtime/exec_env.h" #include "runtime/data_stream_mgr.h" @@ -26,7 +28,9 @@ namespace palo { -PInternalServiceImpl::PInternalServiceImpl(ExecEnv* exec_env) : _exec_env(exec_env) { +PInternalServiceImpl::PInternalServiceImpl(ExecEnv* exec_env) + : _exec_env(exec_env), + _tablet_worker_pool(config::number_tablet_writer_threads, 10240) 
{ } PInternalServiceImpl::~PInternalServiceImpl() { @@ -57,6 +61,23 @@ void PInternalServiceImpl::transmit_data(google::protobuf::RpcController* cntl_b } } +void PInternalServiceImpl::tablet_writer_open(google::protobuf::RpcController* controller, + const PTabletWriterOpenRequest* request, + PTabletWriterOpenResult* response, + google::protobuf::Closure* done) { + VLOG_RPC << "tablet writer open, id=" << request->id() + << ", index_id=" << request->index_id() << ", txn_id=" << request->txn_id(); + brpc::ClosureGuard closure_guard(done); + auto st = _exec_env->tablet_writer_mgr()->open(*request); + if (!st.ok()) { + LOG(WARNING) << "tablet writer open failed, message=" << st.get_error_msg() + << ", id=" << request->id() + << ", index_id=" << request->index_id() + << ", txn_id=" << request->txn_id(); + } + st.to_protobuf(response->mutable_status()); +} + void PInternalServiceImpl::exec_plan_fragment( google::protobuf::RpcController* cntl_base, const PExecPlanFragmentRequest* request, @@ -71,6 +92,46 @@ void PInternalServiceImpl::exec_plan_fragment( st.to_protobuf(response->mutable_status()); } +void PInternalServiceImpl::tablet_writer_add_batch(google::protobuf::RpcController* controller, + const PTabletWriterAddBatchRequest* request, + PTabletWriterAddBatchResult* response, + google::protobuf::Closure* done) { + VLOG_RPC << "tablet writer add batch, id=" << request->id() + << ", index_id=" << request->index_id() + << ", sender_id=" << request->sender_id(); + // add batch maybe cost a lot of time, and this callback thread will be held. + // this will influence query execute, because of no bthread. So, we put this to + // a local thread pool to process + _tablet_worker_pool.offer( + [request, response, done, this] () { + brpc::ClosureGuard closure_guard(done); + auto st = _exec_env->tablet_writer_mgr()->add_batch(*request, response->mutable_tablet_vec()); + if (!st.ok()) { + LOG(WARNING) << "tablet writer add batch failed, message=" << st.get_error_msg() + << ", id=" << request->id() + << ", index_id=" << request->index_id() + << ", sender_id=" << request->sender_id(); + } + st.to_protobuf(response->mutable_status()); + }); +} + +void PInternalServiceImpl::tablet_writer_cancel(google::protobuf::RpcController* controller, + const PTabletWriterCancelRequest* request, + PTabletWriterCancelResult* response, + google::protobuf::Closure* done) { + VLOG_RPC << "tablet writer cancel, id=" << request->id() + << ", index_id=" << request->index_id() + << ", sender_id=" << request->sender_id(); + brpc::ClosureGuard closure_guard(done); + auto st = _exec_env->tablet_writer_mgr()->cancel(*request); + if (!st.ok()) { + LOG(WARNING) << "tablet writer cancel failed, id=" << request->id() + << ", index_id=" << request->index_id() + << ", sender_id=" << request->sender_id(); + } +} + Status PInternalServiceImpl::_exec_plan_fragment(brpc::Controller* cntl) { auto ser_request = cntl->request_attachment().to_string(); TExecPlanFragmentParams t_request; diff --git a/be/src/service/internal_service.h b/be/src/service/internal_service.h index 5bbc1bef3d..2fcbbbb7b8 100644 --- a/be/src/service/internal_service.h +++ b/be/src/service/internal_service.h @@ -17,6 +17,7 @@ #include "common/status.h" #include "gen_cpp/internal_service.pb.h" +#include "util/thread_pool.hpp" namespace brpc { class Controller; @@ -35,6 +36,7 @@ public: const ::palo::PTransmitDataParams* request, ::palo::PTransmitDataResult* response, ::google::protobuf::Closure* done) override; + void exec_plan_fragment( google::protobuf::RpcController* 
controller, const PExecPlanFragmentRequest* request, @@ -53,6 +55,21 @@ public: PFetchDataResult* result, google::protobuf::Closure* done) override; + void tablet_writer_open(google::protobuf::RpcController* controller, + const PTabletWriterOpenRequest* request, + PTabletWriterOpenResult* response, + google::protobuf::Closure* done) override; + + void tablet_writer_add_batch(google::protobuf::RpcController* controller, + const PTabletWriterAddBatchRequest* request, + PTabletWriterAddBatchResult* response, + google::protobuf::Closure* done) override; + + void tablet_writer_cancel(google::protobuf::RpcController* controller, + const PTabletWriterCancelRequest* request, + PTabletWriterCancelResult* response, + google::protobuf::Closure* done) override; + void fetch_fragment_exec_infos( google::protobuf::RpcController* controller, const PFetchFragmentExecInfoRequest* request, @@ -63,6 +80,7 @@ private: Status _exec_plan_fragment(brpc::Controller* cntl); private: ExecEnv* _exec_env; + ThreadPool _tablet_worker_pool; }; } diff --git a/be/src/service/palo_main.cpp b/be/src/service/palo_main.cpp index 42a200b4d2..d413ba0e11 100644 --- a/be/src/service/palo_main.cpp +++ b/be/src/service/palo_main.cpp @@ -43,7 +43,7 @@ #include "agent/status.h" #include "agent/topic_subscriber.h" #include "util/palo_metrics.h" -#include "olap/olap_main.h" +#include "olap/options.h" #include "service/backend_options.h" #include "service/backend_service.h" #include "service/brpc_service.h" @@ -55,6 +55,12 @@ static void help(const char*); #include +extern "C" { void __lsan_do_leak_check(); } + +namespace palo { +extern bool k_palo_exit; +} + int main(int argc, char** argv) { // check if print version or help if (argc > 1) { @@ -107,23 +113,36 @@ int main(int argc, char** argv) { MallocExtension::instance()->SetNumericProperty( "tcmalloc.aggressive_memory_decommit", 21474836480); #endif + + std::vector paths; + auto olap_res = palo::parse_conf_store_paths(palo::config::storage_root_path, &paths); + if (olap_res != palo::OLAP_SUCCESS) { + LOG(FATAL) << "parse config storage path failed, path=" << palo::config::storage_root_path; + exit(-1); + } + palo::LlvmCodeGen::initialize_llvm(); - palo::init_daemon(argc, argv); + palo::init_daemon(argc, argv, paths); palo::ResourceTls::init(); if (!palo::BackendOptions::init()) { exit(-1); } - // initialize storage - if (0 != palo::olap_main(argc, argv)) { - LOG(ERROR) << "olap start error!"; - palo::shutdown_logging(); - exit(1); + // options + palo::EngineOptions options; + options.store_paths = paths; + palo::OLAPEngine* engine = nullptr; + auto st = palo::OLAPEngine::open(options, &engine); + if (!st.ok()) { + LOG(FATAL) << "fail to open OLAPEngine, res=" << st.get_error_msg(); + exit(-1); } // start backend service for the coordinator on be_port - palo::ExecEnv exec_env; + palo::ExecEnv exec_env(paths); + exec_env.set_olap_engine(engine); + palo::FrontendHelper::setup(&exec_env); palo::ThriftServer* be_server = nullptr; @@ -168,7 +187,7 @@ int main(int argc, char** argv) { palo::shutdown_logging(); exit(1); } - + status = heartbeat_thrift_server->start(); if (!status.ok()) { LOG(ERROR) << "Palo BE HeartBeat Service did not start correctly, exiting"; @@ -176,14 +195,16 @@ int main(int argc, char** argv) { exit(1); } + while (!palo::k_palo_exit) { #if defined(LEAK_SANITIZER) - // __lsan_enable(); - while (true) { __lsan_do_leak_check(); +#endif sleep(10); } -#endif - brpc_service.join(); + heartbeat_thrift_server->stop(); + heartbeat_thrift_server->join(); + be_server->stop(); + 
be_server->join(); delete be_server; return 0; diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt index dc76bc5881..6506527f9b 100644 --- a/be/src/util/CMakeLists.txt +++ b/be/src/util/CMakeLists.txt @@ -25,6 +25,7 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/util") set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/util") add_library(Util STATIC + arena.cc bfd_parser.cpp bitmap.cpp codec.cpp @@ -35,6 +36,7 @@ add_library(Util STATIC decompress.cpp disk_info.cpp hash_util.hpp + json_util.cpp palo_metrics.cpp mem_info.cpp metrics.cpp @@ -69,6 +71,7 @@ add_library(Util STATIC # coding_util.cpp cidr.cpp core_local.cpp + uid_util.cpp ) #ADD_BE_TEST(integer-array-test) diff --git a/be/src/util/brpc_stub_cache.h b/be/src/util/brpc_stub_cache.h index 7ec04116ec..9ba20abf26 100644 --- a/be/src/util/brpc_stub_cache.h +++ b/be/src/util/brpc_stub_cache.h @@ -64,6 +64,15 @@ public: return get_stub(endpoint); } + PInternalService_Stub* get_stub(const std::string& host, int port) { + butil::EndPoint endpoint; + if (str2endpoint(host.c_str(), port, &endpoint)) { + LOG(WARNING) << "unknown endpoint, hostname=" << host; + return nullptr; + } + return get_stub(endpoint); + } + private: SpinLock _lock; butil::FlatMap _stub_map; diff --git a/be/src/util/file_utils.cpp b/be/src/util/file_utils.cpp index 52260c9135..e38b4b4819 100644 --- a/be/src/util/file_utils.cpp +++ b/be/src/util/file_utils.cpp @@ -260,5 +260,15 @@ Status FileUtils::md5sum(const std::string& file, std::string* md5sum) { return Status::OK; } +bool FileUtils::check_exist(const std::string& path) { + boost::system::error_code errcode; + bool exist = boost::filesystem::exists(path, errcode); + if (errcode != boost::system::errc::success && errcode != boost::system::errc::no_such_file_or_directory) { + LOG(WARNING) << "error when check path:" << path << ", error code:" << errcode; + return false; + } + return exist; +} + } diff --git a/be/src/util/file_utils.h b/be/src/util/file_utils.h index 02fb3aaece..47163f2ff4 100644 --- a/be/src/util/file_utils.h +++ b/be/src/util/file_utils.h @@ -68,6 +68,8 @@ public: // calc md5sum of a local file static Status md5sum(const std::string& file, std::string* md5sum); + + static bool check_exist(const std::string& path); }; } diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index 6da0ce5377..fd3018557d 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -309,6 +309,17 @@ struct hash<__int128> { }; #endif +template<> +struct hash> { + size_t operator()(const std::pair& pair) const { + size_t seed = 0; + seed = palo::HashUtil::hash(&pair.first.lo, sizeof(pair.first.lo), seed); + seed = palo::HashUtil::hash(&pair.first.hi, sizeof(pair.first.hi), seed); + seed = palo::HashUtil::hash(&pair.second, sizeof(pair.second), seed); + return seed; + } +}; + } #endif diff --git a/be/src/util/json_util.h b/be/src/util/json_util.h index 06cc17c6a0..cbbd3f8206 100644 --- a/be/src/util/json_util.h +++ b/be/src/util/json_util.h @@ -21,9 +21,12 @@ #ifndef BDG_PALO_BE_SRC_UTIL_JSON_UTIL_H #define BDG_PALO_BE_SRC_UTIL_JSON_UTIL_H +#include + #include #include +#include "common/status.h" #include "util/pretty_printer.h" #include "util/template_util.h" @@ -61,6 +64,7 @@ ToJsonValue(const T& value, const TUnit::type unit, rapidjson::Document* documen } } +std::string to_json(const Status& status); } #endif diff --git a/be/src/util/thrift_server.cpp b/be/src/util/thrift_server.cpp index 0239d2959f..a627e2d791 100644 --- a/be/src/util/thrift_server.cpp +++ 
b/be/src/util/thrift_server.cpp @@ -370,6 +370,10 @@ Status ThriftServer::start() { return Status::OK; } +void ThriftServer::stop() { + _server->stop(); +} + void ThriftServer::join() { DCHECK(_server_thread != NULL); DCHECK(_started); diff --git a/be/src/util/thrift_server.h b/be/src/util/thrift_server.h index ea58cf179c..18fd9fd652 100644 --- a/be/src/util/thrift_server.h +++ b/be/src/util/thrift_server.h @@ -84,6 +84,7 @@ public: return _port; } + void stop(); // Blocks until the server stops and exits its main thread. void join(); diff --git a/be/src/util/uid_util.h b/be/src/util/uid_util.h index c1f51471b4..d4898c33dd 100644 --- a/be/src/util/uid_util.h +++ b/be/src/util/uid_util.h @@ -21,13 +21,75 @@ #ifndef BDG_PALO_BE_SRC_UTIL_UID_UTIL_H #define BDG_PALO_BE_SRC_UTIL_UID_UTIL_H +#include + +#include #include #include +#include #include "gen_cpp/Types_types.h" // for TUniqueId +#include "gen_cpp/types.pb.h" // for PUniqueId #include "util/debug_util.h" +#include "util/hash_util.hpp" namespace palo { + +// convert int to a hex format string, buf must enough to hold coverted hex string +template +inline void to_hex(T val, char* buf) { + static const char* digits = "0123456789ABCDEF"; + for (int i = 0; i < 2 * sizeof(T); ++i) { + buf[2 * sizeof(T) - 1 - i] = digits[val & 0x0F]; + val >>= 4; + } +} + +struct UniqueId { + int64_t hi; + int64_t lo; + + UniqueId() { + auto uuid = boost::uuids::basic_random_generator()(); + memcpy(&hi, uuid.data, sizeof(int64_t)); + memcpy(&lo, uuid.data + sizeof(int64_t), sizeof(int64_t)); + } + UniqueId(int64_t hi_, int64_t lo_) : hi(hi_), lo(lo_) { } + UniqueId(const TUniqueId& tuid) : hi(tuid.hi), lo(tuid.lo) { } + UniqueId(const PUniqueId& puid) : hi(puid.hi()), lo(puid.lo()) { } + ~UniqueId() noexcept { } + + std::string to_string() const { + char buf[33]; + to_hex(hi, buf); + buf[16] = ':'; + to_hex(lo, buf + 17); + return {buf, 33}; + } + + size_t hash(size_t seed = 0) const { + return palo::HashUtil::hash(this, sizeof(*this), seed); + } + + bool operator==(const UniqueId& rhs) const { + return hi == rhs.hi && lo == rhs.lo; + } + + TUniqueId to_thrift() const { + TUniqueId tid; + tid.__set_hi(hi); + tid.__set_lo(lo); + return tid; + } + + PUniqueId to_proto() const { + PUniqueId pid; + pid.set_hi(hi); + pid.set_lo(lo); + return pid; + } +}; + // This function must be called 'hash_value' to be picked up by boost. 
inline std::size_t hash_value(const palo::TUniqueId& id) { std::size_t seed = 0; @@ -36,7 +98,34 @@ inline std::size_t hash_value(const palo::TUniqueId& id) { return seed; } +/// generates a 16 byte UUID +inline std::string generate_uuid_string() { + return boost::uuids::to_string(boost::uuids::basic_random_generator()()); +} + +/// generates a 16 byte UUID +inline TUniqueId generate_uuid() { + auto uuid = boost::uuids::basic_random_generator()(); + TUniqueId uid; + memcpy(&uid.hi, uuid.data, sizeof(int64_t)); + memcpy(&uid.lo, uuid.data + sizeof(int64_t), sizeof(int64_t)); + return uid; +} + +std::ostream& operator<<(std::ostream& os, const UniqueId& uid); + } // namespace palo +namespace std { + +template<> +struct hash { + size_t operator()(const palo::UniqueId& uid) const { + return uid.hash(); + } +}; + +} + #endif // BDG_PALO_BE_SRC_UTIL_UID_UTIL_H diff --git a/be/src/util/url_coding.cpp b/be/src/util/url_coding.cpp index a494ba2bfb..2a9c63fc78 100644 --- a/be/src/util/url_coding.cpp +++ b/be/src/util/url_coding.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include "common/logging.h" @@ -90,6 +91,51 @@ bool url_decode(const std::string& in, std::string* out) { return true; } +static void encode_base64_internal(const std::string& in, std::string* out, + const unsigned char* basis, bool padding) { + size_t len = in.size(); + std::unique_ptr buf(new unsigned char[len]); + const unsigned char* s = reinterpret_cast(in.data()); + unsigned char* d = buf.get(); + while (len > 2) { + *d++ = basis[(s[0] >> 2) & 0x3f]; + *d++ = basis[((s[0] & 3) << 4) | (s[1] >> 4)]; + *d++ = basis[((s[1] & 0x0f) << 2) | (s[2] >> 6)]; + *d++ = basis[s[2] & 0x3f]; + + s += 3; + len -= 3; + } + if (len) { + *d++ = basis[(s[0] >> 2) & 0x3f]; + if (len == 1) { + *d++ = basis[(s[0] & 3) << 4]; + if (padding) { + *d++ = '='; + } + } else { + *d++ = basis[((s[0] & 3) << 4) | (s[1] >> 4)]; + *d++ = basis[(s[1] & 0x0f) << 2]; + } + if (padding) { + *d++ = '='; + } + } + out->assign((char*)buf.get(), d - buf.get()); +} + +void base64url_encode(const std::string& in, std::string *out) { + static unsigned char basis64[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; + encode_base64_internal(in, out, basis64, false); +} + +void base64_encode(const std::string& in, std::string* out) { + static unsigned char basis64[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + encode_base64_internal(in, out, basis64, true); +} + static char encoding_table[] = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', diff --git a/be/src/util/url_coding.h b/be/src/util/url_coding.h index 03742f3952..afff4289e0 100644 --- a/be/src/util/url_coding.h +++ b/be/src/util/url_coding.h @@ -42,6 +42,9 @@ void url_encode(const std::vector& in, std::string* out); // certain characters like ' '. bool url_decode(const std::string& in, std::string* out); +void base64url_encode(const std::string& in, std::string *out); +void base64_encode(const std::string& in, std::string *out); + // Utility method to decode base64 encoded strings. Also not extremely // performant. // Returns true unless the string could not be correctly decoded. 
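For reference, the UniqueId added in uid_util.h above formats as two fixed-width upper-case hex halves joined by ':'. The sketch below mirrors the patch's to_hex helper in a self-contained form (it is an illustration, not the Palo header):

#include <cstdint>
#include <iostream>
#include <string>

// Write val as 2*sizeof(T) upper-case hex digits into buf (no terminator),
// exactly as the to_hex helper in the patch does.
template <typename T>
void to_hex(T val, char* buf) {
    static const char* digits = "0123456789ABCDEF";
    for (size_t i = 0; i < 2 * sizeof(T); ++i) {
        buf[2 * sizeof(T) - 1 - i] = digits[val & 0x0F];
        val >>= 4;
    }
}

// hi and lo are the two 64-bit halves of a UniqueId; output is "HI:LO".
std::string unique_id_to_string(int64_t hi, int64_t lo) {
    char buf[33];
    to_hex(hi, buf);
    buf[16] = ':';
    to_hex(lo, buf + 17);
    return std::string(buf, 33);
}

int main() {
    // Prints 00000000000000FF:0000000000000001
    std::cout << unique_id_to_string(255, 1) << std::endl;
    return 0;
}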
diff --git a/be/test/agent/CMakeLists.txt b/be/test/agent/CMakeLists.txt index 31129420e9..ad205bb89e 100644 --- a/be/test/agent/CMakeLists.txt +++ b/be/test/agent/CMakeLists.txt @@ -4,10 +4,10 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/test/agent") # where to put generated binaries set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/agent") -ADD_BE_TEST(agent_server_test) +# ADD_BE_TEST(agent_server_test) ADD_BE_TEST(cgroups_mgr_test) ADD_BE_TEST(file_downloader_test) -ADD_BE_TEST(heartbeat_server_test) +#ADD_BE_TEST(heartbeat_server_test) ADD_BE_TEST(pusher_test) -ADD_BE_TEST(task_worker_pool_test) +# ADD_BE_TEST(task_worker_pool_test) ADD_BE_TEST(utils_test) diff --git a/be/test/agent/agent_server_test.cpp b/be/test/agent/agent_server_test.cpp index e53f4bd76d..3a18016f6f 100644 --- a/be/test/agent/agent_server_test.cpp +++ b/be/test/agent/agent_server_test.cpp @@ -34,9 +34,10 @@ TEST(SubmitTasksTest, TestSubmitTasks){ TAgentResult return_value; vector tasks; + ExecEnv env; TMasterInfo master_info; TNetworkAddress network_address; - AgentServer agent_server(NULL, master_info); + AgentServer agent_server(&env, master_info); // Master info not init agent_server.submit_tasks(return_value, tasks); @@ -114,9 +115,10 @@ TEST(MakeSnapshotTest, TestMakeSnapshot) { string snapshot_path; TMasterInfo master_info; + ExecEnv env; CommandExecutor* tmp; MockCommandExecutor mock_command_executor; - AgentServer agent_server(NULL, master_info); + AgentServer agent_server(&env, master_info); tmp = agent_server._command_executor; agent_server._command_executor = &mock_command_executor; @@ -146,7 +148,8 @@ TEST(ReleaseSnapshotTest, TestReleaseSnapshot) { CommandExecutor* tmp; MockCommandExecutor mock_command_executor; - AgentServer agent_server(NULL, master_info); + ExecEnv env; + AgentServer agent_server(&env, master_info); tmp = agent_server._command_executor; agent_server._command_executor = &mock_command_executor; diff --git a/be/test/agent/pusher_test.cpp b/be/test/agent/pusher_test.cpp index 92d9db254d..d208627968 100644 --- a/be/test/agent/pusher_test.cpp +++ b/be/test/agent/pusher_test.cpp @@ -42,12 +42,12 @@ TEST(PusherTest, TestInit) { TPushReq push_req; push_req.tablet_id = 1; push_req.schema_hash = 12345; - Pusher pusher(push_req); + Pusher pusher(nullptr, push_req); - CommandExecutor* tmp = NULL; + OLAPEngine* tmp = NULL; MockCommandExecutor mock_command_executor; - tmp = pusher._command_executor; - pusher._command_executor = &mock_command_executor; + tmp = pusher._engine; + pusher._engine = &mock_command_executor; OLAPTable* olap_table = NULL; // not init, can not get olap table @@ -58,7 +58,7 @@ TEST(PusherTest, TestInit) { EXPECT_EQ(PALO_PUSH_INVALID_TABLE, ret); // not init, can get olap table, and empty remote path - olap_table = new OLAPTable(new OLAPHeader("./test_data/header")); + olap_table = new OLAPTable(new OLAPHeader("./test_data/header"), nullptr); EXPECT_CALL(mock_command_executor, get_table(1, 12345)) .Times(1) .WillOnce(Return(std::shared_ptr(olap_table))); @@ -69,16 +69,16 @@ TEST(PusherTest, TestInit) { // has inited ret = pusher.init(); EXPECT_EQ(PALO_SUCCESS, ret); - pusher._command_executor = tmp; + pusher._engine = tmp; // not inited, remote path not empty string http_file_path = "http://xx"; string root_path_name = "./test_data/data"; - olap_table = new OLAPTable(new OLAPHeader("./test_data/header")); + olap_table = new OLAPTable(new OLAPHeader("./test_data/header"), nullptr); push_req.__set_http_file_path(http_file_path); - Pusher pusher2(push_req); - tmp = 
pusher2._command_executor; - pusher2._command_executor = &mock_command_executor; + Pusher pusher2(nullptr, push_req); + tmp = pusher2._engine; + pusher2._engine = &mock_command_executor; olap_table->_storage_root_path = root_path_name; EXPECT_CALL(mock_command_executor, get_table(1, 12345)) .Times(1) @@ -92,12 +92,12 @@ TEST(PusherTest, TestInit) { root_path_name.c_str(), strlen(root_path_name.c_str()))); - pusher2._command_executor = tmp; + pusher2._engine = tmp; } TEST(PusherTest, TestGetTmpFileDir) { TPushReq push_req; - Pusher pusher(push_req); + Pusher pusher(nullptr, push_req); // download path not exist string root_path = "./test_data/dpp_download_file"; @@ -113,7 +113,7 @@ TEST(PusherTest, TestGetTmpFileDir) { TEST(PusherTest, TestDownloadFile){ TPushReq push_req; - Pusher pusher(push_req); + Pusher pusher(nullptr, push_req); // download success FileDownloader::FileDownloaderParam param; @@ -135,7 +135,7 @@ TEST(PusherTest, TestDownloadFile){ TEST(PusherTest, TestGetFileNameFromPath) { TPushReq push_req; - Pusher pusher(push_req); + Pusher pusher(nullptr, push_req); string file_path = "/file_path/file_name"; string file_name; @@ -145,7 +145,7 @@ TEST(PusherTest, TestGetFileNameFromPath) { TEST(PusherTest, TestProcess) { TPushReq push_req; - Pusher pusher(push_req); + Pusher pusher(nullptr, push_req); vector tablet_infos; // not init @@ -156,9 +156,9 @@ TEST(PusherTest, TestProcess) { pusher._is_init = true; pusher._downloader_param.local_file_path = "./test_data/download_file"; MockCommandExecutor mock_command_executor; - CommandExecutor* tmp; - tmp = pusher._command_executor; - pusher._command_executor = &mock_command_executor; + OLAPEngine* tmp; + tmp = pusher._engine; + pusher._engine = &mock_command_executor; EXPECT_CALL(mock_command_executor, push(push_req, &tablet_infos)) .Times(1) .WillOnce(Return(OLAPStatus::OLAP_SUCCESS)); @@ -191,11 +191,11 @@ TEST(PusherTest, TestProcess) { EXPECT_EQ(PALO_ERROR, ret); EXPECT_FALSE(boost::filesystem::exists(download_file_path)); - pusher._command_executor = tmp; + pusher._engine = tmp; // init, remote file not empty, not set file length push_req.__set_http_file_path("http://xxx"); - Pusher pusher2(push_req); + Pusher pusher2(nullptr, push_req); pusher2._is_init = true; FileDownloader::FileDownloaderParam param; MockFileDownloader mock_file_downloader(param); @@ -236,8 +236,8 @@ TEST(PusherTest, TestProcess) { fputs("palo be test", fp); fclose(fp); now = time(NULL); - tmp = pusher2._command_executor; - pusher2._command_executor = &mock_command_executor; + tmp = pusher2._engine; + pusher2._engine = &mock_command_executor; pusher2._push_req.timeout = now + 100; pusher2._push_req.__set_http_file_size(local_file_size); EXPECT_CALL(mock_command_executor, push(_, &tablet_infos)) @@ -259,7 +259,7 @@ TEST(PusherTest, TestProcess) { ret = pusher2.process(&tablet_infos); EXPECT_EQ(PALO_SUCCESS, ret); - pusher2._command_executor = tmp; + pusher2._engine = tmp; } } // namespace palo diff --git a/be/test/agent/task_worker_pool_test.cpp b/be/test/agent/task_worker_pool_test.cpp index 05de7f44af..6e00e48093 100644 --- a/be/test/agent/task_worker_pool_test.cpp +++ b/be/test/agent/task_worker_pool_test.cpp @@ -79,6 +79,13 @@ TEST(TaskWorkerPoolTest, TestStart) { EXPECT_EQ(task_worker_pool_push._worker_count, config::push_worker_count_normal_priority + config::push_worker_count_high_priority); + TaskWorkerPool task_worker_pool_publish_version( + TaskWorkerPool::TaskWorkerType::PUBLISH_VERSION, + &env, + master_info); + 
task_worker_pool_publish_version.start(); + EXPECT_EQ(task_worker_pool_publish_version._worker_count, config::publish_version_worker_count); + TaskWorkerPool task_worker_pool_alter_table( TaskWorkerPool::TaskWorkerType::ALTER_TABLE, &env, @@ -379,6 +386,7 @@ TEST(TaskWorkerPoolTest, TestFinishTask) { task_worker_pool._master_client = original_master_server_client; } +#if 0 TEST(TaskWorkerPoolTest, TestCreateTable) { TMasterInfo master_info; ExecEnv env; @@ -433,6 +441,7 @@ TEST(TaskWorkerPoolTest, TestCreateTable) { task_worker_pool._command_executor = original_command_executor; task_worker_pool._master_client = original_master_server_client; } +#endif TEST(TaskWorkerPoolTest, TestDropTableTask) { TMasterInfo master_info; @@ -597,7 +606,7 @@ TEST(TaskWorkerPoolTest, TestSchemaChange) { agent_task_request.alter_tablet_req.base_tablet_id, agent_task_request.alter_tablet_req.base_schema_hash)) .Times(1) - .WillOnce(Return(ALTER_TABLE_DONE)); + .WillOnce(Return(ALTER_TABLE_FINISHED)); EXPECT_CALL(mock_command_executor, drop_table(_)) .Times(0); EXPECT_CALL(mock_command_executor, schema_change(_)) @@ -652,7 +661,7 @@ TEST(TaskWorkerPoolTest, TestRollup) { agent_task_request.alter_tablet_req.base_tablet_id, agent_task_request.alter_tablet_req.base_schema_hash)) .Times(1) - .WillOnce(Return(ALTER_TABLE_DONE)); + .WillOnce(Return(ALTER_TABLE_FINISHED)); EXPECT_CALL(mock_command_executor, drop_table(_)) .Times(0); EXPECT_CALL(mock_command_executor, create_rollup_table(_)) @@ -792,6 +801,61 @@ TEST(TaskWorkerPoolTest, TestPush) { task_worker_pool._pusher = original_pusher; } +TEST(TaskWorkerPoolTest, TestPublishVersionTask) { + TMasterInfo master_info; + ExecEnv env; + TAgentTaskRequest agent_task_request; + agent_task_request.task_type = TTaskType::PUBLISH_VERSION; + agent_task_request.signature = 123456; + TaskWorkerPool task_worker_pool( + TaskWorkerPool::TaskWorkerType::PUBLISH_VERSION, + &env, + master_info); + + MockCommandExecutor mock_command_executor; + CommandExecutor* original_command_executor; + original_command_executor = task_worker_pool._command_executor; + task_worker_pool._command_executor = &mock_command_executor; + FrontendServiceClientCache* client_cache = new FrontendServiceClientCache(); + MockMasterServerClient mock_master_server_client(master_info, client_cache); + MasterServerClient* original_master_server_client; + original_master_server_client = task_worker_pool._master_client; + task_worker_pool._master_client = &mock_master_server_client; + + // publish version failed + EXPECT_CALL(mock_command_executor, publish_version(_, _)) + .Times(3) + .WillRepeatedly(Return(OLAPStatus::OLAP_ERR_OTHER_ERROR)); + EXPECT_CALL(mock_master_server_client, finish_task(_, _)) + .Times(1) + .WillOnce(Return(PALO_SUCCESS)); + + task_worker_pool.submit_task(agent_task_request); + EXPECT_EQ(1, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(1, task_worker_pool._tasks.size()); + task_worker_pool._publish_version_worker_thread_callback(&task_worker_pool); + EXPECT_EQ(0, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(0, task_worker_pool._tasks.size()); + + // publish version success + EXPECT_CALL(mock_command_executor, publish_version(_, _)) + .Times(1) + .WillOnce(Return(OLAPStatus::OLAP_SUCCESS)); + EXPECT_CALL(mock_master_server_client, finish_task(_, _)) + .Times(1) + .WillOnce(Return(PALO_SUCCESS)); + + task_worker_pool.submit_task(agent_task_request); + EXPECT_EQ(1, 
task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(1, task_worker_pool._tasks.size()); + task_worker_pool._publish_version_worker_thread_callback(&task_worker_pool); + EXPECT_EQ(0, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(0, task_worker_pool._tasks.size()); + + task_worker_pool._command_executor = original_command_executor; + task_worker_pool._master_client = original_master_server_client; +} + TEST(TaskWorkerPoolTest, TestClone) { TMasterInfo master_info; ExecEnv env; @@ -827,17 +891,142 @@ TEST(TaskWorkerPoolTest, TestClone) { original_agent_utils = task_worker_pool._agent_utils; task_worker_pool._agent_utils = &mock_agent_utils; - // Tablet has exist, get tablet info failed - agent_task_request.clone_req.tablet_id = 123; - agent_task_request.clone_req.schema_hash = 456; - std::shared_ptr olap_table_ok(new OLAPTable(NULL)); + // Tablet has exist + // incremental clone's make snapshot failed + // full clone's make snapshot failed + TCloneReq clone_req; + TBackend backend1; + TBackend backend2; + TBackend backend3; + clone_req.src_backends.push_back(backend1); + clone_req.src_backends.push_back(backend2); + clone_req.src_backends.push_back(backend3); + clone_req.tablet_id = 123; + clone_req.schema_hash = 456; + + TAgentResult agent_result; + agent_result.status.status_code = TStatusCode::INTERNAL_ERROR; + agent_task_request.__set_clone_req(clone_req); + + TSnapshotRequest snapshot_request; + snapshot_request.__set_tablet_id(agent_task_request.clone_req.tablet_id); + snapshot_request.__set_schema_hash(agent_task_request.clone_req.schema_hash); + + TSnapshotRequest snapshot_request2; + snapshot_request2.__set_tablet_id(agent_task_request.clone_req.tablet_id); + snapshot_request2.__set_schema_hash(agent_task_request.clone_req.schema_hash); + std::vector missing_versions; + snapshot_request2.__set_missing_version(missing_versions); + + std::shared_ptr olap_table_ok(new OLAPTable(NULL, nullptr)); EXPECT_CALL(mock_command_executor, get_table( agent_task_request.clone_req.tablet_id, agent_task_request.clone_req.schema_hash)) .Times(1) .WillOnce(Return(olap_table_ok)); - EXPECT_CALL(mock_command_executor, obtain_shard_path(_, _)) + EXPECT_CALL(mock_command_executor, get_info_before_incremental_clone(_, _, _)) + .Times(1); + EXPECT_CALL(mock_agent_server_client, make_snapshot(snapshot_request2, _)) + .Times(clone_req.src_backends.size()) + .WillRepeatedly(DoAll(SetArgPointee<1>(agent_result), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_agent_server_client, make_snapshot(snapshot_request, _)) + .Times(clone_req.src_backends.size()) + .WillRepeatedly(DoAll(SetArgPointee<1>(agent_result), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_agent_server_client, release_snapshot(_, _)) .Times(0); + EXPECT_CALL(mock_command_executor, finish_clone(_, _, _, _)) + .Times(0); + EXPECT_CALL(mock_command_executor, report_tablet_info(_)) + .Times(0); + EXPECT_CALL(mock_master_server_client, finish_task(_, _)) + .Times(1) + .WillOnce(Return(PALO_SUCCESS)); + + task_worker_pool.submit_task(agent_task_request); + EXPECT_EQ(1, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(1, task_worker_pool._tasks.size()); + task_worker_pool._clone_worker_thread_callback(&task_worker_pool); + EXPECT_EQ(0, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(0, task_worker_pool._tasks.size()); + + // Tablet has exist + // incremental clone's make snapshot success + // incremental clone 
failed + TAgentResult agent_result2; + agent_result2.__set_snapshot_path("path"); + agent_result2.status.status_code = TStatusCode::OK; + + EXPECT_CALL(mock_command_executor, get_table( + agent_task_request.clone_req.tablet_id, + agent_task_request.clone_req.schema_hash)) + .Times(1) + .WillOnce(Return(olap_table_ok)); + EXPECT_CALL(mock_command_executor, get_info_before_incremental_clone(_, _, _)) + .Times(1) + .WillOnce(Return("./test_data/5/6")); + EXPECT_CALL(mock_agent_server_client, make_snapshot(snapshot_request2, _)) + .Times(1) + .WillOnce(DoAll(SetArgPointee<1>(agent_result2), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, list_file_dir(_)) + .Times(1) + .WillOnce( + DoAll(SetArgPointee<0>("1.hdr\n1.idx\n1.dat"), Return(PALO_SUCCESS))); + uint64_t file_size = 4; + EXPECT_CALL(mock_file_downloader, get_length(_)) + .Times(3) + .WillRepeatedly(DoAll(SetArgPointee<0>(file_size), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, download_file()) + .Times(3) + .WillRepeatedly(Return(PALO_SUCCESS)); + EXPECT_CALL(mock_agent_server_client, release_snapshot(_, _)) + .Times(1) + .WillOnce(DoAll(SetArgPointee<1>(agent_result2), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_command_executor, finish_clone(_, _, _, _)) + .Times(1) + .WillOnce(Return(OLAP_ERR_OTHER_ERROR)); + EXPECT_CALL(mock_command_executor, report_tablet_info(_)) + .Times(0); + EXPECT_CALL(mock_master_server_client, finish_task(_, _)) + .Times(1) + .WillOnce(Return(PALO_SUCCESS)); + + task_worker_pool.submit_task(agent_task_request); + EXPECT_EQ(1, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(1, task_worker_pool._tasks.size()); + task_worker_pool._clone_worker_thread_callback(&task_worker_pool); + EXPECT_EQ(0, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(0, task_worker_pool._tasks.size()); + + // Tablet has exist + // incremental clone success + // get tablet info failed + EXPECT_CALL(mock_command_executor, get_table( + agent_task_request.clone_req.tablet_id, + agent_task_request.clone_req.schema_hash)) + .Times(1) + .WillOnce(Return(olap_table_ok)); + EXPECT_CALL(mock_command_executor, get_info_before_incremental_clone(_, _, _)) + .Times(1) + .WillOnce(Return("./test_data/5/6")); + EXPECT_CALL(mock_agent_server_client, make_snapshot(snapshot_request2, _)) + .Times(1) + .WillOnce(DoAll(SetArgPointee<1>(agent_result2), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, list_file_dir(_)) + .Times(1) + .WillOnce( + DoAll(SetArgPointee<0>("1.hdr\n1.idx\n1.dat"), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, get_length(_)) + .Times(3) + .WillRepeatedly(DoAll(SetArgPointee<0>(file_size), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, download_file()) + .Times(3) + .WillRepeatedly(Return(PALO_SUCCESS)); + EXPECT_CALL(mock_agent_server_client, release_snapshot(_, _)) + .Times(1) + .WillOnce(DoAll(SetArgPointee<1>(agent_result2), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_command_executor, finish_clone(_, _, _, _)) + .Times(1) + .WillOnce(Return(OLAP_SUCCESS)); EXPECT_CALL(mock_command_executor, report_tablet_info(_)) .Times(1) .WillOnce(Return(OLAPStatus::OLAP_ERR_OTHER_ERROR)); @@ -852,6 +1041,100 @@ TEST(TaskWorkerPoolTest, TestClone) { EXPECT_EQ(0, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); EXPECT_EQ(0, task_worker_pool._tasks.size()); + // Tablet has exist + // incremental clone's make snapshot failed + // full clone's make snapshot success + // full clone 
failed + EXPECT_CALL(mock_command_executor, get_table( + agent_task_request.clone_req.tablet_id, + agent_task_request.clone_req.schema_hash)) + .Times(1) + .WillOnce(Return(olap_table_ok)); + EXPECT_CALL(mock_command_executor, get_info_before_incremental_clone(_, _, _)) + .Times(1) + .WillOnce(Return("./test_data/5/6")); + EXPECT_CALL(mock_agent_server_client, make_snapshot(snapshot_request2, _)) + .Times(clone_req.src_backends.size()) + .WillRepeatedly(DoAll(SetArgPointee<1>(agent_result), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_agent_server_client, make_snapshot(snapshot_request, _)) + .Times(1) + .WillOnce(DoAll(SetArgPointee<1>(agent_result2), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, list_file_dir(_)) + .Times(1) + .WillOnce( + DoAll(SetArgPointee<0>("1.hdr\n1.idx\n1.dat"), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, get_length(_)) + .Times(3) + .WillRepeatedly(DoAll(SetArgPointee<0>(file_size), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, download_file()) + .Times(3) + .WillRepeatedly(Return(PALO_SUCCESS)); + EXPECT_CALL(mock_agent_server_client, release_snapshot(_, _)) + .Times(1) + .WillOnce(DoAll(SetArgPointee<1>(agent_result2), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_command_executor, finish_clone(_, _, _, _)) + .Times(1) + .WillOnce(Return(OLAP_ERR_OTHER_ERROR)); + EXPECT_CALL(mock_command_executor, report_tablet_info(_)) + .Times(0); + EXPECT_CALL(mock_master_server_client, finish_task(_, _)) + .Times(1) + .WillOnce(Return(PALO_SUCCESS)); + + task_worker_pool.submit_task(agent_task_request); + EXPECT_EQ(1, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(1, task_worker_pool._tasks.size()); + task_worker_pool._clone_worker_thread_callback(&task_worker_pool); + EXPECT_EQ(0, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(0, task_worker_pool._tasks.size()); + + // Tablet has exist + // incremental clone's make snapshot failed + // full clone's make snapshot success + // full clone success + EXPECT_CALL(mock_command_executor, get_table( + agent_task_request.clone_req.tablet_id, + agent_task_request.clone_req.schema_hash)) + .Times(1) + .WillOnce(Return(olap_table_ok)); + EXPECT_CALL(mock_command_executor, get_info_before_incremental_clone(_, _, _)) + .Times(1) + .WillOnce(Return("./test_data/5/6")); + EXPECT_CALL(mock_agent_server_client, make_snapshot(snapshot_request2, _)) + .Times(clone_req.src_backends.size()) + .WillRepeatedly(DoAll(SetArgPointee<1>(agent_result), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_agent_server_client, make_snapshot(snapshot_request, _)) + .Times(1) + .WillOnce(DoAll(SetArgPointee<1>(agent_result2), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, list_file_dir(_)) + .Times(1) + .WillOnce( + DoAll(SetArgPointee<0>("1.hdr\n1.idx\n1.dat"), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, get_length(_)) + .Times(3) + .WillRepeatedly(DoAll(SetArgPointee<0>(file_size), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_file_downloader, download_file()) + .Times(3) + .WillRepeatedly(Return(PALO_SUCCESS)); + EXPECT_CALL(mock_agent_server_client, release_snapshot(_, _)) + .Times(1) + .WillOnce(DoAll(SetArgPointee<1>(agent_result2), Return(PALO_SUCCESS))); + EXPECT_CALL(mock_command_executor, finish_clone(_, _, _, _)) + .Times(1) + .WillOnce(Return(OLAP_SUCCESS)); + EXPECT_CALL(mock_command_executor, report_tablet_info(_)) + .Times(1); + EXPECT_CALL(mock_master_server_client, finish_task(_, _)) + .Times(1) + 
.WillOnce(Return(PALO_SUCCESS)); + + task_worker_pool.submit_task(agent_task_request); + EXPECT_EQ(1, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(1, task_worker_pool._tasks.size()); + task_worker_pool._clone_worker_thread_callback(&task_worker_pool); + EXPECT_EQ(0, task_worker_pool._s_task_signatures[agent_task_request.task_type].size()); + EXPECT_EQ(0, task_worker_pool._tasks.size()); + // Tablet not exist, obtain root path failed, do not get tablet info std::shared_ptr olap_table_null(NULL); EXPECT_CALL(mock_command_executor, get_table( @@ -876,20 +1159,7 @@ TEST(TaskWorkerPoolTest, TestClone) { EXPECT_EQ(0, task_worker_pool._tasks.size()); // Tablet not exist, obtain root path success, make snapshot failed - TCloneReq clone_req; - TBackend backend1; - TBackend backend2; - TBackend backend3; - clone_req.src_backends.push_back(backend1); - clone_req.src_backends.push_back(backend2); - clone_req.src_backends.push_back(backend3); - TAgentResult agent_result; - agent_result.status.status_code = TStatusCode::INTERNAL_ERROR; - TAgentResult agent_result2; - agent_result2.status.status_code = TStatusCode::OK; agent_result2.__isset.snapshot_path = false; - agent_task_request.__set_clone_req(clone_req); - EXPECT_CALL(mock_command_executor, get_table( agent_task_request.clone_req.tablet_id, agent_task_request.clone_req.schema_hash)) @@ -898,9 +1168,6 @@ TEST(TaskWorkerPoolTest, TestClone) { EXPECT_CALL(mock_command_executor, obtain_shard_path(_, _)) .Times(1) .WillOnce(Return(OLAPStatus::OLAP_SUCCESS)); - TSnapshotRequest snapshot_request; - snapshot_request.__set_tablet_id(agent_task_request.clone_req.tablet_id); - snapshot_request.__set_schema_hash(agent_task_request.clone_req.schema_hash); EXPECT_CALL(mock_agent_server_client, make_snapshot(snapshot_request, _)) .Times(clone_req.src_backends.size()) .WillOnce(DoAll(SetArgPointee<1>(agent_result), Return(PALO_SUCCESS))) @@ -1067,7 +1334,7 @@ TEST(TaskWorkerPoolTest, TestClone) { .Times(clone_req.src_backends.size()) .WillRepeatedly( DoAll(SetArgPointee<0>("1.hdr\n1.idx\n1.dat"), Return(PALO_SUCCESS))); - uint64_t file_size = 5; + file_size = 5; EXPECT_CALL(mock_file_downloader, get_length(_)) .Times(clone_req.src_backends.size()) .WillRepeatedly(DoAll(SetArgPointee<0>(file_size), Return(PALO_SUCCESS))); diff --git a/be/test/exec/CMakeLists.txt b/be/test/exec/CMakeLists.txt index 3b248ca0f5..7bd208ebc7 100644 --- a/be/test/exec/CMakeLists.txt +++ b/be/test/exec/CMakeLists.txt @@ -46,6 +46,8 @@ ADD_BE_TEST(plain_text_line_reader_lzop_test) ADD_BE_TEST(broker_reader_test) ADD_BE_TEST(broker_scanner_test) ADD_BE_TEST(broker_scan_node_test) +ADD_BE_TEST(olap_table_info_test) +ADD_BE_TEST(olap_table_sink_test) #ADD_BE_TEST(schema_scan_node_test) #ADD_BE_TEST(schema_scanner_test) ##ADD_BE_TEST(set_executor_test) diff --git a/be/test/exec/broker_scan_node_test.cpp b/be/test/exec/broker_scan_node_test.cpp index 02360742c1..8c04e56a4f 100644 --- a/be/test/exec/broker_scan_node_test.cpp +++ b/be/test/exec/broker_scan_node_test.cpp @@ -38,6 +38,7 @@ class BrokerScanNodeTest : public testing::Test { public: BrokerScanNodeTest() : _runtime_state("BrokerScanNodeTest") { init(); + _runtime_state._instance_mem_tracker.reset(new MemTracker()); } void init(); static void SetUpTestCase() { @@ -395,8 +396,9 @@ TEST_F(BrokerScanNodeTest, normal) { status = scan_node.open(&_runtime_state); ASSERT_TRUE(status.ok()); + MemTracker tracker; // Get batch - RowBatch batch(scan_node.row_desc(), 1024, 
_runtime_state.instance_mem_tracker()); + RowBatch batch(scan_node.row_desc(), _runtime_state.batch_size(), &tracker); bool eos = false; status = scan_node.get_next(&_runtime_state, &batch, &eos); diff --git a/be/test/exec/broker_scanner_test.cpp b/be/test/exec/broker_scanner_test.cpp index 5ac39d2fcd..8062a08a06 100644 --- a/be/test/exec/broker_scanner_test.cpp +++ b/be/test/exec/broker_scanner_test.cpp @@ -25,6 +25,7 @@ #include "runtime/tuple.h" #include "exec/local_file_reader.h" #include "runtime/descriptors.h" +#include "runtime/mem_tracker.h" #include "runtime/runtime_state.h" #include "runtime/lib_cache.h" #include "gen_cpp/Descriptors_types.h" @@ -38,6 +39,7 @@ public: BrokerScannerTest() : _runtime_state("BrokerScannerTest") { init(); _profile = _runtime_state.runtime_profile(); + _runtime_state._instance_mem_tracker.reset(new MemTracker()); } void init(); @@ -55,6 +57,7 @@ private: void init_desc_table(); void init_params(); + MemTracker _tracker; RuntimeState _runtime_state; RuntimeProfile* _profile; ObjectPool _obj_pool; @@ -356,7 +359,7 @@ TEST_F(BrokerScannerTest, normal) { auto st = scanner.open(); ASSERT_TRUE(st.ok()); - MemPool tuple_pool(_runtime_state.instance_mem_tracker()); + MemPool tuple_pool(&_tracker); Tuple* tuple = (Tuple*)tuple_pool.allocate(20); bool eof = false; // 1,2,3 @@ -408,7 +411,8 @@ TEST_F(BrokerScannerTest, normal2) { auto st = scanner.open(); ASSERT_TRUE(st.ok()); - MemPool tuple_pool(_runtime_state.instance_mem_tracker()); + MemTracker tracker; + MemPool tuple_pool(&tracker); Tuple* tuple = (Tuple*)tuple_pool.allocate(20); bool eof = false; // 1,2,3 @@ -454,7 +458,8 @@ TEST_F(BrokerScannerTest, normal3) { auto st = scanner.open(); ASSERT_TRUE(st.ok()); - MemPool tuple_pool(_runtime_state.instance_mem_tracker()); + MemTracker tracker; + MemPool tuple_pool(&tracker); Tuple* tuple = (Tuple*)tuple_pool.allocate(20); bool eof = false; // 1,2,3 @@ -501,7 +506,8 @@ TEST_F(BrokerScannerTest, normal4) { auto st = scanner.open(); ASSERT_TRUE(st.ok()); - MemPool tuple_pool(_runtime_state.instance_mem_tracker()); + MemTracker tracker; + MemPool tuple_pool(&tracker); Tuple* tuple = (Tuple*)tuple_pool.allocate(20); bool eof = false; // 1,2,3 @@ -532,7 +538,8 @@ TEST_F(BrokerScannerTest, normal5) { auto st = scanner.open(); ASSERT_TRUE(st.ok()); - MemPool tuple_pool(_runtime_state.instance_mem_tracker()); + MemTracker tracker; + MemPool tuple_pool(&tracker); Tuple* tuple = (Tuple*)tuple_pool.allocate(20); bool eof = false; // end of file @@ -556,7 +563,8 @@ TEST_F(BrokerScannerTest, normal6) { auto st = scanner.open(); ASSERT_TRUE(st.ok()); - MemPool tuple_pool(_runtime_state.instance_mem_tracker()); + MemTracker tracker; + MemPool tuple_pool(&tracker); Tuple* tuple = (Tuple*)tuple_pool.allocate(20); bool eof = false; // 4,5,6 @@ -587,7 +595,8 @@ TEST_F(BrokerScannerTest, normal7) { auto st = scanner.open(); ASSERT_TRUE(st.ok()); - MemPool tuple_pool(_runtime_state.instance_mem_tracker()); + MemTracker tracker; + MemPool tuple_pool(&tracker); Tuple* tuple = (Tuple*)tuple_pool.allocate(20); bool eof = false; // end of file @@ -611,7 +620,8 @@ TEST_F(BrokerScannerTest, normal8) { auto st = scanner.open(); ASSERT_TRUE(st.ok()); - MemPool tuple_pool(_runtime_state.instance_mem_tracker()); + MemTracker tracker; + MemPool tuple_pool(&tracker); Tuple* tuple = (Tuple*)tuple_pool.allocate(20); bool eof = false; // 4,5,6 @@ -642,7 +652,8 @@ TEST_F(BrokerScannerTest, normal9) { auto st = scanner.open(); ASSERT_TRUE(st.ok()); - MemPool 
tuple_pool(_runtime_state.instance_mem_tracker()); + MemTracker tracker; + MemPool tuple_pool(&tracker); Tuple* tuple = (Tuple*)tuple_pool.allocate(20); bool eof = false; // end of file diff --git a/be/test/http/CMakeLists.txt b/be/test/http/CMakeLists.txt index fb08d9f75c..ffac03a2f5 100644 --- a/be/test/http/CMakeLists.txt +++ b/be/test/http/CMakeLists.txt @@ -17,3 +17,6 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/http") ADD_BE_TEST(metrics_action_test) +ADD_BE_TEST(message_body_sink_test) +ADD_BE_TEST(http_utils_test) +ADD_BE_TEST(stream_load_test) diff --git a/be/test/olap/CMakeLists.txt b/be/test/olap/CMakeLists.txt index 179942c268..e743c68b19 100644 --- a/be/test/olap/CMakeLists.txt +++ b/be/test/olap/CMakeLists.txt @@ -5,7 +5,6 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/test/olap") set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/olap") ADD_BE_TEST(row_block_test) -ADD_BE_TEST(command_executor_test) ADD_BE_TEST(bit_field_test) ADD_BE_TEST(byte_buffer_test) ADD_BE_TEST(run_length_byte_test) @@ -19,10 +18,11 @@ ADD_BE_TEST(in_list_predicate_test) ADD_BE_TEST(null_predicate_test) ADD_BE_TEST(file_helper_test) ADD_BE_TEST(file_utils_test) -# ADD_BE_TEST(delete_handler_test) -# ADD_BE_TEST(column_reader_test) -# ADD_BE_TEST(row_cursor_test) - -## deleted -# ADD_BE_TEST(olap_reader_test) -# ADD_BE_TEST(vectorized_olap_reader_test) +ADD_BE_TEST(delete_handler_test) +ADD_BE_TEST(column_reader_test) +ADD_BE_TEST(row_cursor_test) +ADD_BE_TEST(skiplist_test) +ADD_BE_TEST(delta_writer_test) +ADD_BE_TEST(serialize_test) +ADD_BE_TEST(olap_meta_test) +ADD_BE_TEST(olap_header_manager_test) diff --git a/be/test/olap/bloom_filter_test.cpp b/be/test/olap/bloom_filter_test.cpp index f6df4501a5..2790bbafb1 100644 --- a/be/test/olap/bloom_filter_test.cpp +++ b/be/test/olap/bloom_filter_test.cpp @@ -120,35 +120,35 @@ TEST_F(TestBloomFilter, bloom_filter_info) { bytes = "a"; points = bf.get_bytes_points_string(bytes.c_str(), bytes.size()); - OLAP_LOG_WARNING("bytes=%s points=%s", bytes.c_str(), points.c_str()); + LOG(WARNING) << "bytes=" << bytes << " points=" << points; bytes = "ab"; points = bf.get_bytes_points_string(bytes.c_str(), bytes.size()); - OLAP_LOG_WARNING("bytes=%s points=%s", bytes.c_str(), points.c_str()); + LOG(WARNING) << "bytes=" << bytes << " points=" << points; bytes = "b"; points = bf.get_bytes_points_string(bytes.c_str(), bytes.size()); - OLAP_LOG_WARNING("bytes=%s points=%s", bytes.c_str(), points.c_str()); + LOG(WARNING) << "bytes=" << bytes << " points=" << points; bytes = "ba"; points = bf.get_bytes_points_string(bytes.c_str(), bytes.size()); - OLAP_LOG_WARNING("bytes=%s points=%s", bytes.c_str(), points.c_str()); + LOG(WARNING) << "bytes=" << bytes << " points=" << points; bytes = "c"; points = bf.get_bytes_points_string(bytes.c_str(), bytes.size()); - OLAP_LOG_WARNING("bytes=%s points=%s", bytes.c_str(), points.c_str()); + LOG(WARNING) << "bytes=" << bytes << " points=" << points; bytes = "bc"; points = bf.get_bytes_points_string(bytes.c_str(), bytes.size()); - OLAP_LOG_WARNING("bytes=%s points=%s", bytes.c_str(), points.c_str()); + LOG(WARNING) << "bytes=" << bytes << " points=" << points; bytes = "ac"; points = bf.get_bytes_points_string(bytes.c_str(), bytes.size()); - OLAP_LOG_WARNING("bytes=%s points=%s", bytes.c_str(), points.c_str()); + LOG(WARNING) << "bytes=" << bytes << " points=" << points; bytes = "abc"; points = bf.get_bytes_points_string(bytes.c_str(), bytes.size()); - OLAP_LOG_WARNING("bytes=%s points=%s", bytes.c_str(), points.c_str()); + LOG(WARNING) << "bytes=" 
<< bytes << " points=" << points; } } // namespace column_file diff --git a/be/test/olap/column_reader_test.cpp b/be/test/olap/column_reader_test.cpp index bb7d5f87a2..74aac65862 100644 --- a/be/test/olap/column_reader_test.cpp +++ b/be/test/olap/column_reader_test.cpp @@ -275,13 +275,21 @@ TEST_F(TestColumn, VectorizedTinyColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + char value = 1; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 3; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -323,19 +331,30 @@ TEST_F(TestColumn, SeekTinyColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + char value = 1; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 2; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -398,17 +417,26 @@ TEST_F(TestColumn, SkipTinyColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + char value = 1; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 2; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); value = 3; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(2, write_row); + block.finalize(3); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -445,13 +473,22 @@ TEST_F(TestColumn, VectorizedTinyColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); write_row.set_not_null(0); char value = 3; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + + 
ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -490,13 +527,22 @@ TEST_F(TestColumn, TinyColumnIndex) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + char value = 1; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 3; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -535,19 +581,31 @@ TEST_F(TestColumn, SeekTinyColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + char value = 1; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 2; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -609,17 +667,26 @@ TEST_F(TestColumn, SkipTinyColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + char value = 1; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 2; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); value = 3; write_row.set_field_content(0, &value, _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(2, write_row); + block.finalize(3); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -656,13 +723,22 @@ TEST_F(TestColumn, VectorizedShortColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + int16_t value = 1; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 3; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); 
ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -701,19 +777,29 @@ TEST_F(TestColumn, SeekShortColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + int16_t value = 1; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 2; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -776,17 +862,26 @@ TEST_F(TestColumn, SkipShortColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + int16_t value = 1; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 2; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); value = 3; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(2, write_row); + block.finalize(3); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -823,19 +918,29 @@ TEST_F(TestColumn, SeekShortColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + int16_t value = 1; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 2; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -896,13 +1001,21 @@ TEST_F(TestColumn, VectorizedShortColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); int16_t value = 3; 
write_row.set_not_null(0); write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -943,17 +1056,25 @@ TEST_F(TestColumn, SkipShortColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + int16_t value = 1; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 2; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); value = 3; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(2, write_row); + block.finalize(3); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -990,13 +1111,22 @@ TEST_F(TestColumn, VectorizedIntColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 1024; + block.init(block_info); + int32_t value = 1; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); value = 3; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1035,11 +1165,18 @@ TEST_F(TestColumn, VectorizedIntColumnMassWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + for (int32_t i = 0; i < 10000; i++) { write_row.set_field_content(0, reinterpret_cast(&i), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(i, write_row); } + block.finalize(10000); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1084,12 +1221,22 @@ TEST_F(TestColumn, VectorizedIntColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + int32_t value = -1; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, 
&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1131,13 +1278,23 @@ TEST_F(TestColumn, VectorizedLongColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + int64_t value = 1; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); value = 3; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1176,13 +1333,23 @@ TEST_F(TestColumn, VectorizedLongColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); int64_t value = 3; write_row.set_not_null(0); write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1223,13 +1390,23 @@ TEST_F(TestColumn, VectorizedFloatColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + float value = 1.234; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); value = 3.234; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1269,13 +1446,23 @@ TEST_F(TestColumn, VectorizedFloatColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); float value = 3.234; write_row.set_not_null(0); write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, 
&write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1316,15 +1503,25 @@ TEST_F(TestColumn, SeekFloatColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + float value = 1.234; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); value = 3.234; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -1379,13 +1576,23 @@ TEST_F(TestColumn, SkipFloatColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + float value = 1.234; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); value = 3.234; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1422,13 +1629,23 @@ TEST_F(TestColumn, VectorizedDoubleColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + double value = 1.23456789; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); value = 3.23456789; write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1468,13 +1685,23 @@ TEST_F(TestColumn, VectorizedDoubleColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); double value = 3.23456789; write_row.set_not_null(0); write_row.set_field_content(0, reinterpret_cast(&value), _mem_pool.get()); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + 
ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1516,11 +1743,19 @@ TEST_F(TestColumn, VectorizedDatetimeColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("2000-10-10 10:10:10"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple(val_string_array); + write_row.from_tuple(tuple); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1557,14 +1792,25 @@ TEST_F(TestColumn, VectorizedDatetimeColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); std::vector val_string_array; val_string_array.push_back("2000-10-10 10:10:10"); - write_row.from_string(val_string_array); + OlapTuple tuple(val_string_array); + write_row.from_tuple(tuple); write_row.set_not_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1610,11 +1856,19 @@ TEST_F(TestColumn, VectorizedDateColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("2000-10-10"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple(val_string_array); + write_row.from_tuple(tuple); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1650,16 +1904,26 @@ TEST_F(TestColumn, VectorizedDateColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); std::vector val_string_array; val_string_array.push_back("2000-10-10"); - write_row.from_string(val_string_array); + OlapTuple tuple(val_string_array); + write_row.from_tuple(tuple); for (uint32_t i = 0; i < 100; ++i) { write_row.set_not_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); } ColumnDataHeaderMessage header; @@ -1704,16 +1968,28 @@ 
TEST_F(TestColumn, VectorizedDecimalColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("1234.5678"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back("5678.1234"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1753,16 +2029,27 @@ TEST_F(TestColumn, VectorizedDecimalColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back("5678.1234"); - write_row.from_string(val_string_array); + OlapTuple tuple(val_string_array); + write_row.from_tuple(tuple); write_row.set_not_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1803,16 +2090,25 @@ TEST_F(TestColumn, SkipDecimalColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("1234.5678"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + write_row.from_tuple(val_string_array); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back("5678.1234"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + write_row.from_tuple(val_string_array); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1851,18 +2147,29 @@ TEST_F(TestColumn, SeekDecimalColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("1234.5678"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + 
write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); val_string_array.clear(); val_string_array.push_back("5678.1234"); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); create_and_save_last_position(); @@ -1929,15 +2236,26 @@ TEST_F(TestColumn, VectorizedLargeIntColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back(value1); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back(value2); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -1981,21 +2299,34 @@ TEST_F(TestColumn, VectorizedLargeIntColumnWithPresent) { CreateColumnWriter(tablet_schema); RowCursor write_row; write_row.init(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); std::vector val_string_array; val_string_array.push_back(value1); - write_row.from_string(val_string_array); + OlapTuple tuple1(val_string_array); + write_row.from_tuple(tuple1); write_row.set_not_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back(value2); - write_row.from_string(val_string_array); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); write_row.set_not_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -2047,16 +2378,27 @@ TEST_F(TestColumn, SkipLargeIntColumnWithPresent) { CreateColumnWriter(tablet_schema); RowCursor write_row; write_row.init(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); std::vector val_string_array; val_string_array.push_back(value1); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + 
write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back(value2); - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -2076,80 +2418,6 @@ TEST_F(TestColumn, SkipLargeIntColumnWithPresent) { ASSERT_TRUE(strncmp(read_row.to_string().c_str(), value2.c_str(), value2.size()) == 0); } -// TODO(jiangguoqiang): this test has a problem. Need to fix it. -// TEST_F(TestColumn, SeekLargeIntColumnWithPresent) { - // // init table schema - // std::vector tablet_schema; - // FieldInfo field_info; - // SetFieldInfo(field_info, - // std::string("LargeIntColumnWithPresent"), - // OLAP_FIELD_TYPE_LARGEINT, - // OLAP_FIELD_AGGREGATION_SUM, - // 16, - // true, - // true); - // tablet_schema.push_back(field_info); - - // // test data - // string value1 = "100000000000000000000000000000000000000"; - // string value2 = "-170141183460469231731687303715884105728"; - // string value3 = "65535"; - - // // write data - // CreateColumnWriter(tablet_schema); - // RowCursor write_row; - // write_row.init(tablet_schema); - - // std::vector val_string_array; - // val_string_array.push_back(value1); - // write_row.from_string(val_string_array); - // ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - // val_string_array.clear(); - // val_string_array.push_back(value2); - // write_row.from_string(val_string_array); - // ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - // create_and_save_last_position(); - - // val_string_array.clear(); - // val_string_array.push_back(value3); - // write_row.from_string(val_string_array); - // ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - - // create_and_save_last_position(); - - // ColumnDataHeaderMessage header; - // ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - - // // read data - // CreateColumnReader(tablet_schema); - // RowCursor read_row; - // read_row.init(tablet_schema); - // PositionEntryReader entry0; - // entry0._positions = _column_writer->index()->mutable_entry(0)->_positions; - // entry0._positions_count = _column_writer->index()->mutable_entry(0)->_positions_count; - // entry0._statistics.init(OLAP_FIELD_TYPE_LARGEINT); - - // PositionEntryReader entry1; - // entry1._positions = _column_writer->index()->mutable_entry(1)->_positions; - // entry1._positions_count = _column_writer->index()->mutable_entry(1)->_positions_count; - // entry1._statistics.init(OLAP_FIELD_TYPE_LARGEINT); - - // PositionProvider position0(&entry0); - // PositionProvider position1(&entry1); - - // ASSERT_EQ(_column_reader->seek(&position0), OLAP_SUCCESS); - // ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - // ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - // ASSERT_TRUE(strncmp(read_row.to_string().c_str(), value1.c_str(), value1.size()) == 0); - - // ASSERT_EQ(_column_reader->seek(&position1), OLAP_SUCCESS); - // ASSERT_EQ(_column_reader->next(), OLAP_SUCCESS); - // ASSERT_EQ(_column_reader->attach(&read_row), OLAP_SUCCESS); - // ASSERT_TRUE(strncmp(read_row.to_string().c_str(), value3.c_str(), value3.size()) == 0); -// } - 
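// Illustrative fragment (not part of this patch): the varchar/char hunks below,
// like the datetime/date/decimal hunks above, build an OlapTuple from the string
// values and pass it to RowCursor::from_tuple() where the old tests called
// from_string(). The fixture objects (write_row, block, _column_writer) are
// assumed to be set up exactly as in the surrounding tests.
std::vector<std::string> val_string_array;
val_string_array.push_back("YWJjZGU=");                 // "abcde" base64-encoded

OlapTuple tuple(val_string_array);                      // wrap the strings in a tuple
ASSERT_EQ(OLAP_SUCCESS, write_row.from_tuple(tuple));   // was: write_row.from_string(...)

block.set_row(0, write_row);
block.finalize(1);
ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS);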
TEST_F(TestColumn, VectorizedDirectVarcharColumnWithoutPresent) { // write data std::vector tablet_schema; @@ -2168,19 +2436,32 @@ TEST_F(TestColumn, VectorizedDirectVarcharColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); write_row.allocate_memory_for_string_type(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); } val_string_array.clear(); val_string_array.push_back("ZWRjYmE="); //"edcba" base_64_encode is "ZWRjYmE=" - write_row.from_string(val_string_array); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); } ColumnDataHeaderMessage header; @@ -2230,14 +2511,24 @@ TEST_F(TestColumn, VectorizedDirectVarcharColumnWithPresent) { write_row.init(tablet_schema); write_row.allocate_memory_for_string_type(tablet_schema); + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); + OlapTuple tuple(val_string_array); + write_row.from_tuple(tuple); write_row.set_not_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -2279,16 +2570,27 @@ TEST_F(TestColumn, SkipDirectVarcharColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); write_row.allocate_memory_for_string_type(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back("YWFhYWE="); //"aaaaa" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple2(val_string_array); 
+ write_row.from_tuple(tuple2); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -2328,18 +2630,29 @@ TEST_F(TestColumn, SeekDirectVarcharColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); write_row.allocate_memory_for_string_type(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); _column_writer->create_row_index_entry(); val_string_array.clear(); val_string_array.push_back("YWFhYWE="); //"aaaaa" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); _column_writer->create_row_index_entry(); @@ -2399,18 +2712,29 @@ TEST_F(TestColumn, SeekDirectVarcharColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); write_row.allocate_memory_for_string_type(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("YWJjZGU="); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); _column_writer->create_row_index_entry(); val_string_array.clear(); val_string_array.push_back("YWFhYWE="); //"aaaaa" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); _column_writer->create_row_index_entry(); @@ -2470,19 +2794,32 @@ TEST_F(TestColumn, VectorizedStringColumnWithoutPresent) { RowCursor write_row; write_row.init(tablet_schema); write_row.allocate_memory_for_string_type(tablet_schema); - + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back("abcde"); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + 
block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); } val_string_array.clear(); val_string_array.push_back("edcba"); //"edcba" base_64_encode is "ZWRjYmE=" - write_row.from_string(val_string_array); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); } ColumnDataHeaderMessage header; @@ -2531,14 +2868,23 @@ TEST_F(TestColumn, VectorizedStringColumnWithPresent) { RowCursor write_row; write_row.init(tablet_schema); write_row.allocate_memory_for_string_type(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + write_row.set_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(0, write_row); std::vector val_string_array; val_string_array.push_back("abcde"); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); + OlapTuple tuple(val_string_array); + write_row.from_tuple(tuple); write_row.set_not_null(0); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + block.set_row(1, write_row); + block.finalize(2); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -2580,27 +2926,51 @@ TEST_F(TestColumn, VectorizedStringColumnWithoutoutPresent2) { RowCursor write_row; write_row.init(tablet_schema); write_row.allocate_memory_for_string_type(tablet_schema); + + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); std::vector val_string_array; val_string_array.push_back("abcde"); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + write_row.from_tuple(tuple1); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); + val_string_array.clear(); val_string_array.push_back("aaaaa"); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); + val_string_array.clear(); val_string_array.push_back("bbbbb"); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple3(val_string_array); + write_row.from_tuple(tuple3); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); + val_string_array.clear(); val_string_array.push_back("ccccc"); //"abcde" base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple4(val_string_array); + write_row.from_tuple(tuple4); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); + val_string_array.clear(); val_string_array.push_back("ddddd"); //"abcde" 
base_64_encode is "YWJjZGU=" - write_row.from_string(val_string_array); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple5(val_string_array); + write_row.from_tuple(tuple5); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); @@ -2651,17 +3021,28 @@ TEST_F(TestColumn, VectorizedDirectVarcharColumnWith65533) { write_row.init(tablet_schema); write_row.allocate_memory_for_string_type(tablet_schema); + RowBlock block(tablet_schema); + RowBlockInfo block_info; + block_info.row_num = 10000; + block.init(block_info); + std::vector val_string_array; val_string_array.push_back(std::string(65533, 'a')); - ASSERT_EQ(OLAP_SUCCESS, write_row.from_string(val_string_array)); - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); + OlapTuple tuple1(val_string_array); + ASSERT_EQ(OLAP_SUCCESS, write_row.from_tuple(tuple1)); + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); val_string_array.clear(); val_string_array.push_back("edcba"); //"edcba" base_64_encode is "ZWRjYmE=" - write_row.from_string(val_string_array); + OlapTuple tuple2(val_string_array); + write_row.from_tuple(tuple2); for (uint32_t i = 0; i < 2; i++) { - ASSERT_EQ(_column_writer->write(&write_row), OLAP_SUCCESS); - } + block.set_row(0, write_row); + block.finalize(1); + ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); + } ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); diff --git a/be/test/olap/command_executor_test.cpp b/be/test/olap/command_executor_test.cpp deleted file mode 100644 index cfc2d73ba2..0000000000 --- a/be/test/olap/command_executor_test.cpp +++ /dev/null @@ -1,1849 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "olap/command_executor.h" -#include "olap/field.h" -#include "olap/olap_engine.h" -#include "olap/olap_main.cpp" -#include "olap/olap_table.h" -#include "olap/utils.h" -#include "util/logging.h" - -using std::nothrow; -using std::stringstream; - -namespace palo { - -// SQL for generate BASE_TABLE_PUSH_DATA & BASE_TABLE_PUSH_DATA_BIG & ROLLUP_TABLE_PUSH_DATA: -// -// create table delete_test_row (k1 tinyint, k2 bigint, k3 char(64), -// k6 DECIMAL, v bigint sum) engine=olap distributed by -// random buckets 1 properties ("storage_type" = "row", "short_key" = "2"); -// -// alter table delete_test_row add rollup delete_test_row_rollup(k1, k3, v); -// -// load label label1 (data infile -// ("hdfs://host:port/dir") -// into table `delete_test_row` (k1,k2,v,k3,k4,k5,k6)); -// -// load label label2 (data infile -// ("hdfs://host:port/dir") -// into table `delete_test_row` (k1,k2,v,k3,k4,k5,k6)); -static const int64_t BASE_TABLE_PUSH_DATA_ROW_COUNT = 100; -static const char* BASE_TABLE_PUSH_DATA = "./be/test/olap/test_data/all_types_100"; -static const int64_t BASE_TABLE_PUSH_DATA_BIG_ROW_COUNT = 100000; -static const char* BASE_TABLE_PUSH_DATA_BIG = "./be/test/olap/test_data/all_types_100000"; -static const int64_t ROLLUP_TABLE_PUSH_DATA_ROW_COUNT = 100; -static const char* ROLLUP_TABLE_PUSH_DATA = "./be/test/olap/test_data/all_types_100_rollup"; - -// checksum for base table push data -static const uint32_t MAX_RETRY_TIMES = 10; -static const uint32_t BASE_TABLE_PUSH_DATA_CHECKSUM = 1401759800; - -static const uint32_t MAX_PATH_LEN = 1024; - -void set_up() { - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_test"; - system("rm -rf ./test_run && mkdir -p ./test_run"); - create_dir(config::storage_root_path); - touch_all_singleton(); -} - -void tear_down() { - system("rm -rf ./test_run"); - remove_all_dir(string(getenv("DORIS_HOME")) + UNUSED_PREFIX); -} - -void set_default_create_tablet_request(TCreateTabletReq* request) { - request->tablet_id = 10003; - request->__set_version(1); - request->__set_version_hash(0); - request->tablet_schema.schema_hash = 270068375; - request->tablet_schema.short_key_column_count = 2; - request->tablet_schema.keys_type = TKeysType::AGG_KEYS; - request->tablet_schema.storage_type = TStorageType::ROW; - - TColumn k1; - k1.column_name = "k1"; - k1.__set_is_key(true); - k1.column_type.type = TPrimitiveType::TINYINT; - request->tablet_schema.columns.push_back(k1); - - TColumn k2; - k2.column_name = "k2"; - k2.__set_is_key(true); - k2.column_type.type = TPrimitiveType::BIGINT; - request->tablet_schema.columns.push_back(k2); - - TColumn k3; - k3.column_name = "k3"; - k3.__set_is_key(true); - k3.column_type.type = TPrimitiveType::CHAR; - k3.column_type.__set_len(64); - request->tablet_schema.columns.push_back(k3); - - TColumn k6; - k6.column_name = "k6"; - k6.__set_is_key(true); - k6.column_type.type = TPrimitiveType::DECIMAL; - k6.column_type.__set_precision(6); - k6.column_type.__set_scale(3); - request->tablet_schema.columns.push_back(k6); - - TColumn v; - v.column_name = "v"; - v.__set_is_key(false); - v.column_type.type = TPrimitiveType::BIGINT; - v.__set_aggregation_type(TAggregationType::SUM); - request->tablet_schema.columns.push_back(v); -} - -void set_bloom_filter_create_tablet_request(TCreateTabletReq* request) { - request->tablet_id = 10004; - 
request->__set_version(1); - request->__set_version_hash(0); - request->tablet_schema.schema_hash = 270076533; - request->tablet_schema.short_key_column_count = 2; - request->tablet_schema.keys_type = TKeysType::AGG_KEYS; - request->tablet_schema.storage_type = TStorageType::COLUMN; - - TColumn k1; - k1.column_name = "k1"; - k1.__set_is_key(true); - k1.column_type.type = TPrimitiveType::TINYINT; - request->tablet_schema.columns.push_back(k1); - - TColumn k2; - k2.column_name = "k2"; - k2.__set_is_key(true); - k2.column_type.type = TPrimitiveType::BIGINT; - request->tablet_schema.columns.push_back(k2); - - TColumn k3; - k3.column_name = "k3"; - k3.__set_is_key(true); - k3.column_type.type = TPrimitiveType::CHAR; - k3.column_type.__set_len(64); - request->tablet_schema.columns.push_back(k3); - - TColumn k6; - k6.column_name = "k6"; - k6.__set_is_key(true); - k6.column_type.type = TPrimitiveType::DECIMAL; - k6.column_type.__set_precision(6); - k6.column_type.__set_scale(3); - request->tablet_schema.columns.push_back(k6); - - TColumn v; - v.column_name = "v"; - v.__set_is_key(false); - v.column_type.type = TPrimitiveType::BIGINT; - v.__set_aggregation_type(TAggregationType::SUM); - request->tablet_schema.columns.push_back(v); -} - -void set_default_push_request(const TCreateTabletReq& request, TPushReq* push_request) { - push_request->tablet_id = request.tablet_id; - push_request->schema_hash = request.tablet_schema.schema_hash; - push_request->__set_version(request.version + 1); - push_request->__set_version_hash(request.version_hash + 1); - push_request->timeout = 86400; - push_request->push_type = TPushType::LOAD; - push_request->__set_http_file_path(BASE_TABLE_PUSH_DATA); -} - -void set_alter_tablet_request(const TCreateTabletReq& base_tablet, TAlterTabletReq* request) { - request->base_tablet_id = base_tablet.tablet_id; - request->base_schema_hash = base_tablet.tablet_schema.schema_hash; -} - -class TestCreateTable : public ::testing::Test { -public: - TestCreateTable() : _command_executor(NULL) {} - ~TestCreateTable() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_create_table"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestCreateTable, create_tablet) { - OLAPStatus res = OLAP_SUCCESS; - - // 1. Create table with error param. - TCreateTabletReq request; - set_default_create_tablet_request(&request); - request.tablet_schema.short_key_column_count = 5; - - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_ERR_CE_CMD_PARAMS_ERROR, res); - - // 2. Create table normally. 
- request.tablet_schema.short_key_column_count = 2; - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // check create table result - SmartOLAPTable tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - ASSERT_EQ(0, access(tablet->header_file_name().c_str(), F_OK)); - - Version base_version(0, request.version); - string index_name = tablet->construct_index_file_path(base_version, request.version_hash, 0); - string data_name = tablet->construct_data_file_path(base_version, request.version_hash, 0); - ASSERT_EQ(0, access(index_name.c_str(), F_OK)); - ASSERT_EQ(0, access(data_name.c_str(), F_OK)); - - Version delta_version(request.version + 1, request.version + 1); - index_name = tablet->construct_index_file_path(delta_version, 0, 0); - data_name = tablet->construct_data_file_path(delta_version, 0, 0); - ASSERT_EQ(0, access(index_name.c_str(), F_OK)); - ASSERT_EQ(0, access(data_name.c_str(), F_OK)); - - // 3. Create table already existed. - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // 4. Create table with different schema_hash. - request.tablet_schema.schema_hash = 0; - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_ERR_CE_TABLET_ID_EXIST, res); -} - -TEST_F(TestCreateTable, column_create_tablet) { - OLAPStatus res = OLAP_SUCCESS; - TCreateTabletReq request; - set_default_create_tablet_request(&request); - request.tablet_id += 1; - request.tablet_schema.schema_hash += 1; - request.tablet_schema.storage_type = TStorageType::COLUMN; - - res = _command_executor->create_table(request); - - // check create table result - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - ASSERT_EQ(0, access(tablet->header_file_name().c_str(), F_OK)); - - Version base_version(0, request.version); - string index_name = tablet->construct_index_file_path(base_version, request.version_hash, 0); - string data_name = tablet->construct_data_file_path(base_version, request.version_hash, 0); - ASSERT_EQ(0, access(index_name.c_str(), F_OK)); - ASSERT_EQ(0, access(data_name.c_str(), F_OK)); - - Version delta_version(request.version + 1, request.version + 1); - index_name = tablet->construct_index_file_path(delta_version, 0, 0); - data_name = tablet->construct_data_file_path(delta_version, 0, 0); - ASSERT_EQ(0, access(index_name.c_str(), F_OK)); - ASSERT_EQ(0, access(data_name.c_str(), F_OK)); -} - -class TestGetTable : public ::testing::Test { -public: - TestGetTable() : _command_executor(NULL) {} - ~TestGetTable() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_get_table"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. 
- ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestGetTable, get_table) { - SmartOLAPTable tablet; - OLAPStatus res = OLAP_SUCCESS; - TCreateTabletReq request; - set_default_create_tablet_request(&request); - - // 1. Get table not existed. - tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() == NULL); - - // 2. Get table normally. - // create table first - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // get table - tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); -} - -class TestDropTable : public ::testing::Test { -public: - TestDropTable() : _command_executor(NULL) {} - ~TestDropTable() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_drop_table"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestDropTable, drop_table) { - SmartOLAPTable tablet; - OLAPStatus res = OLAP_SUCCESS; - - TCreateTabletReq request; - set_default_create_tablet_request(&request); - - TDropTabletReq drop_request; - drop_request.tablet_id = request.tablet_id; - drop_request.schema_hash = request.tablet_schema.schema_hash; - - // 1. Drop table not existed. - res = _command_executor->drop_table(drop_request); - ASSERT_EQ(OLAP_SUCCESS, res); - double usage = -1; - ASSERT_EQ(OLAPEngine::get_instance()->start_trash_sweep(&usage), OLAP_SUCCESS); - - // 2. Drop table normally. - // create table first - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - res = _command_executor->drop_table(drop_request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // 3. Clear trash - // check dropped table exist after trash scan - std::set dirs; - ASSERT_EQ(OLAPEngine::get_instance()->start_trash_sweep(&usage), OLAP_SUCCESS); - dir_walk(config::storage_root_path + TRASH_PREFIX, &dirs, nullptr); - OLAP_LOG_INFO("max disk usage is: %f", usage); - const double guard_usage = config::disk_capacity_insufficient_percentage / 100.0; - ASSERT_TRUE(usage > guard_usage ? dirs.empty() : !dirs.empty()); - // check dirs really removed after timeout - config::trash_file_expire_time_sec = 1; - sleep(2); // wait for timeout - ASSERT_EQ(OLAPEngine::get_instance()->start_trash_sweep(&usage), OLAP_SUCCESS); - ASSERT_TRUE(0 <= usage && usage <= 100); - dirs.clear(); - dir_walk(config::storage_root_path + TRASH_PREFIX, &dirs, nullptr); - ASSERT_TRUE(dirs.empty()); -} - -class TestReportTablet : public ::testing::Test { -public: - TestReportTablet() : _command_executor(NULL) {} - ~TestReportTablet() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. 
- char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_report"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestReportTablet, report_tablet_info) { - OLAPStatus res = OLAP_SUCCESS; - TCreateTabletReq request; - set_default_create_tablet_request(&request); - - TTabletInfo tablet_info; - tablet_info.tablet_id = request.tablet_id; - tablet_info.schema_hash = request.tablet_schema.schema_hash; - - // 1. Report tablet info not existed. - res = _command_executor->report_tablet_info(&tablet_info); - ASSERT_EQ(OLAP_ERR_TABLE_NOT_FOUND, res); - - // 2. Report tablet info normally. - // create tablet first - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // report tablet info and check. - res = _command_executor->report_tablet_info(&tablet_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(request.version + 1, tablet_info.version); - ASSERT_EQ(request.version_hash, tablet_info.version_hash); - ASSERT_EQ(0, tablet_info.row_count); -} - -class TestReportAllTablets : public ::testing::Test { -public: - TestReportAllTablets() : _command_executor(NULL) {} - ~TestReportAllTablets() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_report_all"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestReportAllTablets, report_all_tablets_info) { - OLAPStatus res = OLAP_SUCCESS; - std::map tablets_info; - - // 1. Report empty tablet. - tablets_info.clear(); - res = _command_executor->report_all_tablets_info(&tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(0, tablets_info.size()); - - // 2. Report one tablets. - // create default tablet. - TCreateTabletReq request; - set_default_create_tablet_request(&request); - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - tablets_info.clear(); - res = _command_executor->report_all_tablets_info(&tablets_info); - - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - ASSERT_EQ(1, tablets_info[request.tablet_id].tablet_infos.size()); - ASSERT_EQ(request.tablet_id, tablets_info[request.tablet_id].tablet_infos[0].tablet_id); - - // 3. Report two tablets. - // create another tablet. 
- request.tablet_id = request.tablet_id + 1; - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - tablets_info.clear(); - res = _command_executor->report_all_tablets_info(&tablets_info); - - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(2, tablets_info.size()); - ASSERT_EQ(1, tablets_info[request.tablet_id].tablet_infos.size()); - ASSERT_EQ(request.tablet_id, tablets_info[request.tablet_id].tablet_infos[0].tablet_id); -} - -class TestReloadRootPath : public ::testing::Test { -public: - TestReloadRootPath() : _command_executor(NULL) {} - ~TestReloadRootPath() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_reload"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestReloadRootPath, reload_root_path) { - string root_path; - OLAPStatus res = OLAP_SUCCESS; - - // create table in current root path - TCreateTabletReq request; - set_default_create_tablet_request(&request); - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - // 1. Reload empty root path - root_path = ";"; - res = _command_executor->reload_root_path(root_path); - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, res); - - // 2. Reload new root path not existed - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - root_path = string(buffer) + "/test_run/data_reload_new"; - res = _command_executor->reload_root_path(root_path); - ASSERT_EQ(OLAP_ERR_INPUT_PARAMETER_ERROR, res); - tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - // 3. Reload new root path normally - remove_all_dir(root_path); - ASSERT_EQ(create_dir(root_path), OLAP_SUCCESS); - res = _command_executor->reload_root_path(root_path); - ASSERT_EQ(OLAP_SUCCESS, res); - tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() == NULL); - - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - // 3. Reload same root path - res = _command_executor->reload_root_path(root_path); - ASSERT_EQ(OLAP_SUCCESS, res); - tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(root_path)); -} - -class TestGetRootPathInfo : public ::testing::Test { -public: - TestGetRootPathInfo() : _command_executor(NULL) {} - ~TestGetRootPathInfo() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. 
- char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_root_path_stat"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestGetRootPathInfo, get_all_root_path_info) { - OLAPStatus res = OLAP_SUCCESS; - std::vector root_paths_info; - - res = _command_executor->get_all_root_path_info(&root_paths_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, root_paths_info.size()); - EXPECT_STREQ(config::storage_root_path.c_str(), root_paths_info[0].path.c_str()); -} - -class TestPush : public ::testing::Test { -public: - TestPush() : _command_executor(NULL) {} - ~TestPush() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_push"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestPush, push) { - OLAPStatus res = OLAP_SUCCESS; - TCreateTabletReq request; - set_default_create_tablet_request(&request); - TPushReq push_req; - set_default_push_request(request, &push_req); - std::vector tablets_info; - - // 1. Push before tablet created. - tablets_info.clear(); - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_ERR_TABLE_NOT_FOUND, res); - ASSERT_EQ(0, tablets_info.size()); - - // create tablet first - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - // 2. Push with wrong version. - push_req.version = 0; - tablets_info.clear(); - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_ERR_PUSH_VERSION_INCORRECT, res); - ASSERT_EQ(0, tablets_info.size()); - - // 3. Push next version normally. - push_req.version = request.version + 1; - tablets_info.clear(); - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - ASSERT_EQ(push_req.tablet_id, tablets_info[0].tablet_id); - ASSERT_EQ(push_req.schema_hash, tablets_info[0].schema_hash); - ASSERT_EQ(push_req.version, tablets_info[0].version); - ASSERT_EQ(push_req.version_hash, tablets_info[0].version_hash); - ASSERT_EQ(BASE_TABLE_PUSH_DATA_ROW_COUNT, tablets_info[0].row_count); - - // 4. Push the same batch. 
- tablets_info.clear(); - int64_t row_count = BASE_TABLE_PUSH_DATA_ROW_COUNT; - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - ASSERT_EQ(push_req.tablet_id, tablets_info[0].tablet_id); - ASSERT_EQ(push_req.schema_hash, tablets_info[0].schema_hash); - ASSERT_EQ(push_req.version, tablets_info[0].version); - ASSERT_EQ(push_req.version_hash, tablets_info[0].version_hash); - ASSERT_EQ(row_count, tablets_info[0].row_count); - - // 5. Push the next batch. - push_req.version += 1; - push_req.version_hash += 1; - tablets_info.clear(); - row_count += BASE_TABLE_PUSH_DATA_ROW_COUNT; - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - ASSERT_EQ(push_req.tablet_id, tablets_info[0].tablet_id); - ASSERT_EQ(push_req.schema_hash, tablets_info[0].schema_hash); - ASSERT_EQ(push_req.version, tablets_info[0].version); - ASSERT_EQ(push_req.version_hash, tablets_info[0].version_hash); - ASSERT_EQ(row_count, tablets_info[0].row_count); -} - -TEST_F(TestPush, column_push) { - OLAPStatus res = OLAP_SUCCESS; - TCreateTabletReq request; - set_default_create_tablet_request(&request); - std::vector tablets_info; - - // create tablet first - request.tablet_id += 1; - request.tablet_schema.schema_hash += 1; - request.tablet_schema.storage_type = TStorageType::COLUMN; - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - tablets_info.clear(); - TPushReq push_req; - set_default_push_request(request, &push_req); - push_req.tablet_id = request.tablet_id; - push_req.schema_hash = request.tablet_schema.schema_hash; - push_req.__set_http_file_path(BASE_TABLE_PUSH_DATA_BIG); - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - ASSERT_EQ(push_req.tablet_id, tablets_info[0].tablet_id); - ASSERT_EQ(push_req.schema_hash, tablets_info[0].schema_hash); - ASSERT_EQ(push_req.version, tablets_info[0].version); - ASSERT_EQ(push_req.version_hash, tablets_info[0].version_hash); - ASSERT_EQ(BASE_TABLE_PUSH_DATA_BIG_ROW_COUNT, tablets_info[0].row_count); -} - -class TestComputeChecksum : public ::testing::Test { -public: - TestComputeChecksum() : _command_executor(NULL) {} - ~TestComputeChecksum() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_compute_checksum"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestComputeChecksum, compute_checksum) { - uint32_t checksum = 0; - OLAPStatus res = OLAP_SUCCESS; - - TCreateTabletReq request; - set_default_create_tablet_request(&request); - - TPushReq push_req; - set_default_push_request(request, &push_req); - std::vector tablets_info; - - // 1. 
Compute checksum before tablet created. - res = _command_executor->compute_checksum( - request.tablet_id, request.tablet_schema.schema_hash, - request.version, request.version_hash, &checksum); - ASSERT_EQ(OLAP_ERR_TABLE_NOT_FOUND, res); - - // 2. Compute checksum for empty tablet. - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - res = _command_executor->compute_checksum( - request.tablet_id, request.tablet_schema.schema_hash, - request.version, request.version_hash, &checksum); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(0, checksum); - - // 3. Compute checksum normally. - tablets_info.clear(); - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - - res = _command_executor->compute_checksum( - push_req.tablet_id, push_req.schema_hash, - push_req.version, push_req.version_hash, &checksum); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(BASE_TABLE_PUSH_DATA_CHECKSUM, checksum); -} - -class TestBaseCompaction : public ::testing::Test { -public: - TestBaseCompaction() : _command_executor(NULL) {} - ~TestBaseCompaction() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_base_compaction"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestBaseCompaction, TestBaseCompaction) { - OLAPStatus res = OLAP_SUCCESS; - TCreateTabletReq request; - set_default_create_tablet_request(&request); - - TPushReq push_req; - set_default_push_request(request, &push_req); - std::vector tablets_info; - - // 1. Start BE before tablet created. - res = _command_executor->base_compaction( - push_req.tablet_id, push_req.schema_hash, push_req.version); - ASSERT_EQ(OLAP_ERR_TABLE_NOT_FOUND, res); - - // 2. Start BE for error new base version. 
- res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - res = _command_executor->base_compaction( - request.tablet_id, request.tablet_schema.schema_hash, request.version + 1); - ASSERT_EQ(OLAP_ERR_BE_NO_SUITABLE_VERSION, res); -} - -// ######################### ALTER TABLE TEST BEGIN ######################### - -void set_create_tablet_request_1(const TCreateTabletReq& base_request, TCreateTabletReq* request) { - //sorting schema change - request->tablet_id = base_request.tablet_id + 1; - request->__set_version(base_request.version); - request->__set_version_hash(base_request.version_hash); - request->tablet_schema.schema_hash = base_request.tablet_schema.schema_hash + 1; - request->tablet_schema.short_key_column_count = 2; - request->tablet_schema.storage_type = TStorageType::ROW; - - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[0]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[2]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[4]); -} - -void set_create_tablet_request_2(const TCreateTabletReq& base_request, TCreateTabletReq* request) { - //directly schema change - request->tablet_id = base_request.tablet_id + 2; - request->__set_version(base_request.version); - request->__set_version_hash(base_request.version_hash); - request->tablet_schema.schema_hash = base_request.tablet_schema.schema_hash + 2; - request->tablet_schema.short_key_column_count = 1; - request->tablet_schema.storage_type = TStorageType::COLUMN; - - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[0]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[4]); -} - -void set_create_tablet_request_3(const TCreateTabletReq& base_request, TCreateTabletReq* request) { - //linked schema change, add a value column - request->tablet_id = base_request.tablet_id + 3; - request->__set_version(base_request.version); - request->__set_version_hash(base_request.version_hash); - request->tablet_schema.schema_hash = base_request.tablet_schema.schema_hash + 3; - request->tablet_schema.short_key_column_count = 1; - request->tablet_schema.storage_type = TStorageType::COLUMN; - - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[0]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[4]); - - TColumn v2; - v2.column_name = "v2"; - v2.column_type.type = TPrimitiveType::BIGINT; - v2.__set_is_key(false); - v2.__set_default_value("0"); - v2.__set_aggregation_type(TAggregationType::SUM); - request->tablet_schema.columns.push_back(v2); -} - -void set_create_tablet_request_4(const TCreateTabletReq& base_request, TCreateTabletReq* request) { - //directly schema change, modify a key column type - request->tablet_id = base_request.tablet_id + 4; - request->__set_version(base_request.version); - request->__set_version_hash(base_request.version_hash); - request->tablet_schema.schema_hash = base_request.tablet_schema.schema_hash + 1; - request->tablet_schema.short_key_column_count = 1; - request->tablet_schema.storage_type = TStorageType::COLUMN; - - TColumn k1; - k1.column_name = "k1"; - k1.__set_is_key(true); - k1.column_type.type = TPrimitiveType::LARGEINT; - request->tablet_schema.columns.push_back(k1); - - 
request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[1]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[2]); -} - -void set_create_tablet_request_bloom_filter( - const TCreateTabletReq& base_request, TCreateTabletReq* request) { - //sorting schema change - request->tablet_id = base_request.tablet_id + 5; - request->__set_version(base_request.version); - request->__set_version_hash(base_request.version_hash); - request->tablet_schema.schema_hash = base_request.tablet_schema.schema_hash + 1; - request->tablet_schema.short_key_column_count = 4; - request->tablet_schema.storage_type = TStorageType::COLUMN; - - TColumn k3; - k3.column_name = "k3"; - k3.__set_is_key(true); - k3.column_type.type = TPrimitiveType::CHAR; - k3.column_type.__set_len(64); - k3.is_bloom_filter_column = true; - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[0]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[1]); - request->tablet_schema.columns.push_back(k3); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[3]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[4]); -} - -void set_create_tablet_request_char_to_varchar( - const TCreateTabletReq& base_request, TCreateTabletReq* request) { - //sorting schema change - request->tablet_id = base_request.tablet_id + 6; - request->__set_version(base_request.version); - request->__set_version_hash(base_request.version_hash); - request->tablet_schema.schema_hash = base_request.tablet_schema.schema_hash + 1; - request->tablet_schema.short_key_column_count = 4; - request->tablet_schema.storage_type = TStorageType::COLUMN; - - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[0]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[1]); - TColumn k3; - k3.column_name = "k3"; - k3.__set_is_key(true); - k3.column_type.type = TPrimitiveType::VARCHAR; - k3.column_type.__set_len(128); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[2]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[3]); - request->tablet_schema.columns.push_back(base_request.tablet_schema.columns[4]); -} - -AlterTableStatus show_alter_table_status( - CommandExecutor* command_executor, - const TAlterTabletReq& request) { - AlterTableStatus status = ALTER_TABLE_RUNNING; - - uint32_t max_retry = MAX_RETRY_TIMES; - while (max_retry > 0) { - status = command_executor->show_alter_table_status( - request.base_tablet_id, request.base_schema_hash); - if (status != ALTER_TABLE_RUNNING) { - break; - } - - OLAP_LOG_INFO("doing alter table......"); - --max_retry; - sleep(1); - } - - return status; -} - -class TestSchemaChange : public ::testing::Test { -public: - TestSchemaChange() : _command_executor(NULL) {} - ~TestSchemaChange() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_schema_change"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. 
- ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestSchemaChange, tablet_schema_change_abnormal) { - OLAPStatus res = OLAP_SUCCESS; - - // check not existed tablet id - TCreateTabletReq create_base_tablet; - set_default_create_tablet_request(&create_base_tablet); - TCreateTabletReq create_new_tablet; - set_create_tablet_request_1(create_base_tablet, &create_new_tablet); - - TAlterTabletReq request; - set_alter_tablet_request(create_base_tablet, &request); - request.__set_new_tablet_req(create_new_tablet); - - res = _command_executor->schema_change(request); - ASSERT_EQ(OLAP_ERR_TRY_LOCK_FAILED, res); - - AlterTableStatus status = _command_executor->show_alter_table_status( - request.base_tablet_id, request.base_schema_hash); - ASSERT_EQ(ALTER_TABLE_FAILED, status); -} - -TEST_F(TestSchemaChange, schema_change_bloom_filter) { - OLAPStatus res = OLAP_SUCCESS; - - TCreateTabletReq create_base_tablet; - set_bloom_filter_create_tablet_request(&create_base_tablet); - res = _command_executor->create_table(create_base_tablet); - ASSERT_EQ(OLAP_SUCCESS, res); - - TCreateTabletReq create_new_tablet; - set_create_tablet_request_bloom_filter(create_base_tablet, &create_new_tablet); - - TAlterTabletReq request; - set_alter_tablet_request(create_base_tablet, &request); - request.__set_new_tablet_req(create_new_tablet); - - res = _command_executor->schema_change(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - AlterTableStatus status = ALTER_TABLE_WAITING; - status = show_alter_table_status(_command_executor, request); - ASSERT_EQ(ALTER_TABLE_DONE, status); - - res = OLAPEngine::get_instance()->drop_table( - request.base_tablet_id, request.base_schema_hash); - ASSERT_EQ(OLAP_SUCCESS, res); -} - -TEST_F(TestSchemaChange, schema_change_char_to_varchar) { - OLAPStatus res = OLAP_SUCCESS; - AlterTableStatus status = ALTER_TABLE_WAITING; - - // 1. Prepare for schema change. - // create base tablet - TCreateTabletReq create_base_tablet; - set_default_create_tablet_request(&create_base_tablet); - res = _command_executor->create_table(create_base_tablet); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - create_base_tablet.tablet_id, create_base_tablet.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - // push data - TPushReq push_req; - set_default_push_request(create_base_tablet, &push_req); - std::vector tablets_info; - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - - // set schema change request - TCreateTabletReq create_new_tablet; - set_create_tablet_request_char_to_varchar(create_base_tablet, &create_new_tablet); - TAlterTabletReq request; - set_alter_tablet_request(create_base_tablet, &request); - request.__set_new_tablet_req(create_new_tablet); - - // 2. Submit schema change. - request.base_schema_hash = create_base_tablet.tablet_schema.schema_hash; - res = _command_executor->schema_change(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // 3. Verify schema change result. 
- // show schema change status - status = show_alter_table_status(_command_executor, request); - ASSERT_EQ(ALTER_TABLE_DONE, status); - - // check new tablet information - TTabletInfo tablet_info; - tablet_info.tablet_id = create_new_tablet.tablet_id; - tablet_info.schema_hash = create_new_tablet.tablet_schema.schema_hash; - res = _command_executor->report_tablet_info(&tablet_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(push_req.version, tablet_info.version); - ASSERT_EQ(push_req.version_hash, tablet_info.version_hash); - ASSERT_EQ(BASE_TABLE_PUSH_DATA_ROW_COUNT, tablet_info.row_count); - - // 4. Retry the same schema change request. - res = _command_executor->schema_change(request); - ASSERT_EQ(OLAP_SUCCESS, res); - status = _command_executor->show_alter_table_status( - request.base_tablet_id, request.base_schema_hash); - ASSERT_EQ(ALTER_TABLE_DONE, status); - - // 5. Do schema change continuously. - res = OLAPEngine::get_instance()->drop_table( - request.base_tablet_id, request.base_schema_hash); - ASSERT_EQ(OLAP_SUCCESS, res); -} - -TEST_F(TestSchemaChange, schema_change) { - OLAPStatus res = OLAP_SUCCESS; - AlterTableStatus status = ALTER_TABLE_WAITING; - - // 1. Prepare for schema change. - // create base tablet - TCreateTabletReq create_base_tablet; - set_default_create_tablet_request(&create_base_tablet); - res = _command_executor->create_table(create_base_tablet); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - create_base_tablet.tablet_id, create_base_tablet.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - // push data - TPushReq push_req; - set_default_push_request(create_base_tablet, &push_req); - std::vector tablets_info; - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - - // set schema change request - TCreateTabletReq create_new_tablet; - set_create_tablet_request_1(create_base_tablet, &create_new_tablet); - TAlterTabletReq request; - set_alter_tablet_request(create_base_tablet, &request); - request.__set_new_tablet_req(create_new_tablet); - - // check schema change for non_existed tablet - request.base_schema_hash = 0; - res = _command_executor->schema_change(request); - ASSERT_EQ(OLAP_ERR_TABLE_NOT_FOUND, res); - - // 2. Submit schema change. - request.base_schema_hash = create_base_tablet.tablet_schema.schema_hash; - res = _command_executor->schema_change(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // 3. Verify schema change result. - // show schema change status - status = show_alter_table_status(_command_executor, request); - ASSERT_EQ(ALTER_TABLE_DONE, status); - - // check new tablet information - TTabletInfo tablet_info; - tablet_info.tablet_id = create_new_tablet.tablet_id; - tablet_info.schema_hash = create_new_tablet.tablet_schema.schema_hash; - res = _command_executor->report_tablet_info(&tablet_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(push_req.version, tablet_info.version); - ASSERT_EQ(push_req.version_hash, tablet_info.version_hash); - ASSERT_EQ(BASE_TABLE_PUSH_DATA_ROW_COUNT, tablet_info.row_count); - - // 4. Retry the same schema change request. - res = _command_executor->schema_change(request); - ASSERT_EQ(OLAP_SUCCESS, res); - status = _command_executor->show_alter_table_status( - request.base_tablet_id, request.base_schema_hash); - ASSERT_EQ(ALTER_TABLE_DONE, status); - - // 5. Do schema change continuously. 
- res = OLAPEngine::get_instance()->drop_table( - request.base_tablet_id, request.base_schema_hash); - ASSERT_EQ(OLAP_SUCCESS, res); - - TCreateTabletReq create_new_new_tablet; - set_create_tablet_request_2(create_base_tablet, &create_new_new_tablet); - TAlterTabletReq new_request; - set_alter_tablet_request(create_new_tablet, &new_request); - new_request.__set_new_tablet_req(create_new_new_tablet); - - res = _command_executor->schema_change(new_request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // show alter table status - status = show_alter_table_status(_command_executor, new_request); - ASSERT_EQ(ALTER_TABLE_DONE, status); - - res = OLAPEngine::get_instance()->drop_table( - new_request.base_tablet_id, new_request.base_schema_hash); - - // check new tablet information - tablet_info.tablet_id = create_new_new_tablet.tablet_id; - tablet_info.schema_hash = create_new_new_tablet.tablet_schema.schema_hash; - res = _command_executor->report_tablet_info(&tablet_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(push_req.version, tablet_info.version); - ASSERT_EQ(push_req.version_hash, tablet_info.version_hash); - ASSERT_EQ(100, tablet_info.row_count); - - //schema change, add a value column - TCreateTabletReq create_new_tablet3; - set_create_tablet_request_3(create_base_tablet, &create_new_tablet3); - TAlterTabletReq request3; - set_alter_tablet_request(create_new_new_tablet, &request3); - request3.__set_new_tablet_req(create_new_tablet3); - - res = _command_executor->schema_change(request3); - ASSERT_EQ(OLAP_SUCCESS, res); - - // show alter table status - status = show_alter_table_status(_command_executor, request3); - ASSERT_EQ(ALTER_TABLE_DONE, status); - - res = OLAPEngine::get_instance()->drop_table( - request3.base_tablet_id, request3.base_schema_hash); - - // check new tablet information - tablet_info.tablet_id = create_new_tablet3.tablet_id; - tablet_info.schema_hash = create_new_tablet3.tablet_schema.schema_hash; - res = _command_executor->report_tablet_info(&tablet_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(push_req.version, tablet_info.version); - ASSERT_EQ(push_req.version_hash, tablet_info.version_hash); - ASSERT_EQ(100, tablet_info.row_count); - - //schema change, modify a key column - TCreateTabletReq create_new_tablet4; - set_create_tablet_request_4(create_new_tablet3, &create_new_tablet4); - TAlterTabletReq request4; - set_alter_tablet_request(create_new_tablet3, &request4); - request4.__set_new_tablet_req(create_new_tablet4); - - res = _command_executor->schema_change(request4); - ASSERT_EQ(OLAP_SUCCESS, res); - - // show alter table status - status = show_alter_table_status(_command_executor, request4); - ASSERT_EQ(ALTER_TABLE_DONE, status); - - res = OLAPEngine::get_instance()->drop_table( - request4.base_tablet_id, request4.base_schema_hash); - - // check new tablet information - tablet_info.tablet_id = create_new_tablet4.tablet_id; - tablet_info.schema_hash = create_new_tablet4.tablet_schema.schema_hash; - res = _command_executor->report_tablet_info(&tablet_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(push_req.version, tablet_info.version); - ASSERT_EQ(push_req.version_hash, tablet_info.version_hash); - ASSERT_EQ(100, tablet_info.row_count); -} - -class TestCreateRollupTable : public ::testing::Test { -public: - TestCreateRollupTable() : _command_executor(NULL) {} - ~TestCreateRollupTable() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. 
- char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_create_rollup"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - } - - void TearDown(){ - // Remove all dir. - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; -}; - -TEST_F(TestCreateRollupTable, create_rollup_table) { - OLAPStatus res = OLAP_SUCCESS; - AlterTableStatus status = ALTER_TABLE_WAITING; - - // 1. Prepare for schema change. - // create base tablet - TCreateTabletReq create_base_tablet; - set_default_create_tablet_request(&create_base_tablet); - res = _command_executor->create_table(create_base_tablet); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - create_base_tablet.tablet_id, create_base_tablet.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - - // push data - TPushReq push_req; - set_default_push_request(create_base_tablet, &push_req); - std::vector tablets_info; - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - - // set schema change request - TCreateTabletReq create_new_tablet; - set_create_tablet_request_1(create_base_tablet, &create_new_tablet); - TAlterTabletReq request; - set_alter_tablet_request(create_base_tablet, &request); - request.__set_new_tablet_req(create_new_tablet); - - // 2. Submit schema change. - res = _command_executor->create_rollup_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // 3. Verify schema change result. - // show schema change status - status = show_alter_table_status(_command_executor, request); - ASSERT_EQ(ALTER_TABLE_DONE, status); - - // check new tablet information - int64_t rollup_row_count = BASE_TABLE_PUSH_DATA_ROW_COUNT; - TTabletInfo tablet_info; - tablet_info.tablet_id = create_new_tablet.tablet_id; - tablet_info.schema_hash = create_new_tablet.tablet_schema.schema_hash; - res = _command_executor->report_tablet_info(&tablet_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(push_req.version, tablet_info.version); - ASSERT_EQ(push_req.version_hash, tablet_info.version_hash); - ASSERT_EQ(BASE_TABLE_PUSH_DATA_ROW_COUNT, tablet_info.row_count); - - // 4. Push base tablet. - tablets_info.clear(); - push_req.version += 1; - push_req.version_hash += 1; - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(2, tablets_info.size()); - ASSERT_TRUE(tablet->is_schema_changing() == true); - rollup_row_count += BASE_TABLE_PUSH_DATA_ROW_COUNT; - - // 5. Push rollup tablet. 
- tablets_info.clear(); - push_req.version += 1; - push_req.version_hash += 1; - push_req.tablet_id = create_new_tablet.tablet_id; - push_req.schema_hash = create_new_tablet.tablet_schema.schema_hash; - push_req.__set_http_file_path(ROLLUP_TABLE_PUSH_DATA); - res = _command_executor->push(push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - rollup_row_count += ROLLUP_TABLE_PUSH_DATA_ROW_COUNT; - ASSERT_EQ(rollup_row_count, tablets_info[0].row_count); - ASSERT_TRUE(tablet->is_schema_changing() == false); -} - -// ######################### ALTER TABLE TEST END ######################### - -// ######################### ALTER CLONE BEGIN ######################### - -class TestClone : public ::testing::Test { -public: - TestClone() : _command_executor(NULL) {} - ~TestClone() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_clone"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - - // 1. Prepare for query split key. - // create base tablet - OLAPStatus res = OLAP_SUCCESS; - set_default_create_tablet_request(&_create_tablet); - res = _command_executor->create_table(_create_tablet); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - _header_file_name = tablet->header_file_name(); - - // push data - set_default_push_request(_create_tablet, &_push_req); - std::vector tablets_info; - res = _command_executor->push(_push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - } - - void TearDown(){ - // Remove all dir. 
- OLAPEngine::get_instance()->drop_table( - _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - ASSERT_EQ(access(_header_file_name.c_str(), F_OK), -1); - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; - std::string _header_file_name; - TCreateTabletReq _create_tablet; - TPushReq _push_req; -}; - -TEST_F(TestClone, make_snapshot_abnormal) { - OLAPStatus res = OLAP_SUCCESS; - std::string snapshot_path; - - // check tablet not existed - res = _command_executor->make_snapshot(0, 0, &snapshot_path); - ASSERT_EQ(OLAP_ERR_TABLE_NOT_FOUND, res); - - // check tablet without delta - TCreateTabletReq request = _create_tablet; - request.tablet_id = 0; - request.__isset.version = false; - res = _command_executor->create_table(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - SmartOLAPTable tablet = _command_executor->get_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - std::string header_file_name = tablet->header_file_name(); - - res = _command_executor->make_snapshot( - request.tablet_id, request.tablet_schema.schema_hash, &snapshot_path); - // ASSERT_EQ(OLAP_ERR_VERSION_NOT_EXIST, res); - - // clear - tablet.reset(); - OLAPEngine::get_instance()->drop_table( - request.tablet_id, request.tablet_schema.schema_hash); - ASSERT_EQ(access(header_file_name.c_str(), F_OK), -1); -} - -TEST_F(TestClone, make_snapshot) { - OLAPStatus res = OLAP_SUCCESS; - std::string snapshot_path; - - res = _command_executor->make_snapshot( - _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash, &snapshot_path); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(0, access(snapshot_path.c_str(), F_OK)); - - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(snapshot_path)); -} - -TEST_F(TestClone, release_snapshot_abnormal) { - std::string long_path = "/empty_storage_root/snapshot/path"; - std::string short_path = "/s"; - - ASSERT_EQ(OLAP_ERR_CE_CMD_PARAMS_ERROR, _command_executor->release_snapshot(long_path)); - ASSERT_EQ(OLAP_ERR_CE_CMD_PARAMS_ERROR, _command_executor->release_snapshot(short_path)); -} - -TEST_F(TestClone, release_snapshot) { - OLAPStatus res = OLAP_SUCCESS; - std::string snapshot_path; - - res = _command_executor->make_snapshot( - _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash, &snapshot_path); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(0, access(snapshot_path.c_str(), F_OK)); - - res = _command_executor->release_snapshot(snapshot_path); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(access(snapshot_path.c_str(), F_OK), -1); -} - -TEST_F(TestClone, obtain_root_path_abnormal) { - OLAPStatus res = OLAP_SUCCESS; - std::string root_path; - - OLAPRootPath::get_instance()->clear(); - res = _command_executor->obtain_shard_path(TStorageMedium::HDD, &root_path); - ASSERT_EQ(OLAP_ERR_NO_AVAILABLE_ROOT_PATH, res); - OLAPRootPath::get_instance()->init(); -} - -TEST_F(TestClone, obtain_root_path) { - OLAPStatus res = OLAP_SUCCESS; - std::string root_path; - - res = _command_executor->obtain_shard_path(TStorageMedium::HDD, &root_path); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(config::storage_root_path + DATA_PREFIX + "/0", root_path); -} - -void set_clone_request(const TCreateTabletReq& create_tablet, TCloneReq* request) { - request->tablet_id = create_tablet.tablet_id; - request->schema_hash = create_tablet.tablet_schema.schema_hash; -} - -TEST_F(TestClone, load_header_abnormal) { - OLAPStatus res = OLAP_SUCCESS; - std::string shard_path = config::storage_root_path + 
DATA_PREFIX + "/0"; - - TCloneReq request; - set_clone_request(_create_tablet, &request); - request.tablet_id = 0; - res = _command_executor->load_header(shard_path, request); - ASSERT_EQ(OLAP_ERR_FILE_NOT_EXIST, res); - - request.tablet_id = _create_tablet.tablet_id; - res = _command_executor->load_header(shard_path, request); - ASSERT_EQ(OLAP_SUCCESS, res); -} - -TEST_F(TestClone, load_header) { - OLAPStatus res = OLAP_SUCCESS; - std::string snapshot_path; - std::string root_path; - - // 1. Make snapshot. - res = _command_executor->make_snapshot( - _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash, &snapshot_path); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(0, access(snapshot_path.c_str(), F_OK)); - - // 2. Obtain root path. - res = _command_executor->obtain_shard_path(TStorageMedium::HDD, &root_path); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(config::storage_root_path + DATA_PREFIX + "/0", root_path); - - // 3. Drop the old tablet and copy the snapshot to root_path. - // to avoid delete tablet has same name: .delete.schema_hash.datetime - OLAPEngine::get_instance()->drop_table( - _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - ASSERT_EQ(access(_header_file_name.c_str(), F_OK), -1); - system(("rm -fr " + root_path + "/[^s]*").c_str()); - system(("cp -r " + snapshot_path + "/* " + root_path).c_str()); - - // 4. Load header. - TCloneReq request; - set_clone_request(_create_tablet, &request); - res = _command_executor->load_header(root_path, request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // 5. Release snapshot. - res = _command_executor->release_snapshot(snapshot_path); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(access(snapshot_path.c_str(), F_OK), -1); -} - -// ######################### ALTER CLONE END ######################### - -class TestDeleteData : public ::testing::Test { -public: - TestDeleteData() : _command_executor(NULL) {} - ~TestDeleteData() { - SAFE_DELETE(_command_executor); - } - - void SetUp() { - // Create local data dir for OLAPEngine. - char buffer[MAX_PATH_LEN]; - getcwd(buffer, MAX_PATH_LEN); - config::storage_root_path = string(buffer) + "/test_run/data_delete_data"; - remove_all_dir(config::storage_root_path); - ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); - - // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); - - // 1. Prepare for query split key. - // create base tablet - OLAPStatus res = OLAP_SUCCESS; - set_default_create_tablet_request(&_create_tablet); - res = _command_executor->create_table(_create_tablet); - ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = _command_executor->get_table( - _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - ASSERT_TRUE(tablet.get() != NULL); - _header_file_name = tablet->header_file_name(); - - // push data - set_default_push_request(_create_tablet, &_push_req); - std::vector tablets_info; - res = _command_executor->push(_push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - } - - void TearDown(){ - // Remove all dir. 
- OLAPEngine::get_instance()->drop_table( - _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - while (0 == access(_header_file_name.c_str(), F_OK)) { - sleep(1); - } - ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - } - - CommandExecutor* _command_executor; - std::string _header_file_name; - TCreateTabletReq _create_tablet; - TPushReq _push_req; -}; - -void set_delete_data_condition(TPushReq* request) { - TCondition condition; - condition.column_name = "k1"; - condition.condition_op = "="; - condition.condition_values.clear(); - condition.condition_values.push_back("-120"); - request->delete_conditions.push_back(condition); - - condition.column_name = "k3"; - condition.condition_op = "="; - condition.condition_values.clear(); - condition.condition_values.push_back("ccc1aa42-e403-4964-a065-583f77e7ee98"); - request->delete_conditions.push_back(condition); - - condition.column_name = "k6"; - condition.condition_op = "="; - condition.condition_values.clear(); - condition.condition_values.push_back("89.779"); - request->delete_conditions.push_back(condition); -} - -TEST_F(TestDeleteData, delete_data_abnormal) { - OLAPStatus res = OLAP_SUCCESS; - std::vector tablets_info; - set_delete_data_condition(&_push_req); - - // check non_existed tablet - _push_req.tablet_id = 0; - res = _command_executor->delete_data(_push_req, &tablets_info); - ASSERT_EQ(OLAP_ERR_TABLE_NOT_FOUND, res); - - // check invalid delete version - _push_req.version = 1; - _push_req.tablet_id = _create_tablet.tablet_id; - res = _command_executor->delete_data(_push_req, &tablets_info); - ASSERT_EQ(OLAP_ERR_PUSH_VERSION_INCORRECT, res); - - // check invalid delete condition - _push_req.version += 2; - TCondition condition; - condition.column_name = "k1"; - condition.condition_op = "="; - condition.condition_values.push_back("128"); - _push_req.delete_conditions.push_back(condition); - res = _command_executor->delete_data(_push_req, &tablets_info); - ASSERT_EQ(OLAP_ERR_DELETE_INVALID_CONDITION, res); -} - -TEST_F(TestDeleteData, delete_data) { - OLAPStatus res = OLAP_SUCCESS; - std::vector tablets_info; - - // 1. Submit delete data normally. - tablets_info.clear(); - _push_req.version += 1; - _push_req.version_hash += 1; - _push_req.__isset.http_file_path = false; - set_delete_data_condition(&_push_req); - - res = _command_executor->delete_data(_push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - ASSERT_EQ(_push_req.version, tablets_info[0].version); - ASSERT_EQ(_push_req.version_hash, tablets_info[0].version_hash); - ASSERT_EQ(BASE_TABLE_PUSH_DATA_ROW_COUNT, tablets_info[0].row_count); - - // 2. Submit the the request. 
- tablets_info.clear(); - res = _command_executor->delete_data(_push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - ASSERT_EQ(_push_req.version, tablets_info[0].version); - ASSERT_EQ(_push_req.version_hash, tablets_info[0].version_hash); - ASSERT_EQ(BASE_TABLE_PUSH_DATA_ROW_COUNT, tablets_info[0].row_count); -} - -void set_cancel_delete_data_request( - const TPushReq& push_req, - TCancelDeleteDataReq* request) { - request->tablet_id = push_req.tablet_id; - request->schema_hash = push_req.schema_hash; - request->version = push_req.version; - request->version_hash = push_req.version_hash; -} - -TEST_F(TestDeleteData, cancel_delete_abnormal) { - OLAPStatus res = OLAP_SUCCESS; - - // check non_existed tablet - TCancelDeleteDataReq request; - set_cancel_delete_data_request(_push_req, &request); - request.tablet_id = 0; - res = _command_executor->cancel_delete(request); - // ASSERT_EQ(OLAP_ERR_TABLE_NOT_FOUND, res); - - // check invalid version - request.version = -1; - request.tablet_id = _push_req.tablet_id; - res = _command_executor->cancel_delete(request); - ASSERT_EQ(OLAP_ERR_DELETE_INVALID_PARAMETERS, res); -} - -TEST_F(TestDeleteData, cancel_delete) { - OLAPStatus res = OLAP_SUCCESS; - std::vector tablets_info; - - // 1. Submit delete data first. - tablets_info.clear(); - _push_req.version += 1; - _push_req.version_hash += 1; - _push_req.__isset.http_file_path = false; - set_delete_data_condition(&_push_req); - - res = _command_executor->delete_data(_push_req, &tablets_info); - ASSERT_EQ(OLAP_SUCCESS, res); - ASSERT_EQ(1, tablets_info.size()); - ASSERT_EQ(_push_req.version, tablets_info[0].version); - - // 2. Cancel delete data request. - TCancelDeleteDataReq request; - set_cancel_delete_data_request(_push_req, &request); - res = _command_executor->cancel_delete(request); - ASSERT_EQ(OLAP_SUCCESS, res); - - // 3. Cancel the delete data request again. - res = _command_executor->cancel_delete(request); - ASSERT_EQ(OLAP_SUCCESS, res); -} - -} // namespace palo - -int main(int argc, char** argv) { - std::string conffile = std::string(getenv("DORIS_HOME")) + "/conf/be.conf"; - if (!palo::config::init(conffile.c_str(), false)) { - fprintf(stderr, "error read config file. 
\n"); - return -1; - } - palo::init_glog("be-test"); - int ret = palo::OLAP_SUCCESS; - testing::InitGoogleTest(&argc, argv); - palo::CpuInfo::init(); - - palo::set_up(); - ret = RUN_ALL_TESTS(); - palo::tear_down(); - - google::protobuf::ShutdownProtobufLibrary(); - return ret; -} diff --git a/be/test/olap/delete_handler_test.cpp b/be/test/olap/delete_handler_test.cpp index a333a63e1a..5fdb682e27 100644 --- a/be/test/olap/delete_handler_test.cpp +++ b/be/test/olap/delete_handler_test.cpp @@ -24,13 +24,12 @@ #include #include -#include "olap/command_executor.h" #include "olap/delete_handler.h" #include "olap/olap_define.h" #include "olap/olap_engine.h" -#include "olap/olap_main.cpp" #include "olap/push_handler.h" #include "olap/utils.h" +#include "olap/options.h" #include "util/logging.h" using namespace std; @@ -41,6 +40,7 @@ using google::protobuf::RepeatedPtrField; namespace palo { static const uint32_t MAX_PATH_LEN = 1024; +static OLAPEngine* k_engine = nullptr; void set_up() { char buffer[MAX_PATH_LEN]; @@ -49,7 +49,12 @@ void set_up() { remove_all_dir(config::storage_root_path); remove_all_dir(string(getenv("DORIS_HOME")) + UNUSED_PREFIX); create_dir(config::storage_root_path); - touch_all_singleton(); + std::vector paths; + paths.emplace_back(config::storage_root_path, -1); + + palo::EngineOptions options; + options.store_paths = paths; + palo::OLAPEngine::open(options, &k_engine); } void tear_down() { @@ -67,7 +72,7 @@ void set_default_create_tablet_request(TCreateTabletReq* request) { request->tablet_schema.schema_hash = 270068375; request->tablet_schema.short_key_column_count = 2; request->tablet_schema.keys_type = TKeysType::AGG_KEYS; - request->tablet_schema.storage_type = TStorageType::ROW; + request->tablet_schema.storage_type = TStorageType::COLUMN; TColumn k1; k1.column_name = "k1"; @@ -161,21 +166,18 @@ protected: ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); + // OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); // 1. Prepare for query split key. 
// create base tablet OLAPStatus res = OLAP_SUCCESS; set_default_create_tablet_request(&_create_tablet); - res = _command_executor->create_table(_create_tablet); + res = k_engine->create_table(_create_tablet); ASSERT_EQ(OLAP_SUCCESS, res); - _olap_table = _command_executor->get_table( + _olap_table = k_engine->get_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); ASSERT_TRUE(_olap_table.get() != NULL); - _header_file_name = _olap_table->header_file_name(); + _tablet_path = _olap_table->tablet_path(); } OLAPStatus push_empty_delta(int32_t version) { @@ -185,7 +187,7 @@ protected: push_req.version = version; push_req.version_hash = version; std::vector tablets_info; - return _command_executor->push(push_req, &tablets_info); + return k_engine->push(push_req, &tablets_info); } void TearDown() { @@ -193,20 +195,17 @@ protected: _olap_table.reset(); OLAPEngine::get_instance()->drop_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - while (0 == access(_header_file_name.c_str(), F_OK)) { + while (0 == access(_tablet_path.c_str(), F_OK)) { sleep(1); } ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - - SAFE_DELETE(_command_executor); } - typedef RepeatedPtrField del_cond_array; + typedef RepeatedPtrField del_cond_array; - std::string _header_file_name; - SmartOLAPTable _olap_table; + std::string _tablet_path; + OLAPTablePtr _olap_table; TCreateTabletReq _create_tablet; - CommandExecutor* _command_executor; DeleteConditionHandler _delete_condition_handler; }; @@ -478,21 +477,18 @@ protected: ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); // Initialize all singleton object. - OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); - - _command_executor = new(nothrow) CommandExecutor(); - ASSERT_TRUE(_command_executor != NULL); + // OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str()); // 1. Prepare for query split key. // create base tablet OLAPStatus res = OLAP_SUCCESS; set_default_create_tablet_request(&_create_tablet); - res = _command_executor->create_table(_create_tablet); + res = k_engine->create_table(_create_tablet); ASSERT_EQ(OLAP_SUCCESS, res); - _olap_table = _command_executor->get_table( + _olap_table = k_engine->get_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); ASSERT_TRUE(_olap_table.get() != NULL); - _header_file_name = _olap_table->header_file_name(); + _tablet_path = _olap_table->tablet_path(); } void TearDown() { @@ -500,20 +496,17 @@ protected: _olap_table.reset(); OLAPEngine::get_instance()->drop_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - while (0 == access(_header_file_name.c_str(), F_OK)) { + while (0 == access(_tablet_path.c_str(), F_OK)) { sleep(1); } ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); - - SAFE_DELETE(_command_executor); } - typedef RepeatedPtrField del_cond_array; + typedef RepeatedPtrField del_cond_array; - std::string _header_file_name; - SmartOLAPTable _olap_table; + std::string _tablet_path; + OLAPTablePtr _olap_table; TCreateTabletReq _create_tablet; - CommandExecutor* _command_executor; }; TEST_F(TestDeleteConditionHandler2, ValidConditionValue) { @@ -793,21 +786,18 @@ protected: ASSERT_EQ(create_dir(config::storage_root_path), OLAP_SUCCESS); // Initialize all singleton object. 
- OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str());
-
- _command_executor = new(nothrow) CommandExecutor();
- ASSERT_TRUE(_command_executor != NULL);
+ // OLAPRootPath::get_instance()->reload_root_paths(config::storage_root_path.c_str());
// 1. Prepare for query split key.
// create base tablet
OLAPStatus res = OLAP_SUCCESS;
set_default_create_tablet_request(&_create_tablet);
- res = _command_executor->create_table(_create_tablet);
+ res = k_engine->create_table(_create_tablet);
ASSERT_EQ(OLAP_SUCCESS, res);
- _olap_table = _command_executor->get_table(
+ _olap_table = k_engine->get_table(
_create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash);
ASSERT_TRUE(_olap_table.get() != NULL);
- _header_file_name = _olap_table->header_file_name();
+ _tablet_path = _olap_table->tablet_path();
_data_row_cursor.init(_olap_table->tablet_schema());
_data_row_cursor.allocate_memory_for_string_type(_olap_table->tablet_schema());
@@ -820,7 +810,7 @@ protected:
push_req.version = version;
push_req.version_hash = version;
std::vector tablets_info;
- return _command_executor->push(push_req, &tablets_info);
+ return k_engine->push(push_req, &tablets_info);
}
void TearDown() {
@@ -829,22 +819,19 @@ protected:
_delete_handler.finalize();
OLAPEngine::get_instance()->drop_table(
_create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash);
- while (0 == access(_header_file_name.c_str(), F_OK)) {
+ while (0 == access(_tablet_path.c_str(), F_OK)) {
sleep(1);
}
ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path));
-
- SAFE_DELETE(_command_executor);
}
- typedef RepeatedPtrField del_cond_array;
+ typedef RepeatedPtrField del_cond_array;
- std::string _header_file_name;
+ std::string _tablet_path;
RowCursor _data_row_cursor;
- SmartOLAPTable _olap_table;
+ OLAPTablePtr _olap_table;
TCreateTabletReq _create_tablet;
DeleteHandler _delete_handler;
- CommandExecutor* _command_executor;
};
TEST_F(TestDeleteHandler, InitSuccess) {
@@ -968,13 +955,15 @@ TEST_F(TestDeleteHandler, FilterDataSubconditions) {
data_str.push_back("YWFH");
data_str.push_back("YWFH==");
data_str.push_back("1");
- res = _data_row_cursor.from_string(data_str);
+ OlapTuple tuple1(data_str);
+ res = _data_row_cursor.from_tuple(tuple1);
ASSERT_EQ(OLAP_SUCCESS, res);
ASSERT_TRUE(_delete_handler.is_filter_data(1, _data_row_cursor));
// construct a row of test data
data_str[1] = "4";
- res = _data_row_cursor.from_string(data_str);
+ OlapTuple tuple2(data_str);
+ res = _data_row_cursor.from_tuple(tuple2);
ASSERT_EQ(OLAP_SUCCESS, res);
// does not satisfy sub-condition: k2 != 4
ASSERT_FALSE(_delete_handler.is_filter_data(1, _data_row_cursor));
@@ -1047,7 +1036,8 @@ TEST_F(TestDeleteHandler, FilterDataConditions) {
data_str.push_back("YWFH");
data_str.push_back("YWFH==");
data_str.push_back("1");
- res = _data_row_cursor.from_string(data_str);
+ OlapTuple tuple(data_str);
+ res = _data_row_cursor.from_tuple(tuple);
ASSERT_EQ(OLAP_SUCCESS, res);
// this row is filtered out by delete condition 3
ASSERT_TRUE(_delete_handler.is_filter_data(1, _data_row_cursor));
@@ -1108,7 +1098,8 @@ TEST_F(TestDeleteHandler, FilterDataVersion) {
data_str.push_back("YWFH");
data_str.push_back("YWFH==");
data_str.push_back("1");
- res = _data_row_cursor.from_string(data_str);
+ OlapTuple tuple(data_str);
+ res = _data_row_cursor.from_tuple(tuple);
ASSERT_EQ(OLAP_SUCCESS, res);
// if the data version is less than 6, delete condition 1 takes effect and this row is filtered
ASSERT_TRUE(_delete_handler.is_filter_data(1, _data_row_cursor));
diff --git a/be/test/olap/mock_command_executor.h
b/be/test/olap/mock_command_executor.h index ae6e90bf34..99b106feb6 100644 --- a/be/test/olap/mock_command_executor.h +++ b/be/test/olap/mock_command_executor.h @@ -17,14 +17,14 @@ #define BDG_PALO_BE_SRC_OLAP_MOCK_MOCK_COMMAND_EXECUTOR_H #include "gmock/gmock.h" -#include "olap/command_executor.h" +#include "olap/olap_engine.h" namespace palo { -class MockCommandExecutor : public CommandExecutor { +class MockCommandExecutor : public OLAPEngine { public: MOCK_METHOD1(create_table, OLAPStatus(const TCreateTabletReq& request)); - MOCK_METHOD2(get_table, SmartOLAPTable(TTabletId tablet_id, TSchemaHash schema_hash)); + MOCK_METHOD2(get_table, OLAPTablePtr(TTabletId tablet_id, TSchemaHash schema_hash)); MOCK_METHOD1(drop_table, OLAPStatus(const TDropTabletReq& request)); MOCK_METHOD2( push, @@ -86,6 +86,23 @@ public: MOCK_METHOD1( get_all_root_path_info, OLAPStatus(std::vector* root_paths_info)); + MOCK_METHOD2( + publish_version, + OLAPStatus(const TPublishVersionRequest& request, + std::vector* error_tablet_ids)); + MOCK_METHOD3( + get_info_before_incremental_clone, + std::string( + OLAPTablePtr tablet, + int64_t committed_version, + std::vector* missing_versions)); + MOCK_METHOD4( + finish_clone, + OLAPStatus( + OLAPTablePtr tablet, + const std::string& clone_dir, + int64_t committed_version, + bool is_incremental_clone)); }; } // namespace palo diff --git a/be/test/olap/olap_header_manager_test.cpp b/be/test/olap/olap_header_manager_test.cpp old mode 100644 new mode 100755 diff --git a/be/test/olap/olap_reader_test.cpp b/be/test/olap/olap_reader_test.cpp index 56586adac4..721e9ded73 100755 --- a/be/test/olap/olap_reader_test.cpp +++ b/be/test/olap/olap_reader_test.cpp @@ -56,7 +56,7 @@ void set_default_create_tablet_request(TCreateTabletReq* request) { request->__set_version_hash(0); request->tablet_schema.schema_hash = 1508825676; request->tablet_schema.short_key_column_count = 2; - request->tablet_schema.storage_type = TStorageType::ROW; + request->tablet_schema.storage_type = TStorageType::COLUMN; TColumn k1; k1.column_name = "k1"; @@ -146,7 +146,7 @@ public: // Remove all dir. OLAPEngine::get_instance()->drop_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - while (0 == access(_header_file_name.c_str(), F_OK)) { + while (0 == access(_tablet_path.c_str(), F_OK)) { sleep(1); } ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); @@ -168,10 +168,10 @@ public: CommandExecutor command_executor = CommandExecutor(); res = command_executor.create_table(_create_tablet); ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = command_executor.get_table( + OLAPTablePtr tablet = command_executor.get_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); ASSERT_TRUE(tablet.get() != NULL); - _header_file_name = tablet->header_file_name(); + _tablet_path = tablet->tablet_path(); // push data set_default_push_request(&_push_req); @@ -435,7 +435,7 @@ public: private: TCreateTabletReq _create_tablet; - std::string _header_file_name; + std::string _tablet_path; TPushReq _push_req; TPlanNode _tnode; @@ -681,7 +681,7 @@ public: // Remove all dir. 
OLAPEngine::get_instance()->drop_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - while (0 == access(_header_file_name.c_str(), F_OK)) { + while (0 == access(_tablet_path.c_str(), F_OK)) { sleep(1); } ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); @@ -704,10 +704,10 @@ public: CommandExecutor command_executor = CommandExecutor(); res = command_executor.create_table(_create_tablet); ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = command_executor.get_table( + OLAPTablePtr tablet = command_executor.get_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); ASSERT_TRUE(tablet.get() != NULL); - _header_file_name = tablet->header_file_name(); + _tablet_path = tablet->tablet_path(); // push data set_default_push_request(&_push_req); @@ -972,7 +972,7 @@ public: private: TCreateTabletReq _create_tablet; - std::string _header_file_name; + std::string _tablet_path; TPushReq _push_req; TPlanNode _tnode; @@ -1165,7 +1165,7 @@ public: // Remove all dir. OLAPEngine::get_instance()->drop_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - while (0 == access(_header_file_name.c_str(), F_OK)) { + while (0 == access(_tablet_path.c_str(), F_OK)) { sleep(1); } ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); @@ -1187,10 +1187,10 @@ public: CommandExecutor command_executor = CommandExecutor(); res = command_executor.create_table(_create_tablet); ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = command_executor.get_table( + OLAPTablePtr tablet = command_executor.get_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); ASSERT_TRUE(tablet.get() != NULL); - _header_file_name = tablet->header_file_name(); + _tablet_path = tablet->tablet_path(); // push data set_default_push_request(&_push_req); @@ -1472,7 +1472,7 @@ public: private: TCreateTabletReq _create_tablet; - std::string _header_file_name; + std::string _tablet_path; TPushReq _push_req; TPushReq _delete_req; diff --git a/be/test/olap/row_cursor_test.cpp b/be/test/olap/row_cursor_test.cpp index 7edc3df87c..0b5ab01cda 100644 --- a/be/test/olap/row_cursor_test.cpp +++ b/be/test/olap/row_cursor_test.cpp @@ -299,12 +299,13 @@ TEST_F(TestRowCursor, InitRowCursorWithScanKey) { ASSERT_EQ(row.get_fixed_len(), 34); ASSERT_EQ(row.get_variable_len(), 39); - res = row.from_string(scan_keys); + OlapTuple tuple1(scan_keys); + res = row.from_tuple(tuple1); ASSERT_EQ(res, OLAP_SUCCESS); - std::vector vec_string = row.to_string_vector(); - ASSERT_TRUE(strncmp(vec_string[0].c_str(), "0&char_exceed_length", vec_string[0].size())); - ASSERT_TRUE(strncmp(vec_string[1].c_str(), "0&varchar_exceed_length", vec_string[1].size())); + OlapTuple tuple2 = row.to_tuple(); + ASSERT_TRUE(strncmp(tuple2.get_value(0).c_str(), "0&char_exceed_length", 20)); + ASSERT_TRUE(strncmp(tuple2.get_value(1).c_str(), "0&varchar_exceed_length", 23)); } TEST_F(TestRowCursor, SetMinAndMaxKey) { diff --git a/be/test/olap/run_length_integer_test.cpp b/be/test/olap/run_length_integer_test.cpp index 86cd0c9ff3..82159d0a0e 100755 --- a/be/test/olap/run_length_integer_test.cpp +++ b/be/test/olap/run_length_integer_test.cpp @@ -841,12 +841,6 @@ TEST_F(TestRunLengthSignInteger, DirectEncodingForDeltaOverflows2) { } int main(int argc, char** argv) { - std::string conffile = std::string(getenv("DORIS_HOME")) + "/conf/be.conf"; - if (!palo::config::init(conffile.c_str(), false)) { - fprintf(stderr, "error read config file. 
\n"); - return -1; - } - palo::init_glog("be-test"); int ret = palo::OLAP_SUCCESS; testing::InitGoogleTest(&argc, argv); ret = RUN_ALL_TESTS(); diff --git a/be/test/olap/vectorized_olap_reader_test.cpp b/be/test/olap/vectorized_olap_reader_test.cpp index 3d0ca39534..5a00f674c2 100644 --- a/be/test/olap/vectorized_olap_reader_test.cpp +++ b/be/test/olap/vectorized_olap_reader_test.cpp @@ -59,7 +59,7 @@ void set_default_create_tablet_request(TCreateTabletReq* request) { request->__set_version_hash(0); request->tablet_schema.schema_hash = 1508825676; request->tablet_schema.short_key_column_count = 2; - request->tablet_schema.storage_type = TStorageType::ROW; + request->tablet_schema.storage_type = TStorageType::COLUMN; TColumn k1; k1.column_name = "k1"; @@ -148,7 +148,7 @@ public: // Remove all dir. OLAPEngine::get_instance()->drop_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); - while (0 == access(_header_file_name.c_str(), F_OK)) { + while (0 == access(_tablet_name.c_str(), F_OK)) { sleep(1); } ASSERT_EQ(OLAP_SUCCESS, remove_all_dir(config::storage_root_path)); @@ -170,10 +170,10 @@ public: CommandExecutor command_executor = CommandExecutor(); res = command_executor.create_table(_create_tablet); ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = command_executor.get_table( + OLAPTablePtr tablet = command_executor.get_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); ASSERT_TRUE(tablet.get() != NULL); - _header_file_name = tablet->header_file_name(); + _tablet_name = tablet->tablet_name(); // push data set_default_push_request(&_push_req); @@ -200,10 +200,10 @@ public: CommandExecutor command_executor = CommandExecutor(); res = command_executor.create_table(_create_tablet); ASSERT_EQ(OLAP_SUCCESS, res); - SmartOLAPTable tablet = command_executor.get_table( + OLAPTablePtr tablet = command_executor.get_table( _create_tablet.tablet_id, _create_tablet.tablet_schema.schema_hash); ASSERT_TRUE(tablet.get() != NULL); - _header_file_name = tablet->header_file_name(); + _tablet_name = tablet->tablet_name(); // push data set_default_push_request(&_push_req); @@ -528,7 +528,7 @@ public: } private: TCreateTabletReq _create_tablet; - std::string _header_file_name; + std::string _tablet_name; TPushReq _push_req; TPlanNode _tnode; diff --git a/be/test/runtime/CMakeLists.txt b/be/test/runtime/CMakeLists.txt index 869278f4cc..5b9dcb7d92 100644 --- a/be/test/runtime/CMakeLists.txt +++ b/be/test/runtime/CMakeLists.txt @@ -53,5 +53,7 @@ ADD_BE_TEST(disk_io_mgr_test) ADD_BE_TEST(mem_limit_test) ADD_BE_TEST(buffered_block_mgr2_test) ADD_BE_TEST(buffered_tuple_stream2_test) +ADD_BE_TEST(stream_load_pipe_test) +ADD_BE_TEST(tablet_writer_mgr_test) #ADD_BE_TEST(export_task_mgr_test) ADD_BE_TEST(snapshot_loader_test) diff --git a/be/test/util/CMakeLists.txt b/be/test/util/CMakeLists.txt index 71d37deb0a..83dee99593 100644 --- a/be/test/util/CMakeLists.txt +++ b/be/test/util/CMakeLists.txt @@ -32,5 +32,10 @@ ADD_BE_TEST(cidr_test) ADD_BE_TEST(new_metrics_test) ADD_BE_TEST(palo_metrics_test) ADD_BE_TEST(system_metrics_test) +ADD_BE_TEST(string_util_test) ADD_BE_TEST(core_local_test) ADD_BE_TEST(types_test) +ADD_BE_TEST(json_util_test) +ADD_BE_TEST(byte_buffer_test2) +ADD_BE_TEST(uid_util_test) +ADD_BE_TEST(arena_test) diff --git a/be/test/util/system_metrics_test.cpp b/be/test/util/system_metrics_test.cpp index 32b854c54e..9af9e945fe 100644 --- a/be/test/util/system_metrics_test.cpp +++ b/be/test/util/system_metrics_test.cpp @@ -284,6 +284,12 @@ 
TEST_F(SystemMetricsTest, no_proc_file) { } int main(int argc, char** argv) { + std::string conffile = std::string(getenv("DORIS_HOME")) + "/conf/be.conf"; + if (!palo::config::init(conffile.c_str(), false)) { + fprintf(stderr, "error read config file. \n"); + return -1; + } + palo::init_glog("be-test"); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/build.sh b/build.sh index 21d6c44f6c..5a5f1a84b1 100755 --- a/build.sh +++ b/build.sh @@ -33,7 +33,7 @@ export DORIS_HOME=${ROOT} . ${DORIS_HOME}/env.sh -PARALLEL=8 +PARALLEL=4 # Check args usage() { diff --git a/fe/pom.xml b/fe/pom.xml index eb22db3141..d7227a76cd 100644 --- a/fe/pom.xml +++ b/fe/pom.xml @@ -68,7 +68,7 @@ under the License. general-env - !env.THIRDPARTY_REPO + !env.CUSTOM_MAVEN_REPO diff --git a/fe/src/main/cup/sql_parser.cup b/fe/src/main/cup/sql_parser.cup index e703fd7ef8..74848859be 100644 --- a/fe/src/main/cup/sql_parser.cup +++ b/fe/src/main/cup/sql_parser.cup @@ -193,13 +193,13 @@ parser code {: :}; // Total keywords of palo -terminal String KW_ADD, KW_AFTER, KW_AGGREGATE, KW_ALL, KW_ALTER, KW_AND, KW_ANTI, KW_AS, KW_ASC, KW_AUTHORS, +terminal String KW_ADD, KW_ADMIN, KW_AFTER, KW_AGGREGATE, KW_ALL, KW_ALTER, KW_AND, KW_ANTI, KW_AS, KW_ASC, KW_AUTHORS, KW_BACKEND, KW_BACKUP, KW_BETWEEN, KW_BEGIN, KW_BIGINT, KW_BOOLEAN, KW_BOTH, KW_BROKER, KW_BACKENDS, KW_BY, KW_CANCEL, KW_CASE, KW_CAST, KW_CHAIN, KW_CHAR, KW_CHARSET, KW_CLUSTER, KW_CLUSTERS, KW_COLLATE, KW_COLLATION, KW_COLUMN, KW_COLUMNS, KW_COMMENT, KW_COMMIT, KW_COMMITTED, KW_CONNECTION, KW_CONNECTION_ID, KW_CONSISTENT, KW_COUNT, KW_CREATE, KW_CROSS, KW_CURRENT, KW_CURRENT_USER, KW_DATA, KW_DATABASE, KW_DATABASES, KW_DATE, KW_DATETIME, KW_DECIMAL, KW_DECOMMISSION, KW_DEFAULT, KW_DESC, KW_DESCRIBE, - KW_DELETE, KW_DISTINCT, KW_DISTINCTPC, KW_DISTINCTPCSA, KW_DISTRIBUTED, KW_BUCKETS, KW_DIV, KW_DOUBLE, KW_DROP, KW_DROPP, KW_DUPLICATE, + KW_DELETE, KW_DISTINCT, KW_DISTINCTPC, KW_DISTINCTPCSA, KW_DISTRIBUTED, KW_DISTRIBUTION, KW_BUCKETS, KW_DIV, KW_DOUBLE, KW_DROP, KW_DROPP, KW_DUPLICATE, KW_ELSE, KW_END, KW_ENGINE, KW_ENGINES, KW_ENTER, KW_ERRORS, KW_EVENTS, KW_EXISTS, KW_EXPORT, KW_EXTERNAL, KW_EXTRACT, KW_FALSE, KW_FOLLOWER, KW_FOLLOWING, KW_FREE, KW_FROM, KW_FIRST, KW_FLOAT, KW_FOR, KW_FRONTENDS, KW_FULL, KW_FUNCTION, KW_GLOBAL, KW_GRANT, KW_GRANTS, KW_GROUP, @@ -218,7 +218,7 @@ terminal String KW_ADD, KW_AFTER, KW_AGGREGATE, KW_ALL, KW_ALTER, KW_AND, KW_ANT KW_PROC, KW_PROCEDURE, KW_PROCESSLIST, KW_PROPERTIES, KW_PROPERTY, KW_QUERY, KW_QUOTA, KW_RANDOM, KW_RANGE, KW_READ, KW_RECOVER, KW_REGEXP, KW_RELEASE, KW_RENAME, - KW_REPEATABLE, KW_REPOSITORY, KW_REPOSITORIES, KW_REPLACE, KW_RESOURCE, KW_RESTORE, KW_REVOKE, + KW_REPEATABLE, KW_REPOSITORY, KW_REPOSITORIES, KW_REPLACE, KW_REPLICA, KW_RESOURCE, KW_RESTORE, KW_REVOKE, KW_RIGHT, KW_ROLE, KW_ROLES, KW_ROLLBACK, KW_ROLLUP, KW_ROW, KW_ROWS, KW_SCHEMAS, KW_SELECT, KW_SEMI, KW_SERIALIZABLE, KW_SESSION, KW_SET, KW_SHOW, KW_SMALLINT, KW_SNAPSHOT, KW_SONAME, KW_SPLIT, KW_START, KW_STATUS, KW_STORAGE, KW_STRING, @@ -245,7 +245,10 @@ terminal String COMMENTED_PLAN_HINTS; // Statement that the result of this parser. 
nonterminal StatementBase query, stmt, show_stmt, show_param, help_stmt, load_stmt, describe_stmt, alter_stmt, use_stmt, kill_stmt, drop_stmt, recover_stmt, grant_stmt, revoke_stmt, create_stmt, set_stmt, sync_stmt, cancel_stmt, cancel_param, delete_stmt, - link_stmt, migrate_stmt, enter_stmt, unsupported_stmt, export_stmt; + link_stmt, migrate_stmt, enter_stmt, unsupported_stmt, export_stmt, admin_stmt, import_columns_stmt, import_where_stmt; + +nonterminal ImportColumnDesc import_column_desc; +nonterminal List import_column_descs; // unsupported statement nonterminal opt_with_consistent_snapshot, opt_work, opt_chain, opt_release; @@ -381,7 +384,7 @@ nonterminal String opt_from_rollup, opt_to_rollup; nonterminal ColumnPosition opt_col_pos; // Alter statement -nonterminal AlterClause alter_system_clause, alter_cluster_clause, alter_table_clause, alter_user_clause; +nonterminal AlterClause alter_system_clause, alter_cluster_clause, alter_table_clause; nonterminal List alter_table_clause_list; // @@ -423,6 +426,51 @@ query ::= {: RESULT = stmt; :} + | import_columns_stmt:stmt + {: + RESULT = stmt; + :} + | import_where_stmt:stmt + {: + RESULT = stmt; + :} + ; + +import_columns_stmt ::= + KW_COLUMNS import_column_descs:columns + {: + RESULT = new ImportColumnsStmt(columns); + :} + ; + +import_column_descs ::= + import_column_desc:column + {: + RESULT = Lists.newArrayList(column); + :} + | import_column_descs:columns COMMA import_column_desc:column + {: + columns.add(column); + RESULT = columns; + :} + ; + +import_column_desc ::= + ident:name + {: + RESULT = new ImportColumnDesc(name, null); + :} + | ident:name EQUAL expr:expr + {: + RESULT = new ImportColumnDesc(name, expr); + :} + ; + +import_where_stmt ::= + KW_WHERE expr:expr + {: + RESULT = new ImportWhereStmt(expr); + :} ; stmt ::= @@ -476,6 +524,8 @@ stmt ::= {: RESULT = stmt; :} | export_stmt : stmt {: RESULT = stmt; :} + | admin_stmt : stmt + {: RESULT = stmt; :} | /* empty: query only has comments */ {: RESULT = new EmptyStmt(); @@ -543,10 +593,6 @@ alter_stmt ::= {: RESULT = new AlterDatabaseRename(dbName, newDbName); :} - | KW_ALTER KW_USER user_identity:userIdent alter_user_clause:clause - {: - RESULT = new AlterUserStmt(userIdent, clause); - :} ; opt_user ::= @@ -749,16 +795,6 @@ alter_cluster_clause ::= RESULT = new AlterClusterClause(AlterClusterType.ALTER_CLUSTER_PROPERTIES, properties); :} ; -alter_user_clause ::= - KW_ADD KW_WHITELIST string_list:hostPorts - {: - RESULT = new AlterUserClause(AlterUserType.ADD_USER_WHITELIST, hostPorts); - :} - | KW_DELETE KW_WHITELIST string_list:hostPorts - {: - RESULT = new AlterUserClause(AlterUserType.DELETE_USER_WHITELIST, hostPorts); - :} - ; // Sync Stmt sync_stmt ::= @@ -3480,6 +3516,17 @@ integer_list ::= :} ; +admin_stmt ::= + KW_ADMIN KW_SHOW KW_REPLICA KW_STATUS KW_FROM base_table_ref:table_ref opt_wild_where + {: + RESULT = new AdminShowReplicaStatusStmt(table_ref, parser.where); + :} + | KW_ADMIN KW_SHOW KW_REPLICA KW_DISTRIBUTION KW_FROM base_table_ref:table_ref + {: + RESULT = new AdminShowReplicaDistributionStmt(table_ref); + :} + ; + unsupported_stmt ::= KW_START KW_TRANSACTION opt_with_consistent_snapshot:v {: diff --git a/fe/src/main/java/com/baidu/palo/alter/AlterHandler.java b/fe/src/main/java/com/baidu/palo/alter/AlterHandler.java index a17eb1ef5c..bb54af1c52 100644 --- a/fe/src/main/java/com/baidu/palo/alter/AlterHandler.java +++ b/fe/src/main/java/com/baidu/palo/alter/AlterHandler.java @@ -26,176 +26,125 @@ import com.baidu.palo.common.DdlException; import 
com.baidu.palo.common.MetaNotFoundException; import com.baidu.palo.common.util.Daemon; import com.baidu.palo.common.util.TimeUtils; -import com.baidu.palo.system.Backend; -import com.baidu.palo.system.BackendEvent; -import com.baidu.palo.system.BackendEvent.BackendEventType; -import com.baidu.palo.system.SystemInfoObserver; import com.baidu.palo.task.AgentTask; import com.baidu.palo.thrift.TTabletInfo; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.locks.ReentrantLock; public abstract class AlterHandler extends Daemon { private static final Logger LOG = LogManager.getLogger(AlterHandler.class); // tableId -> AlterJob - protected Map alterJobs; - - protected List finishedOrCancelledAlterJobs; - - // observe cluster changed - protected AlterHandlerSystemInfoObserver clusterInfoObserver; - - /* - * ATTN: - * lock order is: - * db lock - * jobs lock - * synchronized - * - * if reversal is inevitable. use db.tryLock() instead to avoid dead lock - */ - protected ReentrantReadWriteLock jobsLock; + protected ConcurrentHashMap alterJobs = new ConcurrentHashMap(); - public void readLock() { - jobsLock.readLock().lock(); + protected ConcurrentLinkedQueue finishedOrCancelledAlterJobs = new ConcurrentLinkedQueue(); + + /* + * lock to perform atomic operations. + * eg. + * When job is finished, it will be moved from alterJobs to finishedOrCancelledAlterJobs, + * and this requires atomic operations. So the lock must be held to do this operations. + * Operations like Get or Put do not need lock. 
+ */ + protected ReentrantLock lock = new ReentrantLock(); + + protected void lock() { + lock.lock(); } - public void readUnLock() { - jobsLock.readLock().unlock(); + protected void unlock() { + lock.unlock(); } public AlterHandler(String name) { super(name); - alterJobs = new HashMap(); - finishedOrCancelledAlterJobs = new LinkedList(); - jobsLock = new ReentrantReadWriteLock(); - - clusterInfoObserver = new AlterHandlerSystemInfoObserver(name); } protected void addAlterJob(AlterJob alterJob) { - this.jobsLock.writeLock().lock(); - try { - LOG.info("add {} job[{}]", alterJob.getType(), alterJob.getTableId()); - this.alterJobs.put(alterJob.getTableId(), alterJob); - } finally { - this.jobsLock.writeLock().unlock(); - } + this.alterJobs.put(alterJob.getTableId(), alterJob); + LOG.info("add {} job[{}]", alterJob.getType(), alterJob.getTableId()); } public AlterJob getAlterJob(long tableId) { - this.jobsLock.readLock().lock(); - try { - return this.alterJobs.get(tableId); - } finally { - this.jobsLock.readLock().unlock(); - } + return this.alterJobs.get(tableId); + } + + public boolean hasUnfinishedAlterJob(long tableId) { + return this.alterJobs.containsKey(tableId); } public int getAlterJobNum(JobState state, long dbId) { int jobNum = 0; - this.jobsLock.readLock().lock(); - try { - switch (state) { - case PENDING: - for (AlterJob alterJob : alterJobs.values()) { - if (alterJob.getState() == JobState.PENDING && alterJob.getDbId() == dbId) { - ++jobNum; - } - } - break; - case RUNNING: - for (AlterJob alterJob : alterJobs.values()) { - if (alterJob.getState() == JobState.RUNNING && alterJob.getDbId() == dbId) { - ++jobNum; - } - } - break; - case FINISHED: - for (AlterJob alterJob : alterJobs.values()) { - if (alterJob.getState() == JobState.FINISHED && alterJob.getDbId() == dbId) { - ++jobNum; - } + if (state == JobState.PENDING || state == JobState.RUNNING || state == JobState.FINISHING) { + for (AlterJob alterJob : alterJobs.values()) { + if (alterJob.getState() == state && alterJob.getDbId() == dbId) { + ++jobNum; + } + } + } else if (state == JobState.FINISHED) { + // lock to perform atomically + lock(); + try { + for (AlterJob alterJob : alterJobs.values()) { + if (alterJob.getState() == JobState.FINISHED && alterJob.getDbId() == dbId) { + ++jobNum; } + } - for (AlterJob alterJob : finishedOrCancelledAlterJobs) { - if (alterJob.getState() == JobState.FINISHED && alterJob.getDbId() == dbId) { - ++jobNum; - } + for (AlterJob alterJob : finishedOrCancelledAlterJobs) { + if (alterJob.getState() == JobState.FINISHED && alterJob.getDbId() == dbId) { + ++jobNum; } - - if (this instanceof SchemaChangeHandler) { - jobNum += ((SchemaChangeHandler) this).getDelayDeletingJobNum(dbId); - } - - break; - case CANCELLED: - for (AlterJob alterJob : finishedOrCancelledAlterJobs) { - if (alterJob.getState() == JobState.CANCELLED && alterJob.getDbId() == dbId) { - ++jobNum; - } - } - break; - default: - break; + } + } finally { + unlock(); } - return jobNum; - } finally { - this.jobsLock.readLock().unlock(); + } else if (state == JobState.CANCELLED) { + for (AlterJob alterJob : finishedOrCancelledAlterJobs) { + if (alterJob.getState() == JobState.CANCELLED && alterJob.getDbId() == dbId) { + ++jobNum; + } + } } + + return jobNum; } public Map unprotectedGetAlterJobs() { return this.alterJobs; } - public List unprotectedGetFinishedOrCancelledAlterJobs() { + public ConcurrentLinkedQueue unprotectedGetFinishedOrCancelledAlterJobs() { return this.finishedOrCancelledAlterJobs; } - + public void 
addFinishedOrCancelledAlterJob(AlterJob alterJob) { alterJob.clear(); - this.jobsLock.writeLock().lock(); - try { - LOG.info("add {} job[{}] to finished or cancel list", alterJob.getType(), alterJob.getTableId()); - this.finishedOrCancelledAlterJobs.add(alterJob); - } finally { - this.jobsLock.writeLock().unlock(); - } + LOG.info("add {} job[{}] to finished or cancel list", alterJob.getType(), alterJob.getTableId()); + this.finishedOrCancelledAlterJobs.add(alterJob); } protected AlterJob removeAlterJob(long tableId) { - this.jobsLock.writeLock().lock(); - try { - return this.alterJobs.remove(tableId); - } finally { - this.jobsLock.writeLock().unlock(); - } + return this.alterJobs.remove(tableId); } public void removeDbAlterJob(long dbId) { - this.jobsLock.writeLock().lock(); - try { - Iterator> iterator = alterJobs.entrySet().iterator(); - while (iterator.hasNext()) { - Map.Entry entry = iterator.next(); - AlterJob alterJob = entry.getValue(); - if (alterJob.getDbId() == dbId) { - iterator.remove(); - } + Iterator> iterator = alterJobs.entrySet().iterator(); + while (iterator.hasNext()) { + Map.Entry entry = iterator.next(); + AlterJob alterJob = entry.getValue(); + if (alterJob.getDbId() == dbId) { + iterator.remove(); } - } finally { - this.jobsLock.writeLock().unlock(); } } @@ -221,87 +170,79 @@ public abstract class AlterHandler extends Daemon { */ public void cancelWithTable(OlapTable olapTable) { // make sure to hold to db write lock before calling this - this.jobsLock.writeLock().lock(); - try { - if (alterJobs.containsKey(olapTable.getId())) { - AlterJob alterJob = alterJobs.remove(olapTable.getId()); - alterJob.cancel(olapTable, "table is dropped"); - this.finishedOrCancelledAlterJobs.add(alterJob); - LOG.info("cancel {} job in table[{}] finished", alterJob.getType(), olapTable.getId()); - } - } finally { - this.jobsLock.writeLock().unlock(); + AlterJob alterJob = getAlterJob(olapTable.getId()); + if (alterJob == null) { + return; } - } + alterJob.cancel(olapTable, "table is dropped"); - /* - * when backend is removed or dead, handle related replicas - * backendId: - * id of backend which is removed or dead - */ - private void handleBackendRemoveEvent(long backendId) { - this.jobsLock.readLock().lock(); + // remove from alterJobs and add to finishedOrCancelledAlterJobs operation should be perform atomically + lock(); try { - Iterator> iterator = this.alterJobs.entrySet().iterator(); - while (iterator.hasNext()) { - AlterJob job = iterator.next().getValue(); - job.handleBackendRemoveEvent(backendId); + alterJob = alterJobs.remove(olapTable.getId()); + if (alterJob != null) { + alterJob.clear(); + finishedOrCancelledAlterJobs.add(alterJob); } } finally { - this.jobsLock.readLock().unlock(); + unlock(); } } protected void cancelInternal(AlterJob alterJob, OlapTable olapTable, String msg) { - // remove job - removeAlterJob(alterJob.getTableId()); - // cancel alterJob.cancel(olapTable, msg); + jobDone(alterJob); + } - // add to finishedOrCancelledAlterJobs - addFinishedOrCancelledAlterJob(alterJob); + protected void jobDone(AlterJob alterJob) { + lock(); + try { + // remove job + AlterJob alterJobRemoved = removeAlterJob(alterJob.getTableId()); + // add to finishedOrCancelledAlterJobs + if (alterJobRemoved != null) { + // add alterjob not alterJobRemoved, because the alterjob maybe a new object + // deserialized from journal, and the finished state is set to the new object + addFinishedOrCancelledAlterJob(alterJob); + } + } finally { + unlock(); + } } public void replayInitJob(AlterJob 
alterJob, Catalog catalog) { Database db = catalog.getDb(alterJob.getDbId()); - db.writeLock(); - try { - alterJob.unprotectedReplayInitJob(db); - // add rollup job - addAlterJob(alterJob); - } finally { - db.writeUnlock(); - } + alterJob.replayInitJob(db); + // add rollup job + addAlterJob(alterJob); + } + + public void replayFinishing(AlterJob alterJob, Catalog catalog) { + Database db = catalog.getDb(alterJob.getDbId()); + alterJob.replayFinishing(db); + alterJob.setState(JobState.FINISHING); + // !!! the alter job should add to the cache again, because the alter job is deserialized from journal + // it is a different object compared to the cache + addAlterJob(alterJob); } public void replayFinish(AlterJob alterJob, Catalog catalog) { Database db = catalog.getDb(alterJob.getDbId()); - db.writeLock(); - try { - removeAlterJob(alterJob.getTableId()); - alterJob.unprotectedReplayFinish(db); - alterJob.setState(JobState.FINISHED); - addFinishedOrCancelledAlterJob(alterJob); - } finally { - db.writeUnlock(); - } + alterJob.replayFinish(db); + alterJob.setState(JobState.FINISHED); + + jobDone(alterJob); } public void replayCancel(AlterJob alterJob, Catalog catalog) { removeAlterJob(alterJob.getTableId()); - alterJob.setState(JobState.CANCELLED); Database db = catalog.getDb(alterJob.getDbId()); if (db != null) { // we log rollup job cancelled even if db is dropped. // so check db != null here - db.writeLock(); - try { - alterJob.unprotectedReplayCancel(db); - } finally { - db.writeUnlock(); - } + alterJob.replayCancel(db); } addFinishedOrCancelledAlterJob(alterJob); @@ -310,26 +251,20 @@ public abstract class AlterHandler extends Daemon { @Override protected void runOneCycle() { // clean history job - this.jobsLock.writeLock().lock(); - try { - Iterator iter = finishedOrCancelledAlterJobs.iterator(); - while (iter.hasNext()) { - AlterJob historyJob = iter.next(); - if ((System.currentTimeMillis() - historyJob.getCreateTimeMs()) / 1000 > Config.label_keep_max_second) { - iter.remove(); - LOG.info("remove history {} job[{}]. created at {}", historyJob.getType(), - historyJob.getTableId(), TimeUtils.longToTimeString(historyJob.getCreateTimeMs())); - } + Iterator iter = finishedOrCancelledAlterJobs.iterator(); + while (iter.hasNext()) { + AlterJob historyJob = iter.next(); + if ((System.currentTimeMillis() - historyJob.getCreateTimeMs()) / 1000 > Config.label_keep_max_second) { + iter.remove(); + LOG.info("remove history {} job[{}]. 
created at {}", historyJob.getType(), + historyJob.getTableId(), TimeUtils.longToTimeString(historyJob.getCreateTimeMs())); } - } finally { - this.jobsLock.writeLock().unlock(); } } @Override public void start() { // register observer - Catalog.getCurrentSystemInfo().registerObserver(clusterInfoObserver); super.start(); } @@ -352,58 +287,13 @@ public abstract class AlterHandler extends Daemon { */ public abstract void cancel(CancelStmt stmt) throws DdlException; - private class AlterHandlerSystemInfoObserver extends SystemInfoObserver { - - public AlterHandlerSystemInfoObserver(String name) { - super(name); - } - - @Override - public void listen(BackendEvent backendEvent) { - BackendEventType type = backendEvent.getType(); - Long[] backendIds = backendEvent.getBackendIds(); - LOG.info("catch backend event: {}", backendEvent.toString()); - switch (type) { - case BACKEND_DROPPED: - case BACKEND_DECOMMISSION: - for (int i = 0; i < backendIds.length; i++) { - handleBackendRemoveEvent(backendIds[i]); - } - break; - case BACKEND_DOWN: - for (int i = 0; i < backendIds.length; i++) { - Backend backend = Catalog.getCurrentSystemInfo().getBackend(backendIds[i]); - if (backend != null) { - long currentTime = System.currentTimeMillis(); - if (currentTime - backend.getLastUpdateMs() > Config.max_backend_down_time_second * 1000) { - // this backend is done for a long time and not restart automatically. - // we consider it as dead - LOG.warn("backend[{}-{}] is down for a long time. last heartbeat: {}", - backendIds[i], backend.getHost(), - TimeUtils.longToTimeString(backend.getLastUpdateMs())); - handleBackendRemoveEvent(backendIds[i]); - } - } - } - break; - default: - break; - } - } - } - public Integer getAlterJobNumByState(JobState state) { int jobNum = 0; - this.jobsLock.readLock().lock(); - try { - for (AlterJob alterJob : alterJobs.values()) { - if (alterJob.getState() == state) { - ++jobNum; - } + for (AlterJob alterJob : alterJobs.values()) { + if (alterJob.getState() == state) { + ++jobNum; } - return jobNum; - } finally { - this.jobsLock.readLock().unlock(); } + return jobNum; } } diff --git a/fe/src/main/java/com/baidu/palo/alter/AlterJob.java b/fe/src/main/java/com/baidu/palo/alter/AlterJob.java index 73e1fc6b11..974c86fd3f 100644 --- a/fe/src/main/java/com/baidu/palo/alter/AlterJob.java +++ b/fe/src/main/java/com/baidu/palo/alter/AlterJob.java @@ -20,12 +20,12 @@ import com.baidu.palo.catalog.Database; import com.baidu.palo.catalog.OlapTable; import com.baidu.palo.catalog.Replica; import com.baidu.palo.common.Config; +import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.MetaNotFoundException; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.io.Writable; import com.baidu.palo.system.Backend; -import com.baidu.palo.system.BackendEvent; -import com.baidu.palo.system.BackendEvent.BackendEventType; +import com.baidu.palo.task.AgentBatchTask; import com.baidu.palo.task.AgentTask; import com.baidu.palo.thrift.TResourceInfo; import com.baidu.palo.thrift.TTabletInfo; @@ -33,7 +33,6 @@ import com.baidu.palo.thrift.TTabletInfo; import com.google.common.base.Preconditions; import com.google.common.collect.HashMultimap; import com.google.common.collect.Multimap; -import com.google.common.collect.Sets; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -41,7 +40,7 @@ import org.apache.logging.log4j.Logger; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.util.Set; +import java.util.List; 
public abstract class AlterJob implements Writable { private static final Logger LOG = LogManager.getLogger(AlterJob.class); @@ -49,6 +48,7 @@ public abstract class AlterJob implements Writable { public enum JobState { PENDING, RUNNING, + FINISHING, FINISHED, CANCELLED } @@ -79,6 +79,10 @@ public abstract class AlterJob implements Writable { protected long dbId; protected long tableId; + protected long transactionId = -1; + // not serialize it + protected boolean hasPreviousLoadFinished = false; + protected AgentBatchTask batchClearAlterTask = null; protected long createTime; protected long finishedTime; @@ -86,7 +90,7 @@ public abstract class AlterJob implements Writable { protected String cancelMsg; protected TResourceInfo resourceInfo; - + // backendId -> replicaIds // this map show which replica is still alive // if backend is down, replica is not reachable in BE, remove replica from this map @@ -95,12 +99,12 @@ public abstract class AlterJob implements Writable { public AlterJob(JobType type) { // for persist this.type = type; - this.backendIdToReplicaIds = HashMultimap.create(); this.state = JobState.PENDING; this.createTime = System.currentTimeMillis(); this.finishedTime = -1L; + this.backendIdToReplicaIds = HashMultimap.create(); } public AlterJob(JobType type, long dbId, long tableId, TResourceInfo resourceInfo) { @@ -115,7 +119,6 @@ public abstract class AlterJob implements Writable { this.finishedTime = -1L; this.cancelMsg = ""; - this.backendIdToReplicaIds = HashMultimap.create(); } @@ -138,6 +141,10 @@ public abstract class AlterJob implements Writable { public final long getTableId() { return tableId; } + + public final long getTransactionId() { + return transactionId; + } public final long getCreateTimeMs() { return this.createTime; @@ -154,18 +161,6 @@ public abstract class AlterJob implements Writable { public final synchronized String getMsg() { return this.cancelMsg; } - - public synchronized void handleBackendRemoveEvent(long backendId) { - if (this.backendIdToReplicaIds.containsKey(backendId)) { - LOG.warn("{} job[{}] is handling backend[{}] removed event", type, tableId, backendId); - Set replicaIds = Sets.newHashSet(this.backendIdToReplicaIds.get(backendId)); - for (Long replicaId : replicaIds) { - LOG.debug("remove replica[{}] from {} job[{}] cause backend[{}] removed", - replicaId, type, tableId, backendId); - directRemoveReplicaTask(replicaId, backendId); - } - } - } public boolean isTimeout() { // 0 means never timeout @@ -183,24 +178,22 @@ public abstract class AlterJob implements Writable { * otherwise, * alter job will not perceived backend's down event during job created and first handle round. 
*/ - public boolean checkBackendState(Replica replica) { + protected boolean checkBackendState(Replica replica) { LOG.debug("check backend[{}] state for replica[{}]", replica.getBackendId(), replica.getId()); Backend backend = Catalog.getCurrentSystemInfo().getBackend(replica.getBackendId()); + // not send event to event bus because there is a dead lock, job --> check state --> bus lock --> handle backend down + // backenddown --> bus lock --> handle backend down --> job.lock if (backend == null) { - Catalog.getCurrentSystemInfo().getEventBus() - .post(new BackendEvent(BackendEventType.BACKEND_DROPPED, "does not found", - Long.valueOf(replica.getBackendId()))); return false; } else if (!backend.isAlive()) { - Catalog.getCurrentSystemInfo().getEventBus() - .post(new BackendEvent(BackendEventType.BACKEND_DOWN, "is not alive", - Long.valueOf(replica.getBackendId()))); - return false; + long currentTime = System.currentTimeMillis(); + if (currentTime - backend.getLastUpdateMs() > Config.max_backend_down_time_second * 1000) { + // this backend is done for a long time and not restart automatically. + // we consider it as dead + return false; + } + return true; } else if (backend.isDecommissioned()) { - Catalog.getCurrentSystemInfo().getEventBus() - .post(new BackendEvent(BackendEventType.BACKEND_DECOMMISSION, - "is decommissioned", - Long.valueOf(replica.getBackendId()))); return false; } @@ -250,11 +243,6 @@ public abstract class AlterJob implements Writable { */ public abstract void removeReplicaRelatedTask(long parentId, long tabletId, long replicaId, long backendId); - /* - * remove task directly - */ - public abstract void directRemoveReplicaTask(long replicaId, long backendId); - /* * handle replica finish task report */ @@ -278,12 +266,26 @@ public abstract class AlterJob implements Writable { * replay methods * corresponding to start/finished/cancelled */ - public abstract void unprotectedReplayInitJob(Database db); + public abstract void replayInitJob(Database db); + + public abstract void replayFinishing(Database db); - public abstract void unprotectedReplayFinish(Database db); + public abstract void replayFinish(Database db); - public abstract void unprotectedReplayCancel(Database db); + public abstract void replayCancel(Database db); + public abstract void getJobInfo(List> jobInfos, OlapTable tbl); + + public boolean checkPreviousLoadFinished() { + if (hasPreviousLoadFinished) { + return true; + } else { + hasPreviousLoadFinished = Catalog.getCurrentGlobalTransactionMgr() + .hasPreviousTransactionsFinished(transactionId, dbId); + return hasPreviousLoadFinished; + } + } + @Override public synchronized void readFields(DataInput in) throws IOException { // read common members as write in AlterJob.write(). 
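The checkPreviousLoadFinished() helper above is the core of the new FINISHING state: an alter job records the transaction id that was current when it was created, and it may only leave FINISHING once every load transaction with a smaller id has been published. A self-contained sketch of that gate follows, using a single published-transaction watermark in place of GlobalTransactionMgr; every name other than transactionId and checkPreviousLoadFinished() is illustrative and not part of this patch.

    // Sketch only: models AlterJob.transactionId + checkPreviousLoadFinished()
    // with a simple watermark instead of GlobalTransactionMgr.
    import java.util.concurrent.atomic.AtomicLong;

    class FinishingGateSketch {
        enum JobState { RUNNING, FINISHING, FINISHED }

        static final AtomicLong txnIdGenerator = new AtomicLong(0);
        static volatile long lastPublishedTxnId = 0;            // highest published load txn id

        static class Job {
            final long transactionId = txnIdGenerator.incrementAndGet(); // recorded at job creation
            JobState state = JobState.FINISHING;

            boolean checkPreviousLoadFinished() {
                // every load that started before this job has a smaller txn id
                return lastPublishedTxnId >= transactionId - 1;
            }

            void tryFinish() {
                if (state == JobState.FINISHING && checkPreviousLoadFinished()) {
                    state = JobState.FINISHED;                  // safe to clear alter state on BEs
                }
            }
        }

        public static void main(String[] args) {
            long loadTxn = txnIdGenerator.incrementAndGet();    // a load that started first
            Job job = new Job();                                // alter job starts later
            job.tryFinish();
            System.out.println(job.state);                      // FINISHING: earlier load not published
            lastPublishedTxnId = loadTxn;                       // the earlier load gets published
            job.tryFinish();
            System.out.println(job.state);                      // FINISHED
        }
    }

In the patch itself this gate is evaluated from the handler's run loop, which then sends the clear-alter tasks and logs the finished job before moving it out of the active job map.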
@@ -307,6 +309,9 @@ public abstract class AlterJob implements Writable { String group = Text.readString(in); resourceInfo = new TResourceInfo(user, group); } + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_45) { + transactionId = in.readLong(); + } } @Override @@ -332,5 +337,8 @@ public abstract class AlterJob implements Writable { Text.writeString(out, resourceInfo.getUser()); Text.writeString(out, resourceInfo.getGroup()); } + + out.writeLong(transactionId); + } } diff --git a/fe/src/main/java/com/baidu/palo/alter/DecommissionBackendJob.java b/fe/src/main/java/com/baidu/palo/alter/DecommissionBackendJob.java index 304277164e..6c4fd82620 100644 --- a/fe/src/main/java/com/baidu/palo/alter/DecommissionBackendJob.java +++ b/fe/src/main/java/com/baidu/palo/alter/DecommissionBackendJob.java @@ -15,19 +15,6 @@ package com.baidu.palo.alter; -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.commons.lang.NotImplementedException; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Database; import com.baidu.palo.catalog.MaterializedIndex; @@ -43,22 +30,36 @@ import com.baidu.palo.catalog.TabletMeta; import com.baidu.palo.clone.Clone; import com.baidu.palo.clone.CloneJob.JobPriority; import com.baidu.palo.cluster.Cluster; -import com.baidu.palo.persist.BackendIdsUpdateInfo; import com.baidu.palo.common.Config; import com.baidu.palo.common.DdlException; import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.MetaNotFoundException; import com.baidu.palo.common.io.Text; +import com.baidu.palo.persist.BackendIdsUpdateInfo; import com.baidu.palo.system.Backend; import com.baidu.palo.system.Backend.BackendState; import com.baidu.palo.system.SystemInfoService; import com.baidu.palo.task.AgentTask; import com.baidu.palo.thrift.TTabletInfo; + import com.google.common.base.Joiner; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; +import org.apache.commons.lang.NotImplementedException; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + public class DecommissionBackendJob extends AlterJob { public enum DecommissionType { @@ -343,11 +344,6 @@ public class DecommissionBackendJob extends AlterJob { throw new NotImplementedException(); } - @Override - public void directRemoveReplicaTask(long replicaId, long backendId) { - throw new NotImplementedException(); - } - @Override public synchronized void handleFinishedReplica(AgentTask task, TTabletInfo finishTabletInfo, long reportVersion) throws MetaNotFoundException { @@ -533,17 +529,27 @@ public class DecommissionBackendJob extends AlterJob { } @Override - public void unprotectedReplayInitJob(Database db) { + public void replayInitJob(Database db) { + // do nothing + } + + @Override + public void replayFinishing(Database db) { + // do nothing + } + + @Override + public void replayFinish(Database db) { // do nothing } @Override - public void unprotectedReplayFinish(Database db) { + public void replayCancel(Database db) { // do nothing } 
@Override - public void unprotectedReplayCancel(Database db) { + public void getJobInfo(List> jobInfos, OlapTable tbl) { // do nothing } @@ -617,5 +623,4 @@ public class DecommissionBackendJob extends AlterJob { decommissionBackendJob.readFields(in); return decommissionBackendJob; } - } diff --git a/fe/src/main/java/com/baidu/palo/alter/RollupHandler.java b/fe/src/main/java/com/baidu/palo/alter/RollupHandler.java index 177dbdbed5..de0a3bc2a9 100644 --- a/fe/src/main/java/com/baidu/palo/alter/RollupHandler.java +++ b/fe/src/main/java/com/baidu/palo/alter/RollupHandler.java @@ -45,7 +45,6 @@ import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.util.ListComparator; import com.baidu.palo.common.util.PropertyAnalyzer; -import com.baidu.palo.common.util.TimeUtils; import com.baidu.palo.common.util.Util; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.persist.DropInfo; @@ -60,6 +59,7 @@ import com.baidu.palo.thrift.TStorageType; import com.google.common.base.Preconditions; import com.google.common.base.Strings; +import com.google.common.collect.Lists; import com.google.common.collect.Sets; import org.apache.logging.log4j.LogManager; @@ -67,11 +67,9 @@ import org.apache.logging.log4j.Logger; import java.util.ArrayList; import java.util.Collections; -import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Map.Entry; import java.util.Set; public class RollupHandler extends AlterHandler { @@ -83,12 +81,12 @@ public class RollupHandler extends AlterHandler { private void processAddRollup(AddRollupClause alterClause, Database db, OlapTable olapTable, boolean isRestore) throws DdlException { - + if (!isRestore) { - if (olapTable.getState() == OlapTableState.ROLLUP) { + // table is under rollup or has a finishing alter job + if (olapTable.getState() == OlapTableState.ROLLUP || this.hasUnfinishedAlterJob(olapTable.getId())) { throw new DdlException("Table[" + olapTable.getName() + "]'s is under ROLLUP"); } - // up to here, table's state can only be NORMAL Preconditions.checkState(olapTable.getState() == OlapTableState.NORMAL, olapTable.getState().name()); } @@ -319,10 +317,12 @@ public class RollupHandler extends AlterHandler { Catalog catalog = Catalog.getInstance(); long rollupIndexId = catalog.getNextId(); + + long transactionId = Catalog.getCurrentGlobalTransactionMgr().getTransactionIDGenerator().getNextTransactionId(); RollupJob rollupJob = new RollupJob(dbId, tableId, baseIndexId, rollupIndexId, baseIndexName, rollupIndexName, rollupSchema, baseSchemaHash, rollupSchemaHash, rollupStorageType, - rollupShortKeyColumnCount, resourceInfo, rollupKeysType); + rollupShortKeyColumnCount, resourceInfo, rollupKeysType, transactionId); for (Partition partition : olapTable.getPartitions()) { long partitionId = partition.getId(); @@ -348,17 +348,25 @@ public class RollupHandler extends AlterHandler { for (Replica baseReplica : baseReplicas) { long rollupReplicaId = catalog.getNextId(); long backendId = baseReplica.getBackendId(); - if (baseReplica.getState() == ReplicaState.CLONE) { + if (baseReplica.getState() == ReplicaState.CLONE + || baseReplica.getLastFailedVersion() > 0) { // just skip it. 
continue; } Preconditions.checkState(baseReplica.getState() == ReplicaState.NORMAL); ++replicaNum; - + // the new replica's init version is -1 until finished history rollup Replica rollupReplica = new Replica(rollupReplicaId, backendId, ReplicaState.ROLLUP); + // new replica's last failed version is equal to the partition's next version - 1 + // has to set failed verison and version hash here, because there will be no load after rollup + // so that if not set here, last failed version will not be set + rollupReplica.updateVersionInfo(rollupReplica.getVersion(), rollupReplica.getVersionHash(), + partition.getCurrentVersion(), partition.getCurrentVersionHash(), + rollupReplica.getLastSuccessVersion(), rollupReplica.getLastSuccessVersionHash()); if (isRestore) { rollupReplica.setState(ReplicaState.NORMAL); } + // yiguolei: the rollup tablet's replica num maybe less than base tablet's replica num newTablet.addReplica(rollupReplica); } // end for baseReplica @@ -433,7 +441,16 @@ public class RollupHandler extends AlterHandler { throw new DdlException("Rollup index[" + ((RollupJob) alterJob).getRollupIndexName() + "] is doing rollup based on this index[" + rollupIndexName + "] and not finished yet."); } - + + // if the index is a during rollup and in finishing state, then it could not be dropped + // because the finishing state could not be roll back, it is very difficult + alterJob = getAlterJob(tableId); + if (alterJob != null && ((RollupJob) alterJob).getRollupIndexName().equals(rollupIndexName) + && alterJob.getState() == JobState.FINISHING) { + throw new DdlException("Rollup index[" + rollupIndexName + "] in table[" + + olapTable.getName() + "] is in finishing state, waiting it to finish"); + } + // drop rollup for each partition long rollupIndexId = olapTable.getIndexIdByName(rollupIndexName); int rollupSchemaHash = olapTable.getSchemaHashByIndexId(rollupIndexId); @@ -521,195 +538,167 @@ public class RollupHandler extends AlterHandler { // this is for handle delete replica op private AlterJob checkIfAnyRollupBasedOn(long tableId, long baseIndexId) { - this.jobsLock.readLock().lock(); - try { - AlterJob alterJob = this.alterJobs.get(tableId); - if (alterJob != null && ((RollupJob) alterJob).getBaseIndexId() == baseIndexId) { - return alterJob; - } - return null; - } finally { - this.jobsLock.readLock().unlock(); + AlterJob alterJob = this.alterJobs.get(tableId); + if (alterJob != null && ((RollupJob) alterJob).getBaseIndexId() == baseIndexId) { + return alterJob; } + return null; } // this is for drop rollup op private AlterJob checkIfAnyRollupBasedOn(long tableId, String baseIndexName) { - this.jobsLock.readLock().lock(); - try { - AlterJob alterJob = this.alterJobs.get(tableId); - if (alterJob != null && ((RollupJob) alterJob).getBaseIndexName().equals(baseIndexName)) { - return alterJob; - } - return null; - } finally { - this.jobsLock.readLock().unlock(); + AlterJob alterJob = this.alterJobs.get(tableId); + if (alterJob != null && ((RollupJob) alterJob).getBaseIndexName().equals(baseIndexName)) { + return alterJob; } - } - - private void getJobInfo(List> rollupJobInfos, - RollupJob rollupJob, Database db) { - if (rollupJob.getDbId() != db.getId()) { - return; - } - - OlapTable olapTable = (OlapTable) db.getTable(rollupJob.getTableId()); - if (olapTable == null) { - return; - } - - // check auth - if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), db.getFullName(), - olapTable.getName(), - PrivPredicate.ALTER)) { - // no priv, return - LOG.debug("No priv for 
user {} to table {}.{}", ConnectContext.get().getQualifiedUser(), - ConnectContext.get().getRemoteIP(), db.getFullName(), olapTable.getName()); - return; - } - - List jobInfo = new ArrayList(); - - // job id - jobInfo.add(rollupJob.getTableId()); - - // table name - jobInfo.add(olapTable.getName()); - - // create time - long createTime = rollupJob.getCreateTimeMs(); - jobInfo.add(TimeUtils.longToTimeString(createTime)); - - long finishedTime = rollupJob.getFinishedTime(); - jobInfo.add(TimeUtils.longToTimeString(finishedTime)); - - // base index and rollup index name - jobInfo.add(rollupJob.getBaseIndexName()); - jobInfo.add(rollupJob.getRollupIndexName()); - - // job state - jobInfo.add(rollupJob.getState().name()); - - // msg - jobInfo.add(rollupJob.getMsg()); - - // progress - if (rollupJob.getState() == JobState.PENDING) { - jobInfo.add("0%"); - } else if (rollupJob.getState() == JobState.RUNNING) { - int unfinishedReplicaNum = rollupJob.getUnfinishedReplicaNum(); - int totalReplicaNum = rollupJob.getTotalReplicaNum(); - Preconditions.checkState(unfinishedReplicaNum <= totalReplicaNum); - jobInfo.add(((totalReplicaNum - unfinishedReplicaNum) * 100 / totalReplicaNum) + "%"); - } else { - jobInfo.add("N/A"); - } - - rollupJobInfos.add(jobInfo); - return; + return null; } @Override protected void runOneCycle() { super.runOneCycle(); + List cancelledJobs = Lists.newArrayList(); + List finishedJobs = Lists.newArrayList(); - List cancelledJobs = new LinkedList(); - this.jobsLock.writeLock().lock(); - try { - Iterator> iterator = this.alterJobs.entrySet().iterator(); - while (iterator.hasNext()) { - Entry entry = iterator.next(); - AlterJob rollupJob = entry.getValue(); - - JobState state = rollupJob.getState(); - switch (state) { - case PENDING: { - // if rollup job's status is PENDING, we need to send tasks. - if (!rollupJob.sendTasks()) { - cancelledJobs.add(rollupJob); - LOG.warn("sending rollup job[" + rollupJob.getTableId() + "] tasks failed. cancel it."); - } - break; + for (AlterJob alterJob : alterJobs.values()) { + RollupJob rollupJob = (RollupJob) alterJob; + if (rollupJob.getTransactionId() < 0) { + // it means this is an old type job and current version is real time load version + // then kill this job + cancelledJobs.add(rollupJob); + continue; + } + JobState state = rollupJob.getState(); + switch (state) { + case PENDING: { + // if rollup job's status is PENDING, we need to send tasks. + if (!rollupJob.sendTasks()) { + cancelledJobs.add(rollupJob); + LOG.warn("sending rollup job[" + rollupJob.getTableId() + "] tasks failed. 
cancel it."); } - case RUNNING: { - if (rollupJob.isTimeout()) { - cancelledJobs.add(rollupJob); - } else { - int res = rollupJob.tryFinishJob(); - if (res == -1) { - // cancel rollup - cancelledJobs.add(rollupJob); - LOG.warn("cancel rollup[{}] cause bad rollup job[{}]", - ((RollupJob) rollupJob).getRollupIndexName(), rollupJob.getTableId()); - } - } - break; - } - case FINISHED: { - // remove from alterJobs - iterator.remove(); - addFinishedOrCancelledAlterJob(rollupJob); - break; - } - case CANCELLED: { - // all CANCELLED state should be handled immediately - Preconditions.checkState(false); - break; - } - default: - Preconditions.checkState(false); - break; + break; } - } // end for jobs - } finally { - this.jobsLock.writeLock().unlock(); - } + case RUNNING: { + if (rollupJob.isTimeout()) { + cancelledJobs.add(rollupJob); + } else { + int res = rollupJob.tryFinishJob(); + if (res == -1) { + // cancel rollup + cancelledJobs.add(rollupJob); + LOG.warn("cancel rollup[{}] cause bad rollup job[{}]", + ((RollupJob) rollupJob).getRollupIndexName(), rollupJob.getTableId()); + } + } + break; + } + case FINISHING: { + // check previous load job finished + if (rollupJob.checkPreviousLoadFinished()) { + // if all previous load job finished, then send clear alter tasks to all related be + int res = rollupJob.checkOrResendClearTasks(); + if (res != 0) { + if (res == -1) { + LOG.warn("rollup job is in finishing state, but could not finished, " + + "just finish it, maybe a fatal error {}", rollupJob); + } + finishedJobs.add(rollupJob); + } + } + break; + } + case FINISHED: { + break; + } + case CANCELLED: { + // the alter job could be cancelled in 3 ways + // 1. the table or db is dropped + // 2. user cancels the job + // 3. the job meets errors when running + // for the previous 2 scenarios, user will call jobdone to finish the job and set its state to cancelled + // so that there exists alter job whose state is cancelled + // for the third scenario, the thread will add to cancelled job list and will be dealt by call jobdone + // Preconditions.checkState(false); + break; + } + default: + Preconditions.checkState(false); + break; + } + } // end for jobs // handle cancelled rollup jobs for (AlterJob rollupJob : cancelledJobs) { Database db = Catalog.getInstance().getDb(rollupJob.getDbId()); if (db == null) { cancelInternal(rollupJob, null, null); + continue; } + db.writeLock(); try { OlapTable olapTable = (OlapTable) db.getTable(rollupJob.getTableId()); - cancelInternal(rollupJob, olapTable, null); + rollupJob.cancel(olapTable, "cancelled"); } finally { db.writeUnlock(); } + jobDone(rollupJob); + } + + // handle finished rollup jobs + for (AlterJob alterJob : finishedJobs) { + alterJob.setState(JobState.FINISHED); + // remove from alterJobs. + // has to remove here, because the job maybe finished and it still in alter job list, + // then user could submit schema change task, and auto load to two table flag will be set false. + // then schema change job will be failed. 
+ jobDone(alterJob); + Catalog.getInstance().getEditLog().logFinishRollup((RollupJob) alterJob); } } @Override public List> getAlterJobInfosByDb(Database db) { List> rollupJobInfos = new LinkedList>(); + List jobs = Lists.newArrayList(); + + // lock to perform atomically + lock(); + try { + for (AlterJob alterJob : this.alterJobs.values()) { + if (alterJob.getDbId() == db.getId()) { + jobs.add(alterJob); + } + } + + for (AlterJob alterJob : this.finishedOrCancelledAlterJobs) { + if (alterJob.getDbId() == db.getId()) { + jobs.add(alterJob); + } + } + } finally { + unlock(); + } + db.readLock(); try { - long dbId = db.getId(); - this.jobsLock.readLock().lock(); - try { - for (AlterJob alterJob : this.alterJobs.values()) { - getJobInfo(rollupJobInfos, (RollupJob) alterJob, db); + for (AlterJob selectedJob : jobs) { + OlapTable olapTable = (OlapTable) db.getTable(selectedJob.getTableId()); + if (olapTable == null) { + continue; } - for (AlterJob alterJob : this.finishedOrCancelledAlterJobs) { - getJobInfo(rollupJobInfos, (RollupJob) alterJob, db); - } - - // sort by - // "JobId", "TableName", "CreateTime", "FinishedTime", "BaseIndexName", "RollupIndexName" - ListComparator> comparator = new ListComparator>(0, 1, 2, 3, 4, 5); - Collections.sort(rollupJobInfos, comparator); - - } catch (Exception e) { - LOG.warn("failed to get rollup job info.", e); - } finally { - this.jobsLock.readLock().unlock(); + selectedJob.getJobInfo(rollupJobInfos, olapTable); } } finally { db.readUnlock(); } + + // sort by + // "JobId", "TableName", "CreateTime", "FinishedTime", "BaseIndexName", "RollupIndexName" + ListComparator> comparator = new ListComparator>(0, 1, 2, 3, 4, 5); + Collections.sort(rollupJobInfos, comparator); + return rollupJobInfos; } @@ -746,6 +735,7 @@ public class RollupHandler extends AlterHandler { ErrorReport.reportDdlException(ErrorCode.ERR_BAD_DB_ERROR, dbName); } + AlterJob rollupJob = null; db.writeLock(); try { Table table = db.getTable(tableName); @@ -761,12 +751,13 @@ public class RollupHandler extends AlterHandler { + "Use 'ALTER TABLE DROP ROLLUP' if you want to."); } - AlterJob rollupJob = getAlterJob(olapTable.getId()); + rollupJob = getAlterJob(olapTable.getId()); Preconditions.checkNotNull(rollupJob); - - cancelInternal(rollupJob, olapTable, "user cancelled"); + rollupJob.cancel(olapTable, "user cancelled"); } finally { db.writeUnlock(); } + + jobDone(rollupJob); } } diff --git a/fe/src/main/java/com/baidu/palo/alter/RollupJob.java b/fe/src/main/java/com/baidu/palo/alter/RollupJob.java index 2c085d3570..1dec46e5a7 100644 --- a/fe/src/main/java/com/baidu/palo/alter/RollupJob.java +++ b/fe/src/main/java/com/baidu/palo/alter/RollupJob.java @@ -29,13 +29,16 @@ import com.baidu.palo.catalog.Replica.ReplicaState; import com.baidu.palo.catalog.Tablet; import com.baidu.palo.catalog.TabletInvertedIndex; import com.baidu.palo.catalog.TabletMeta; +import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.MetaNotFoundException; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.util.ListComparator; -import com.baidu.palo.load.Load; +import com.baidu.palo.common.util.TimeUtils; import com.baidu.palo.persist.ReplicaPersistInfo; +import com.baidu.palo.task.AgentBatchTask; import com.baidu.palo.task.AgentTask; import com.baidu.palo.task.AgentTaskQueue; +import com.baidu.palo.task.ClearAlterTask; import com.baidu.palo.task.CreateRollupTask; import com.baidu.palo.thrift.TKeysType; import com.baidu.palo.thrift.TResourceInfo; @@ -47,6 +50,7 @@ import 
com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.HashMultimap; import com.google.common.collect.LinkedHashMultimap; +import com.google.common.collect.Lists; import com.google.common.collect.Multimap; import org.apache.logging.log4j.LogManager; @@ -60,12 +64,10 @@ import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.concurrent.TimeUnit; public class RollupJob extends AlterJob { private static final Logger LOG = LogManager.getLogger(RollupJob.class); @@ -112,10 +114,14 @@ public class RollupJob extends AlterJob { this.finishedPartitionIds = new HashSet(); } + // yiguolei: every job has a transactionid to identify the occurrent time, for example + // a load job's transactionid is 10 and a rollup job's transaction id is 12, then we could + // find load job is occurred before rollup job public RollupJob(long dbId, long tableId, long baseIndexId, long rollupIndexId, String baseIndexName, String rollupIndexName, List rollupSchema, int baseSchemaHash, int rollupSchemaHash, TStorageType rollupStorageType, - short rollupShortKeyColumnCount, TResourceInfo resourceInfo, TKeysType rollupKeysType) { + short rollupShortKeyColumnCount, TResourceInfo resourceInfo, TKeysType rollupKeysType, + long transactionId) { super(JobType.ROLLUP, dbId, tableId, resourceInfo); // rollup and base info @@ -141,6 +147,8 @@ public class RollupJob extends AlterJob { this.partitionIdToReplicaInfos = LinkedHashMultimap.create(); this.finishedPartitionIds = new HashSet(); + + this.transactionId = transactionId; } public final long getBaseIndexId() { @@ -276,11 +284,15 @@ public class RollupJob extends AlterJob { return tabletInfos; } + + public synchronized MaterializedIndex getRollupIndex(long partitionId) { + MaterializedIndex index = this.partitionIdToRollupIndex.get(partitionId); + return index; + } @Override public synchronized void addReplicaId(long parentId, long replicaId, long backendId) { this.partitionIdToUnfinishedReplicaIds.put(parentId, replicaId); - this.backendIdToReplicaIds.put(backendId, replicaId); ++this.totalReplicaNum; } @@ -296,7 +308,86 @@ public class RollupJob extends AlterJob { this.partitionIdToUnfinishedReplicaIds.get(parentId).remove(replicaId); } } + + public int checkOrResendClearTasks() { + Preconditions.checkState(this.state == JobState.FINISHING); + // 1. 
check if all task finished + boolean clearFailed = false; + if (batchClearAlterTask != null) { + List allTasks = batchClearAlterTask.getAllTasks(); + for (AgentTask oneClearAlterTask : allTasks) { + ClearAlterTask clearAlterTask = (ClearAlterTask) oneClearAlterTask; + if (!clearAlterTask.isFinished()) { + clearFailed = true; + } + AgentTaskQueue.removeTask(clearAlterTask.getBackendId(), + TTaskType.CLEAR_ALTER_TASK, clearAlterTask.getSignature()); + } + } + if (!clearFailed && batchClearAlterTask != null) { + return 1; + } + Database db = Catalog.getInstance().getDb(dbId); + if (db == null) { + String msg = "db[" + dbId + "] does not exist"; + setMsg(msg); + LOG.warn(msg); + return -1; + } + batchClearAlterTask = new AgentBatchTask(); + db.readLock(); + try { + synchronized (this) { + OlapTable olapTable = (OlapTable) db.getTable(tableId); + if (olapTable == null) { + cancelMsg = "table[" + tableId + "] does not exist"; + LOG.warn(cancelMsg); + return -1; + } + boolean allAddSuccess = true; + LOG.info("sending clear rollup job tasks for table [{}]", tableId); + for (Partition partition : olapTable.getPartitions()) { + long partitionId = partition.getId(); + // has to use rollup base index, could not use partition.getBaseIndex() + // because the rollup index could be created based on another rollup index + MaterializedIndex baseIndex = partition.getIndex(this.getBaseIndexId()); + for (Tablet baseTablet : baseIndex.getTablets()) { + long baseTabletId = baseTablet.getId(); + List baseReplicas = baseTablet.getReplicas(); + for (Replica baseReplica : baseReplicas) { + long backendId = baseReplica.getBackendId(); + ClearAlterTask clearRollupTask = new ClearAlterTask(backendId, dbId, tableId, + partitionId, baseIndexId, baseTabletId, baseSchemaHash); + if (AgentTaskQueue.addTask(clearRollupTask)) { + batchClearAlterTask.addTask(clearRollupTask); + } else { + allAddSuccess = false; + break; + } + } // end for rollupReplicas + if (!allAddSuccess) { + break; + } + } // end for rollupTablets + if (!allAddSuccess) { + break; + } + } + if (!allAddSuccess) { + for (AgentTask task : batchClearAlterTask.getAllTasks()) { + AgentTaskQueue.removeTask(task.getBackendId(), task.getTaskType(), task.getSignature()); + } + batchClearAlterTask = null; + } + } + } finally { + db.readUnlock(); + } + LOG.info("successfully sending clear rollup job[{}]", tableId); + return 0; + } + @Override public boolean sendTasks() { Preconditions.checkState(this.state == JobState.PENDING); @@ -305,15 +396,12 @@ public class RollupJob extends AlterJob { Database db = Catalog.getInstance().getDb(dbId); if (db == null) { - String msg = "db[" + dbId + "] does not exist"; - setMsg(msg); - LOG.warn(msg); + cancelMsg = "db[" + dbId + "] does not exist"; + LOG.warn(cancelMsg); return false; } - if (!db.tryReadLock(Database.TRY_LOCK_TIMEOUT_MS, TimeUnit.MILLISECONDS)) { - return true; - } + db.readLock(); try { synchronized (this) { OlapTable olapTable = (OlapTable) db.getTable(tableId); @@ -448,20 +536,13 @@ public class RollupJob extends AlterJob { LOG.debug("can not find rollup replica in rollup tablet[{}]. backend[{}]", rollupTabletId, backendId); return; } - + LOG.debug("remove replica {} from backend {}", rollupReplica, backendId, new Exception()); setReplicaFinished(parentId, rollupReplica.getId()); - this.backendIdToReplicaIds.get(backendId).remove(rollupReplica.getId()); // 4. 
remove task AgentTaskQueue.removeTask(backendId, TTaskType.ROLLUP, rollupTabletId); } - @Override - public synchronized void directRemoveReplicaTask(long replicaId, long backendId) { - setReplicaFinished(-1L, replicaId); - this.backendIdToReplicaIds.get(backendId).remove(replicaId); - } - @Override public synchronized void handleFinishedReplica(AgentTask task, TTabletInfo finishTabletInfo, long reportVersion) throws MetaNotFoundException { @@ -501,6 +582,9 @@ public class RollupJob extends AlterJob { long versionHash = finishTabletInfo.getVersion_hash(); long dataSize = finishTabletInfo.getData_size(); long rowCount = finishTabletInfo.getRow_count(); + // yiguolei: not check version here because the replica's first version will be set by rollup job + // the version is not set now + // the finish task thread doesn't own db lock here, maybe a bug? rollupReplica.updateInfo(version, versionHash, dataSize, rowCount); setReplicaFinished(partitionId, rollupReplicaId); @@ -513,26 +597,24 @@ public class RollupJob extends AlterJob { @Override public int tryFinishJob() { if (this.state != JobState.RUNNING) { - LOG.info("rollup job[{}] is not running.", tableId); + LOG.info("rollup job[{}] is not running or finishing.", tableId); return 0; } Database db = Catalog.getInstance().getDb(dbId); if (db == null) { - String msg = "db[" + dbId + "] does not exist"; - setMsg(msg); - LOG.warn(msg); + cancelMsg = "Db[" + dbId + "] does not exist"; + LOG.warn(cancelMsg); return -1; } - if (!db.tryWriteLock(Database.TRY_LOCK_TIMEOUT_MS, TimeUnit.MILLISECONDS)) { - return 0; - } + db.writeLock(); try { + // if all previous transaction has finished, then check base and rollup replica num synchronized (this) { OlapTable olapTable = (OlapTable) db.getTable(tableId); if (olapTable == null) { - cancelMsg = "db[" + dbId + "] does not exist"; + cancelMsg = "Table[" + tableId + "] does not exist"; LOG.warn(cancelMsg); return -1; } @@ -545,57 +627,42 @@ public class RollupJob extends AlterJob { continue; } - // check if all tablets finished load job - // FIXME(cmy): this may cause endless check?? - Load load = Catalog.getInstance().getLoadInstance(); - if (!load.checkPartitionLoadFinished(partitionId, null)) { - LOG.debug("partition[{}] has unfinished load job", partitionId); - return 0; - } - - short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partition.getId()); - - // check version and versionHash - long committedVersion = partition.getCommittedVersion(); - long committedVersionHash = partition.getCommittedVersionHash(); + short expectReplicationNum = olapTable.getPartitionInfo().getReplicationNum(partition.getId()); MaterializedIndex rollupIndex = entry.getValue(); for (Tablet rollupTablet : rollupIndex.getTablets()) { - Iterator iterator = rollupTablet.getReplicas().iterator(); - boolean isCatchUp = true; - while (iterator.hasNext()) { - Replica replica = iterator.next(); - if (!this.backendIdToReplicaIds.get(replica.getBackendId()).contains(replica.getId())) { - // replica is dead, remove it - LOG.warn("rollup job[{}] find dead replica[{}] in backend[{}]. 
remove it", - tableId, replica.getId(), replica.getBackendId()); - iterator.remove(); - continue; - } - + // yiguolei: the rollup tablet only contains the replica that is healthy at rollup time + List replicas = rollupTablet.getReplicas(); + List errorReplicas = Lists.newArrayList(); + for (Replica replica : replicas) { if (!checkBackendState(replica)) { - continue; - } - - // check version - if (!replica.checkVersionCatchUp(committedVersion, committedVersionHash)) { - isCatchUp = false; - continue; + LOG.warn("backend {} state is abnormal, set replica {} as bad", replica.getBackendId(), + replica.getId()); + errorReplicas.add(replica); + } else if (replica.getLastFailedVersion() > 0 + && !partitionIdToUnfinishedReplicaIds.get(partitionId).contains(replica.getId())) { + // if the replica is finished history data, but failed during load, then it is a abnormal + // remove it from replica set + // have to use delete replica, it will remove it from tablet inverted index + LOG.warn("replica [{}] last failed version > 0 and have finished history rollup job, its a bad replica, remove it from rollup tablet", replica); + errorReplicas.add(replica); } } - if (rollupTablet.getReplicas().size() < (replicationNum / 2 + 1)) { - cancelMsg = String.format("rollup job[%d] cancelled. tablet[%d] has few replica." - + " num: %d", tableId, rollupTablet.getId(), rollupTablet.getReplicas().size()); + for (Replica errorReplica : errorReplicas) { + rollupTablet.deleteReplica(errorReplica); + setReplicaFinished(partitionId, errorReplica.getId()); + AgentTaskQueue.removeTask(errorReplica.getBackendId(), TTaskType.ROLLUP, rollupTablet.getId()); + } + + if (rollupTablet.getReplicas().size() < (expectReplicationNum / 2 + 1)) { + cancelMsg = String.format("rollup job[%d] cancelled. tablet[%d] has few health replica." 
+ + " num: %d", tableId, rollupTablet.getId(), replicas.size()); LOG.warn(cancelMsg); return -1; } - - if (!isCatchUp) { - return 0; - } } // end for tablets - // check if partition is finised + // check if partition is finished if (!this.partitionIdToUnfinishedReplicaIds.get(partitionId).isEmpty()) { LOG.debug("partition[{}] has unfinished rollup replica: {}", partitionId, this.partitionIdToUnfinishedReplicaIds.get(partitionId).size()); @@ -624,6 +691,7 @@ public class RollupJob extends AlterJob { // all partition is finished rollup // add rollup index to each partition + // if for (Partition partition : olapTable.getPartitions()) { long partitionId = partition.getId(); MaterializedIndex rollupIndex = this.partitionIdToRollupIndex.get(partitionId); @@ -637,7 +705,11 @@ public class RollupJob extends AlterJob { ReplicaPersistInfo replicaInfo = ReplicaPersistInfo.createForRollup(rollupIndexId, tabletId, replica.getBackendId(), replica.getVersion(), replica.getVersionHash(), - replica.getDataSize(), replica.getRowCount()); + replica.getDataSize(), replica.getRowCount(), + replica.getLastFailedVersion(), + replica.getLastFailedVersionHash(), + replica.getLastSuccessVersion(), + replica.getLastSuccessVersionHash()); this.partitionIdToReplicaInfos.put(partitionId, replicaInfo); } @@ -675,14 +747,15 @@ public class RollupJob extends AlterJob { olapTable.setState(OlapTableState.NORMAL); this.finishedTime = System.currentTimeMillis(); - this.state = JobState.FINISHED; + this.state = JobState.FINISHING; + this.transactionId = Catalog.getCurrentGlobalTransactionMgr().getTransactionIDGenerator().getNextTransactionId(); } } finally { db.writeUnlock(); } // log rollup done operation - Catalog.getInstance().getEditLog().logFinishRollup(this); + Catalog.getInstance().getEditLog().logFinishingRollup(this); LOG.info("rollup job[{}] done.", this.getTableId()); return 1; @@ -701,109 +774,178 @@ public class RollupJob extends AlterJob { } @Override - public void unprotectedReplayInitJob(Database db) { - // set state - TabletInvertedIndex invertedIndex = Catalog.getCurrentInvertedIndex(); - OlapTable olapTable = (OlapTable) db.getTable(tableId); - for (Map.Entry entry : this.partitionIdToRollupIndex.entrySet()) { - Partition partition = olapTable.getPartition(entry.getKey()); - partition.setState(PartitionState.ROLLUP); + public void replayInitJob(Database db) { + db.writeLock(); + try { + // set state + TabletInvertedIndex invertedIndex = Catalog.getCurrentInvertedIndex(); + OlapTable olapTable = (OlapTable) db.getTable(tableId); + for (Map.Entry entry : this.partitionIdToRollupIndex.entrySet()) { + Partition partition = olapTable.getPartition(entry.getKey()); + partition.setState(PartitionState.ROLLUP); + + if (!Catalog.isCheckpointThread()) { + MaterializedIndex rollupIndex = entry.getValue(); + TabletMeta tabletMeta = new TabletMeta(dbId, tableId, entry.getKey(), rollupIndexId, + rollupSchemaHash); + for (Tablet tablet : rollupIndex.getTablets()) { + long tabletId = tablet.getId(); + invertedIndex.addTablet(tabletId, tabletMeta); + for (Replica replica : tablet.getReplicas()) { + invertedIndex.addReplica(tabletId, replica); + } + } + } + } // end for partitions + olapTable.setState(OlapTableState.ROLLUP); + + // reset status to PENDING for resending the tasks in polling thread + this.state = JobState.PENDING; + } finally { + db.writeUnlock(); + } + } + + @Override + public void replayFinishing(Database db) { + db.writeLock(); + try { + OlapTable olapTable = (OlapTable) db.getTable(tableId); + for 
(Map.Entry entry : this.partitionIdToRollupIndex.entrySet()) { + long partitionId = entry.getKey(); + Partition partition = olapTable.getPartition(partitionId); + MaterializedIndex rollupIndex = entry.getValue(); + + long rollupRowCount = 0L; + for (Tablet tablet : rollupIndex.getTablets()) { + for (Replica replica : tablet.getReplicas()) { + replica.setState(ReplicaState.NORMAL); + } + + // calculate rollup index row count + long tabletRowCount = 0L; + for (Replica replica : tablet.getReplicas()) { + long replicaRowCount = replica.getRowCount(); + if (replicaRowCount > tabletRowCount) { + tabletRowCount = replicaRowCount; + } + } + rollupRowCount += tabletRowCount; + } + + rollupIndex.setRowCount(rollupRowCount); + rollupIndex.setState(IndexState.NORMAL); + + MaterializedIndex baseIndex = partition.getIndex(baseIndexId); + if (baseIndex != null) { + baseIndex.setRollupIndexInfo(rollupIndexId, partition.getCommittedVersion()); + } + + partition.createRollupIndex(rollupIndex); + partition.setState(PartitionState.NORMAL); + + // Update database information + Collection replicaInfos = partitionIdToReplicaInfos.get(partitionId); + if (replicaInfos != null) { + for (ReplicaPersistInfo info : replicaInfos) { + MaterializedIndex mIndex = partition.getIndex(info.getIndexId()); + Tablet tablet = mIndex.getTablet(info.getTabletId()); + Replica replica = tablet.getReplicaByBackendId(info.getBackendId()); + replica.updateVersionInfo(info.getVersion(), info.getVersionHash(), + info.getLastFailedVersion(), + info.getLastFailedVersionHash(), + info.getLastSuccessVersion(), + info.getLastSuccessVersionHash()); + } + } + } + + olapTable.setIndexSchemaInfo(rollupIndexId, rollupIndexName, rollupSchema, 0, + rollupSchemaHash, rollupShortKeyColumnCount); + olapTable.setStorageTypeToIndex(rollupIndexId, rollupStorageType); + olapTable.setState(OlapTableState.NORMAL); + } finally { + db.writeUnlock(); + } + } + + @Override + public void replayFinish(Database db) { + // if this is an old job, then should also update table or replica's state + if (transactionId < 0) { + replayFinishing(db); + } + } + + @Override + public void replayCancel(Database db) { + db.writeLock(); + try { + OlapTable olapTable = (OlapTable) db.getTable(tableId); + if (olapTable == null) { + return; + } if (!Catalog.isCheckpointThread()) { - MaterializedIndex rollupIndex = entry.getValue(); - TabletMeta tabletMeta = new TabletMeta(dbId, tableId, entry.getKey(), rollupIndexId, rollupSchemaHash); - for (Tablet tablet : rollupIndex.getTablets()) { - long tabletId = tablet.getId(); - invertedIndex.addTablet(tabletId, tabletMeta); - for (Replica replica : tablet.getReplicas()) { - invertedIndex.addReplica(tabletId, replica); + // remove from inverted index + for (MaterializedIndex rollupIndex : partitionIdToRollupIndex.values()) { + for (Tablet tablet : rollupIndex.getTablets()) { + Catalog.getCurrentInvertedIndex().deleteTablet(tablet.getId()); } } } - } // end for partitions - olapTable.setState(OlapTableState.ROLLUP); - // reset status to PENDING for resending the tasks in polling thread - this.state = JobState.PENDING; + // set state + for (Partition partition : olapTable.getPartitions()) { + partition.setState(PartitionState.NORMAL); + } + olapTable.setState(OlapTableState.NORMAL); + } finally { + db.writeUnlock(); + } } @Override - public void unprotectedReplayFinish(Database db) { - OlapTable olapTable = (OlapTable) db.getTable(tableId); + public void getJobInfo(List> jobInfos, OlapTable tbl) { + List jobInfo = new ArrayList(); - for 
(Map.Entry entry : this.partitionIdToRollupIndex.entrySet()) { - long partitionId = entry.getKey(); - Partition partition = olapTable.getPartition(partitionId); - MaterializedIndex rollupIndex = entry.getValue(); + // job id + jobInfo.add(tableId); - long rollupRowCount = 0L; - for (Tablet tablet : rollupIndex.getTablets()) { - for (Replica replica : tablet.getReplicas()) { - replica.setState(ReplicaState.NORMAL); - } + // table name + jobInfo.add(tbl.getName()); - // calculate rollup index row count - long tabletRowCount = 0L; - for (Replica replica : tablet.getReplicas()) { - long replicaRowCount = replica.getRowCount(); - if (replicaRowCount > tabletRowCount) { - tabletRowCount = replicaRowCount; - } - } - rollupRowCount += tabletRowCount; - } + // transactionid + jobInfo.add(transactionId); - rollupIndex.setRowCount(rollupRowCount); - rollupIndex.setState(IndexState.NORMAL); + // create time + jobInfo.add(TimeUtils.longToTimeString(createTime)); - MaterializedIndex baseIndex = partition.getIndex(baseIndexId); - if (baseIndex != null) { - baseIndex.setRollupIndexInfo(rollupIndexId, partition.getCommittedVersion()); - } + jobInfo.add(TimeUtils.longToTimeString(finishedTime)); - partition.createRollupIndex(rollupIndex); - partition.setState(PartitionState.NORMAL); + // base index and rollup index name + jobInfo.add(baseIndexName); + jobInfo.add(rollupIndexName); - // Update database information - Collection replicaInfos = partitionIdToReplicaInfos.get(partitionId); - if (replicaInfos != null) { - for (ReplicaPersistInfo info : replicaInfos) { - MaterializedIndex mIndex = partition.getIndex(info.getIndexId()); - Tablet tablet = mIndex.getTablet(info.getTabletId()); - Replica replica = tablet.getReplicaByBackendId(info.getBackendId()); - replica.updateInfo(info.getVersion(), info.getVersionHash(), - info.getDataSize(), info.getRowCount()); - } - } + // job state + jobInfo.add(state.name()); + + // msg + jobInfo.add(cancelMsg); + + // progress + if (state == JobState.PENDING) { + jobInfo.add("0%"); + } else if (state == JobState.RUNNING) { + int unfinishedReplicaNum = getUnfinishedReplicaNum(); + int totalReplicaNum = getTotalReplicaNum(); + Preconditions.checkState(unfinishedReplicaNum <= totalReplicaNum); + jobInfo.add(((totalReplicaNum - unfinishedReplicaNum) * 100 / totalReplicaNum) + "%"); + } else { + jobInfo.add("N/A"); } - olapTable.setIndexSchemaInfo(rollupIndexId, rollupIndexName, rollupSchema, 0, - rollupSchemaHash, rollupShortKeyColumnCount); - olapTable.setStorageTypeToIndex(rollupIndexId, rollupStorageType); - olapTable.setState(OlapTableState.NORMAL); - } - - @Override - public void unprotectedReplayCancel(Database db) { - OlapTable olapTable = (OlapTable) db.getTable(tableId); - if (olapTable == null) { - return; - } - - if (!Catalog.isCheckpointThread()) { - // remove from inverted index - for (MaterializedIndex rollupIndex : partitionIdToRollupIndex.values()) { - for (Tablet tablet : rollupIndex.getTablets()) { - Catalog.getCurrentInvertedIndex().deleteTablet(tablet.getId()); - } - } - } - - // set state - for (Partition partition : olapTable.getPartitions()) { - partition.setState(PartitionState.NORMAL); - } - olapTable.setState(OlapTableState.NORMAL); + jobInfos.add(jobInfo); } @Override @@ -856,6 +998,13 @@ public class RollupJob extends AlterJob { out.writeShort(rollupShortKeyColumnCount); Text.writeString(out, rollupStorageType.name()); + // when upgrade from 3.2, rollupKeysType == null + if (rollupKeysType != null) { + out.writeBoolean(true); + Text.writeString(out, 
rollupKeysType.name()); + } else { + out.writeBoolean(false); + } } @Override @@ -908,6 +1057,12 @@ public class RollupJob extends AlterJob { rollupShortKeyColumnCount = in.readShort(); rollupStorageType = TStorageType.valueOf(Text.readString(in)); + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_45) { + boolean hasRollKeysType = in.readBoolean(); + if (hasRollKeysType) { + rollupKeysType = TKeysType.valueOf(Text.readString(in)); + } + } } public static RollupJob read(DataInput in) throws IOException { @@ -919,4 +1074,14 @@ public class RollupJob extends AlterJob { public boolean equals(Object obj) { return true; } + + @Override + public String toString() { + return "RollupJob [baseIndexId=" + baseIndexId + ", rollupIndexId=" + rollupIndexId + ", baseIndexName=" + + baseIndexName + ", rollupIndexName=" + rollupIndexName + ", rollupSchema=" + rollupSchema + + ", baseSchemaHash=" + baseSchemaHash + ", rollupSchemaHash=" + rollupSchemaHash + ", type=" + type + + ", state=" + state + ", dbId=" + dbId + ", tableId=" + tableId + ", transactionId=" + transactionId + + ", hasPreviousLoadFinished=" + hasPreviousLoadFinished + ", createTime=" + createTime + + ", finishedTime=" + finishedTime + "]"; + } } diff --git a/fe/src/main/java/com/baidu/palo/alter/SchemaChangeHandler.java b/fe/src/main/java/com/baidu/palo/alter/SchemaChangeHandler.java index 2d31e743ea..74b8d12a2c 100644 --- a/fe/src/main/java/com/baidu/palo/alter/SchemaChangeHandler.java +++ b/fe/src/main/java/com/baidu/palo/alter/SchemaChangeHandler.java @@ -52,7 +52,6 @@ import com.baidu.palo.common.DdlException; import com.baidu.palo.common.FeConstants; import com.baidu.palo.common.util.ListComparator; import com.baidu.palo.common.util.PropertyAnalyzer; -import com.baidu.palo.common.util.TimeUtils; import com.baidu.palo.common.util.Util; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -79,12 +78,8 @@ import java.util.Set; public class SchemaChangeHandler extends AlterHandler { private static final Logger LOG = LogManager.getLogger(SchemaChangeHandler.class); - // delay delete SchemaChangeJob list - private List delayDeleteSchemaChangeJobs; - public SchemaChangeHandler() { super("schema change"); - delayDeleteSchemaChangeJobs = new LinkedList(); } private void processAddColumn(AddColumnClause alterClause, OlapTable olapTable, @@ -680,28 +675,16 @@ public class SchemaChangeHandler extends AlterHandler { if (olapTable.getState() == OlapTableState.ROLLUP) { throw new DdlException("Table[" + olapTable.getName() + "]'s is doing ROLLUP job"); } - - // check delay deleting old schema - this.jobsLock.readLock().lock(); - try { - for (SchemaChangeJob schemaChangeJob : delayDeleteSchemaChangeJobs) { - if (schemaChangeJob.getTableId() == olapTable.getId()) { - long delayTime = System.currentTimeMillis() - schemaChangeJob.getFinishedTime(); - // add ' + this.getInterval() ' because there will be a delay causing by thread running interval - long leftTime = Config.alter_delete_base_delay_second * 1000 + this.getInterval() - delayTime; - throw new DdlException("Old schema is not deleted. 
wait " + (leftTime / 1000) - + " second(s) and try again"); - } - } - } finally { - this.jobsLock.readLock().unlock(); + + if (this.hasUnfinishedAlterJob(olapTable.getId())) { + throw new DdlException("Table[" + olapTable.getName() + "]'s is doing ALTER job"); } // for now table's state can only be NORMAL Preconditions.checkState(olapTable.getState() == OlapTableState.NORMAL, olapTable.getState().name()); // process properties first - // for now. properties has 2 option + // for now. properties has 2 options // property 1. to specify short key column count. // eg. // "indexname1#short_key" = "3" @@ -800,9 +783,10 @@ public class SchemaChangeHandler extends AlterHandler { resourceInfo = ConnectContext.get().toResourceCtx(); } + long transactionId = Catalog.getCurrentGlobalTransactionMgr().getTransactionIDGenerator().getNextTransactionId(); // create job SchemaChangeJob schemaChangeJob = new SchemaChangeJob(dbId, olapTable.getId(), resourceInfo, - olapTable.getName()); + olapTable.getName(), transactionId); schemaChangeJob.setTableBloomFilterInfo(hasBfChange, bfColumns, bfFpp); // begin checking each table // ATTN: DO NOT change any meta in this loop @@ -973,7 +957,8 @@ public class SchemaChangeHandler extends AlterHandler { for (Tablet tablet : alterIndex.getTablets()) { int replicaNum = 0; for (Replica replica : tablet.getReplicas()) { - if (replica.getState() == ReplicaState.CLONE) { + if (replica.getState() == ReplicaState.CLONE + || replica.getLastFailedVersion() > 0) { // just skip it (replica cloned from old schema will be deleted) continue; } @@ -1025,6 +1010,7 @@ public class SchemaChangeHandler extends AlterHandler { // to avoid partial check success // 1. create schema change job + int newSchemaHash = -1; for (Partition onePartition : olapTable.getPartitions()) { for (Map.Entry> entry : schemaChangeJob.getChangedIndexToSchema().entrySet()) { long indexId = entry.getKey(); @@ -1035,14 +1021,26 @@ public class SchemaChangeHandler extends AlterHandler { int currentSchemaVersion = olapTable.getSchemaVersionByIndexId(indexId); int newSchemaVersion = currentSchemaVersion + 1; List alterColumns = entry.getValue(); - int newSchemaHash = Util.schemaHash(newSchemaVersion, alterColumns, bfColumns, bfFpp); + // int newSchemaHash = Util.schemaHash(newSchemaVersion, alterColumns, bfColumns, bfFpp); + // new schema hash should only be generate one time, or the schema hash will differenent from each other in different partitions + if (newSchemaHash == -1) { + newSchemaHash = Util.generateSchemaHash(); + int currentSchemaHash = olapTable.getSchemaHashByIndexId(indexId); + // has to generate a new schema hash not equal to current schema hash + while (currentSchemaHash == newSchemaHash) { + newSchemaHash = Util.generateSchemaHash(); + } + } short newShortKeyColumnCount = indexIdToShortKeyColumnCount.get(indexId); schemaChangeJob.setNewSchemaInfo(indexId, newSchemaVersion, newSchemaHash, newShortKeyColumnCount); // set replica state for (Tablet tablet : alterIndex.getTablets()) { for (Replica replica : tablet.getReplicas()) { - if (replica.getState() == ReplicaState.CLONE) { + // has to check last failed version here + // if the replica has version 1,2,3,5,6 not has 4 + // then fe will send schema change job to it and it will finish with missing 4 + if (replica.getState() == ReplicaState.CLONE || replica.getLastFailedVersion() > 0) { // just skip it (replica cloned from old schema will be deleted) continue; } @@ -1092,46 +1090,26 @@ public class SchemaChangeHandler extends AlterHandler { } } - public 
int getDelayDeletingJobNum(long dbId) { - int jobNum = 0; - this.jobsLock.readLock().lock(); - try { - for (AlterJob alterJob : delayDeleteSchemaChangeJobs) { - if (alterJob.getDbId() == dbId) { - ++jobNum; - } - } - return jobNum; - } finally { - this.jobsLock.readLock().unlock(); - } - } - @Override protected void runOneCycle() { super.runOneCycle(); - List cancelledJobs = Lists.newArrayList(); - // copied all jobs out of alterJobs to avoid lock problems - List copiedAlterJobs = Lists.newArrayList(); - Set removedIds = Sets.newHashSet(); + List finishedJobs = Lists.newArrayList(); - this.jobsLock.readLock().lock(); - try { - copiedAlterJobs.addAll(alterJobs.values()); - } finally { - this.jobsLock.readLock().unlock(); - } - - // handle all alter jobs - for (AlterJob alterJob : copiedAlterJobs) { + for (AlterJob alterJob : alterJobs.values()) { + SchemaChangeJob schemaChangeJob = (SchemaChangeJob) alterJob; + // this is an old type job while the current version supports real time load, + // so kill this job + if (alterJob.getTransactionId() < 0) { + cancelledJobs.add(alterJob); + continue; + } JobState state = alterJob.getState(); switch (state) { case PENDING: { if (!alterJob.sendTasks()) { cancelledJobs.add(alterJob); - LOG.warn("sending schema change job[" + alterJob.getTableId() - + "] tasks failed. cancel it."); + LOG.warn("sending schema change job {} tasks failed. cancel it.", alterJob.getTableId()); } break; } @@ -1143,21 +1121,42 @@ public class SchemaChangeHandler extends AlterHandler { if (res == -1) { cancelledJobs.add(alterJob); LOG.warn("cancel bad schema change job[{}]", alterJob.getTableId()); - } else if (res == 1) { - // finished - removedIds.add(alterJob.getTableId()); + } + } + break; + } + case FINISHING: { + // check if previous load jobs are finished + if (alterJob.checkPreviousLoadFinished()) { + LOG.info("schema change job has finished, send clear tasks to all be {}", alterJob); + // if all previous load jobs finished, then send clear alter tasks to all related be + int res = schemaChangeJob.checkOrResendClearTasks(); + if (res != 0) { + if (res == -1) { + LOG.warn("schema change job is in finishing state, but could not be finished, " + + "just finish it, maybe a fatal error {}", alterJob); + } else { + LOG.info("send clear tasks to all be for job [{}] successfully, " + + "set status to finished", alterJob); + } + + finishedJobs.add(alterJob); } } break; } case FINISHED: { - // FINISHED state should be handled in RUNNING case - Preconditions.checkState(false); break; } case CANCELLED: { - // all CANCELLED state should be handled immediately - Preconditions.checkState(false); + // the alter job could be cancelled in 3 ways + // 1. the table or db is dropped + // 2. user cancels the job + // 3.
the job meets errors when running + // for the previous 2 scenarios, user will call jobdone to finish the job and set its state to cancelled + // so that there exists alter job whose state is cancelled + // for the third scenario, the thread will add to cancelled job list and will be dealt by call jobdone + // Preconditions.checkState(false); break; } default: @@ -1166,99 +1165,76 @@ public class SchemaChangeHandler extends AlterHandler { } } // end for jobs - // remove job from alterJobs and add to delayDeleteSchemaChangeJobs - copiedAlterJobs.clear(); - this.jobsLock.writeLock().lock(); - try { - for (Long tblId : removedIds) { - AlterJob job = alterJobs.remove(tblId); - if (job != null) { - delayDeleteSchemaChangeJobs.add((SchemaChangeJob) job); - } - } - copiedAlterJobs.addAll(delayDeleteSchemaChangeJobs); - } finally { - this.jobsLock.writeLock().unlock(); - } - - // handle delay delete jobs - removedIds.clear(); - for (AlterJob alterJob : copiedAlterJobs) { - SchemaChangeJob job = (SchemaChangeJob) alterJob; - Preconditions.checkState(job.getFinishedTime() > 0L); - if (job.tryDeleteAllTableHistorySchema()) { - addFinishedOrCancelledAlterJob(job); - removedIds.add(alterJob.getTableId()); - } - } - - this.jobsLock.writeLock().lock(); - try { - for (Long tblId : removedIds) { - Iterator iter = delayDeleteSchemaChangeJobs.iterator(); - while (iter.hasNext()) { - SchemaChangeJob job = iter.next(); - if (job.getTableId() == tblId) { - iter.remove(); - } - } - } - } finally { - this.jobsLock.writeLock().unlock(); - } - - // handle cancelled rollup jobs + // handle cancelled schema change jobs for (AlterJob alterJob : cancelledJobs) { Database db = Catalog.getInstance().getDb(alterJob.getDbId()); if (db == null) { cancelInternal(alterJob, null, null); + continue; } + db.writeLock(); try { OlapTable olapTable = (OlapTable) db.getTable(alterJob.getTableId()); - if (olapTable == null) { - cancelInternal(alterJob, null, null); - } - cancelInternal(alterJob, olapTable, null); + alterJob.cancel(olapTable, "cancelled"); } finally { db.writeUnlock(); } + jobDone(alterJob); + } + + // handle finished schema change jobs + for (AlterJob alterJob : finishedJobs) { + alterJob.setState(JobState.FINISHED); + // has to remove here, because check is running every interval, it maybe finished but also in job list + // some check will failed + ((SchemaChangeJob) alterJob).deleteAllTableHistorySchema(); + jobDone(alterJob); + Catalog.getInstance().getEditLog().logFinishSchemaChange((SchemaChangeJob) alterJob); } } @Override public List> getAlterJobInfosByDb(Database db) { List> schemaChangeJobInfos = new LinkedList>(); + List selectedJobs = Lists.newArrayList(); + + lock(); + try { + // init or running + for (AlterJob alterJob : this.alterJobs.values()) { + if (alterJob.getDbId() == db.getId()) { + selectedJobs.add(alterJob); + } + } + + // finished or cancelled + for (AlterJob alterJob : this.finishedOrCancelledAlterJobs) { + if (alterJob.getDbId() == db.getId()) { + selectedJobs.add(alterJob); + } + } + + } finally { + unlock(); + } + db.readLock(); try { - this.jobsLock.readLock().lock(); - try { - // init or running - for (AlterJob alterJob : this.alterJobs.values()) { - getJobInfo(schemaChangeJobInfos, (SchemaChangeJob) alterJob, db, false); + for (AlterJob selectedJob : selectedJobs) { + OlapTable olapTable = (OlapTable) db.getTable(selectedJob.getTableId()); + if (olapTable == null) { + continue; } - - // delay deleting - for (AlterJob alterJob : this.delayDeleteSchemaChangeJobs) { - 
getJobInfo(schemaChangeJobInfos, (SchemaChangeJob) alterJob, db, true); - } - - // finished or cancelled - for (AlterJob alterJob : this.finishedOrCancelledAlterJobs) { - getJobInfo(schemaChangeJobInfos, (SchemaChangeJob) alterJob, db, true); - } - - // sort by "JobId", "PartitionName", "CreateTime", "FinishTime", "IndexName", "IndexState" - ListComparator> comparator = new ListComparator>(0, 1, 2, 3, 4, 5); - Collections.sort(schemaChangeJobInfos, comparator); - } catch (Exception e) { - LOG.warn("failed to get schema change job info", e); - } finally { - this.jobsLock.readLock().unlock(); + selectedJob.getJobInfo(schemaChangeJobInfos, olapTable); } } finally { db.readUnlock(); } + + // sort by "JobId", "PartitionName", "CreateTime", "FinishTime", "IndexName", "IndexState" + ListComparator> comparator = new ListComparator>(0, 1, 2, 3, 4, 5); + Collections.sort(schemaChangeJobInfos, comparator); return schemaChangeJobInfos; } @@ -1314,7 +1290,6 @@ public class SchemaChangeHandler extends AlterHandler { String dbName = cancelAlterTableStmt.getDbName(); String tableName = cancelAlterTableStmt.getTableName(); - final String clusterName = cancelAlterTableStmt.getClusterName(); Preconditions.checkState(!Strings.isNullOrEmpty(dbName)); Preconditions.checkState(!Strings.isNullOrEmpty(tableName)); @@ -1323,134 +1298,31 @@ public class SchemaChangeHandler extends AlterHandler { throw new DdlException("Database[" + dbName + "] does not exist"); } + AlterJob alterJob = null; db.writeLock(); try { - this.jobsLock.writeLock().lock(); - try { - // 1. get table - OlapTable olapTable = (OlapTable) db.getTable(tableName); - if (olapTable == null) { - throw new DdlException("Table[" + tableName + "] does not exist"); - } - - // 2. find schema change job - AlterJob alterJob = null; - Iterator> iterator = this.alterJobs.entrySet().iterator(); - while (iterator.hasNext()) { - alterJob = iterator.next().getValue(); - if (alterJob.getTableId() == olapTable.getId()) { - break; - } - } - if (alterJob == null) { - throw new DdlException("Table[" + tableName + "] is not under SCHEMA CHANGE"); - } - - // 3. cancel schema change job - cancelInternal(alterJob, olapTable, "user cancelled"); - - // 4. remove from job list - this.alterJobs.remove(alterJob.getTableId()); - } finally { - this.jobsLock.writeLock().unlock(); + // 1. get table + OlapTable olapTable = (OlapTable) db.getTable(tableName); + if (olapTable == null) { + throw new DdlException("Table[" + tableName + "] does not exist"); } + + // 2. find schema change job + alterJob = alterJobs.get(olapTable.getId()); + if (alterJob == null) { + throw new DdlException("Table[" + tableName + "] is not under SCHEMA CHANGE"); + } + + if (alterJob.getState() == JobState.FINISHING) { + throw new DdlException("The schemachange job related with table[" + olapTable.getName() + + "] is under finishing state, it could not be cancelled"); + } + // 3. 
cancel schema change job + alterJob.cancel(olapTable, "user cancelled"); } finally { db.writeUnlock(); } - } - private void getJobInfo(List> schemaChangeJobInfos, - SchemaChangeJob schemaChangeJob, Database db, boolean isFinished) { - if (schemaChangeJob.getDbId() != db.getId()) { - return; - } - - long tableId = schemaChangeJob.getTableId(); - OlapTable olapTable = (OlapTable) db.getTable(tableId); - if (olapTable == null) { - return; - } - - // check auth - if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), db.getFullName(), - olapTable.getName(), - PrivPredicate.ALTER)) { - // no priv, return - LOG.debug("No priv for user {} to table {}.{}", ConnectContext.get().getQualifiedUser(), - ConnectContext.get().getRemoteIP(), db.getFullName(), olapTable.getName()); - return; - } - - // create time - long createTime = schemaChangeJob.getCreateTimeMs(); - String createTimeStr = TimeUtils.longToTimeString(createTime); - - // finish time - long finishTime = schemaChangeJob.getFinishedTime(); - String finishTimeStr = TimeUtils.longToTimeString(finishTime); - - if (isFinished) { - List jobInfo = new ArrayList(); - jobInfo.add(tableId); - jobInfo.add(olapTable.getName()); - jobInfo.add(createTimeStr); - jobInfo.add(finishTimeStr); - jobInfo.add("N/A"); - jobInfo.add("N/A"); - jobInfo.add(schemaChangeJob.getState().name()); - jobInfo.add(schemaChangeJob.getMsg()); - jobInfo.add("N/A"); - - schemaChangeJobInfos.add(jobInfo); - return; - } - - // calc progress and state for each table - Map indexProgress = new HashMap(); - Map indexState = new HashMap(); - for (Long indexId : schemaChangeJob.getChangedIndexToSchema().keySet()) { - int totalReplicaNum = 0; - int finishedReplicaNum = 0; - String state = IndexState.NORMAL.name(); - for (Partition partition : olapTable.getPartitions()) { - MaterializedIndex index = partition.getIndex(indexId); - int tableReplicaNum = schemaChangeJob.getTotalReplicaNumByIndexId(indexId); - int tableFinishedReplicaNum = schemaChangeJob.getFinishedReplicaNumByIndexId(indexId); - Preconditions.checkState(!(tableReplicaNum == 0 && tableFinishedReplicaNum == -1)); - Preconditions.checkState(tableFinishedReplicaNum <= tableReplicaNum, - tableFinishedReplicaNum + "/" + tableReplicaNum); - totalReplicaNum += tableReplicaNum; - finishedReplicaNum += tableFinishedReplicaNum; - - if (index.getState() != IndexState.NORMAL) { - state = index.getState().name(); - } - } - if (Catalog.getInstance().isMaster() - && (schemaChangeJob.getState() == JobState.RUNNING - || schemaChangeJob.getState() == JobState.FINISHED)) { - indexProgress.put(indexId, (finishedReplicaNum * 100 / totalReplicaNum) + "%"); - indexState.put(indexId, state); - } else { - indexProgress.put(indexId, "0%"); - indexState.put(indexId, state); - } - } - - for (Long indexId : schemaChangeJob.getChangedIndexToSchema().keySet()) { - List jobInfo = new ArrayList(); - - jobInfo.add(tableId); - jobInfo.add(olapTable.getName()); - jobInfo.add(createTimeStr); - jobInfo.add(finishTimeStr); - jobInfo.add(olapTable.getIndexNameById(indexId)); - jobInfo.add(indexState.get(indexId)); - jobInfo.add(schemaChangeJob.getState().name()); - jobInfo.add(schemaChangeJob.getMsg()); - jobInfo.add(indexProgress.get(indexId)); - - schemaChangeJobInfos.add(jobInfo); - } // end for indexIds + jobDone(alterJob); } } diff --git a/fe/src/main/java/com/baidu/palo/alter/SchemaChangeJob.java b/fe/src/main/java/com/baidu/palo/alter/SchemaChangeJob.java index 5547dcf8f3..ab373cb688 100644 --- 
a/fe/src/main/java/com/baidu/palo/alter/SchemaChangeJob.java +++ b/fe/src/main/java/com/baidu/palo/alter/SchemaChangeJob.java @@ -29,14 +29,17 @@ import com.baidu.palo.catalog.Replica; import com.baidu.palo.catalog.Replica.ReplicaState; import com.baidu.palo.catalog.Table; import com.baidu.palo.catalog.Tablet; -import com.baidu.palo.common.Config; import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.MetaNotFoundException; import com.baidu.palo.common.io.Text; -import com.baidu.palo.load.Load; +import com.baidu.palo.common.util.TimeUtils; import com.baidu.palo.persist.ReplicaPersistInfo; +import com.baidu.palo.persist.ReplicaPersistInfo.ReplicaOperationType; +import com.baidu.palo.task.AgentBatchTask; import com.baidu.palo.task.AgentTask; +import com.baidu.palo.task.AgentTaskExecutor; import com.baidu.palo.task.AgentTaskQueue; +import com.baidu.palo.task.ClearAlterTask; import com.baidu.palo.task.SchemaChangeTask; import com.baidu.palo.thrift.TKeysType; import com.baidu.palo.thrift.TResourceInfo; @@ -48,6 +51,7 @@ import com.google.common.base.Preconditions; import com.google.common.collect.HashMultimap; import com.google.common.collect.HashMultiset; import com.google.common.collect.LinkedHashMultimap; +import com.google.common.collect.Lists; import com.google.common.collect.Multimap; import com.google.common.collect.Multiset; import com.google.common.collect.Sets; @@ -58,15 +62,14 @@ import org.apache.logging.log4j.Logger; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; -import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; -import java.util.concurrent.TimeUnit; public class SchemaChangeJob extends AlterJob { private static final Logger LOG = LogManager.getLogger(SchemaChangeJob.class); @@ -111,10 +114,10 @@ public class SchemaChangeJob extends AlterJob { private TStorageType newStorageType = null; private SchemaChangeJob() { - this(-1, -1, null, null); + this(-1, -1, null, null, -1); } - public SchemaChangeJob(long dbId, long tableId, TResourceInfo resourceInfo, String tableName) { + public SchemaChangeJob(long dbId, long tableId, TResourceInfo resourceInfo, String tableName, long transactionId) { super(JobType.SCHEMA_CHANGE, dbId, tableId, resourceInfo); this.tableName = tableName; @@ -135,6 +138,8 @@ public class SchemaChangeJob extends AlterJob { this.hasBfChange = false; this.bfColumns = null; this.bfFpp = 0; + + this.transactionId = transactionId; } public final String getTableName() { @@ -206,33 +211,19 @@ public class SchemaChangeJob extends AlterJob { return this.indexIdToFinishedReplicaNum.count(indexId); } - private synchronized boolean isDelayDeleting() { - if (System.currentTimeMillis() - this.finishedTime < Config.alter_delete_base_delay_second * 1000L) { - LOG.info("delay deleting old schema for querying. 
table[{}-{}]", dbId, tableId); - return true; - } - return false; - } - - public boolean tryDeleteAllTableHistorySchema() { - if (isDelayDeleting()) { - return false; - } - + public void deleteAllTableHistorySchema() { Database db = Catalog.getInstance().getDb(dbId); if (db == null) { LOG.warn("db[{}] does not exist", dbId); - return true; + return; } - if (!db.tryReadLock(Database.TRY_LOCK_TIMEOUT_MS, TimeUnit.MILLISECONDS)) { - return false; - } + db.readLock(); try { OlapTable olapTable = (OlapTable) db.getTable(tableId); if (olapTable == null) { LOG.warn("table[{}] does not exist in db[{}]", tableId, dbId); - return true; + return; } // drop all replicas with old schemaHash for (Partition partition : olapTable.getPartitions()) { @@ -245,7 +236,7 @@ public class SchemaChangeJob extends AlterJob { continue; } - // delele schema hash + // delete schema hash // the real drop task is handled by report process // we call 'deleteNewSchemaHash' but we delete old one actually. // cause schama hash is switched when job is finished. @@ -257,7 +248,7 @@ public class SchemaChangeJob extends AlterJob { } finally { db.readUnlock(); } - return true; + return; } @Override @@ -289,6 +280,78 @@ public class SchemaChangeJob extends AlterJob { this.indexIdToFinishedReplicaNum.add(parentId); } } + + public int checkOrResendClearTasks() { + Preconditions.checkState(this.state == JobState.FINISHING); + // 1. check if all task finished + boolean clearFailed = false; + if (batchClearAlterTask != null) { + List allTasks = batchClearAlterTask.getAllTasks(); + for (AgentTask oneClearAlterTask : allTasks) { + ClearAlterTask clearAlterTask = (ClearAlterTask) oneClearAlterTask; + if (!clearAlterTask.isFinished()) { + clearFailed = true; + } + AgentTaskQueue.removeTask(clearAlterTask.getBackendId(), + TTaskType.CLEAR_ALTER_TASK, clearAlterTask.getSignature()); + // not remove the task from batch task, remove it by gc + } + } + if (!clearFailed && batchClearAlterTask != null) { + return 1; + } + Database db = Catalog.getInstance().getDb(dbId); + if (db == null) { + String msg = "db[" + dbId + "] does not exist"; + setMsg(msg); + LOG.warn(msg); + return -1; + } + + batchClearAlterTask = new AgentBatchTask(); + db.readLock(); + try { + OlapTable olapTable = (OlapTable) db.getTable(tableId); + if (olapTable == null) { + cancelMsg = "could not find table[" + tableId + "] in db [" + dbId + "]"; + LOG.warn(cancelMsg); + return -1; + } + boolean allAddSuccess = true; + LOG.info("sending clear schema change job tasks for table [{}]", tableId); + OUTER_LOOP: + for (Partition partition : olapTable.getPartitions()) { + long partitionId = partition.getId(); + for (MaterializedIndex index : partition.getMaterializedIndices()) { + for (Tablet tablet : index.getTablets()) { + List replicas = tablet.getReplicas(); + for (Replica replica : replicas) { + long backendId = replica.getBackendId(); + ClearAlterTask clearAlterTask = new ClearAlterTask(backendId, dbId, tableId, + partitionId, index.getId(), tablet.getId(), + olapTable.getSchemaHashByIndexId(index.getId())); + if (AgentTaskQueue.addTask(clearAlterTask)) { + batchClearAlterTask.addTask(clearAlterTask); + } else { + allAddSuccess = false; + break OUTER_LOOP; + } + } // end for rollupReplicas + } // end for rollupTablets + } // end for index + } // end for partition + if (!allAddSuccess) { + for (AgentTask task : batchClearAlterTask.getAllTasks()) { + AgentTaskQueue.removeTask(task.getBackendId(), task.getTaskType(), task.getSignature()); + } + batchClearAlterTask = null; + } + } 
finally { + db.readUnlock(); + } + LOG.info("successfully sent clear tasks for schema change job [{}]", tableId); + return 0; + } @Override public boolean sendTasks() { @@ -306,9 +369,7 @@ public class SchemaChangeJob extends AlterJob { return false; } - if (!db.tryReadLock(Database.TRY_LOCK_TIMEOUT_MS, TimeUnit.MILLISECONDS)) { - return true; - } + db.readLock(); try { synchronized (this) { OlapTable olapTable = (OlapTable) db.getTable(tableId); @@ -354,11 +415,9 @@ public class SchemaChangeJob extends AlterJob { long tabletId = tablet.getId(); short replicaSendNum = 0; for (Replica replica : tablet.getReplicas()) { - if (replica.getState() == ReplicaState.CLONE) { - // There may be MIGRATION or SUPPLEMENT clone replica generated - // before starting this schema change job. - // And we cannot add schema change task to this clone replica. - // So here we skip it. + if (replica.getState() != ReplicaState.SCHEMA_CHANGE) { + // yiguolei: if the replica is not in schema change state, skip sending tasks to it; it may be in clone + // why was this not done in the past? continue; } long backendId = replica.getBackendId(); @@ -388,6 +447,7 @@ public class SchemaChangeJob extends AlterJob { } // end for alter indices } // end for partitions + AgentBatchTask batchTask = new AgentBatchTask(); // add all schemaChangeTask to AgentTaskQueue for (AgentTask task : tasks) { if (!AgentTaskQueue.addTask(task)) { @@ -395,9 +455,14 @@ + ":" + task.getBackendId() + "]"; LOG.warn(cancelMsg); return false; + } else { + batchTask.addTask(task); } } + if (batchTask.getTaskNum() > 0) { + AgentTaskExecutor.submit(batchTask); + } // change schemaChangeJob's status this.state = JobState.RUNNING; } // end synchronized block @@ -457,24 +522,19 @@ public class SchemaChangeJob extends AlterJob { // 2. log Catalog.getInstance().getEditLog().logCancelSchemaChange(this); - LOG.info("cancel schema change job[" + olapTable.getId() + "] finished"); + LOG.info("cancel schema change job[" + (olapTable == null ? -1 : olapTable.getId()) + "] finished"); } @Override public synchronized void removeReplicaRelatedTask(long parentId, long tabletId, long replicaId, long backendId) { // parentId is unused here - directRemoveReplicaTask(replicaId, backendId); - + setReplicaFinished(-1, replicaId); + + this.backendIdToReplicaIds.get(backendId).remove(replicaId); // remove task AgentTaskQueue.removeTask(backendId, TTaskType.SCHEMA_CHANGE, tabletId); } - @Override - public synchronized void directRemoveReplicaTask(long replicaId, long backendId) { - setReplicaFinished(-1L, replicaId); - this.backendIdToReplicaIds.get(backendId).remove(replicaId); - } - @Override public void handleFinishedReplica(AgentTask task, TTabletInfo finishTabletInfo, long reportVersion) throws MetaNotFoundException { @@ -536,6 +596,7 @@ public class SchemaChangeJob extends AlterJob { long versionHash = finishTabletInfo.getVersion_hash(); long dataSize = finishTabletInfo.getData_size(); long rowCount = finishTabletInfo.getRow_count(); + // no need to check version > replica.getVersion, because the new replica's version is first set by the schema change replica.updateInfo(version, versionHash, dataSize, rowCount); } finally { db.writeUnlock(); @@ -549,6 +610,11 @@ public class SchemaChangeJob extends AlterJob { replicaId, indexId, tabletId, task.getBackendId()); } + /** + * should consider the following cases: + * 1. replica is removed from this tablet, for example user changes the replica num + * 2.
backend is dead or is dropped from system + */ @Override public int tryFinishJob() { if (this.state != JobState.RUNNING) { @@ -562,9 +628,7 @@ public class SchemaChangeJob extends AlterJob { return -1; } - if (!db.tryWriteLock(Database.TRY_LOCK_TIMEOUT_MS, TimeUnit.MILLISECONDS)) { - return 0; - } + db.writeLock(); try { synchronized (this) { Table table = db.getTable(tableId); @@ -573,63 +637,68 @@ public class SchemaChangeJob extends AlterJob { return -1; } - Load load = Catalog.getInstance().getLoadInstance(); + boolean hasUnfinishedPartition = false; OlapTable olapTable = (OlapTable) table; for (Partition partition : olapTable.getPartitions()) { long partitionId = partition.getId(); - - // check if all tablets finished load job - // FIXME(cmy): this may cause endless check?? - if (!load.checkPartitionLoadFinished(partitionId, null)) { - LOG.debug("partition[{}] has unfinished load job", partitionId); - return 0; - } - - short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partition.getId()); - long committedVersion = partition.getCommittedVersion(); - long committedVersionHash = partition.getCommittedVersionHash(); - + short expectReplicationNum = olapTable.getPartitionInfo().getReplicationNum(partition.getId()); + boolean hasUnfinishedIndex = false; for (long indexId : this.changedIndexIdToSchema.keySet()) { MaterializedIndex materializedIndex = partition.getIndex(indexId); if (materializedIndex == null) { - LOG.warn("index[{}] does not exist inp partition[{}]", indexId, partitionId); + LOG.warn("index[{}] does not exist in partition[{}]", indexId, partitionId); continue; } - - int aliveReplica = 0; for (Tablet tablet : materializedIndex.getTablets()) { - Iterator iterator = tablet.getReplicas().iterator(); - while (iterator.hasNext()) { - Replica replica = iterator.next(); + List replicas = tablet.getReplicas(); + List errorReplicas = Lists.newArrayList(); + int healthNum = replicas.size(); + for (Replica replica : replicas) { + // if this replica is not under schema change, then fe will not sent schema change task + // ignore it when calculate health replica num + // if a tablet has 3 replicas and 2 of them is under clone, 1 is under schema change + // then schema change will fail + if (replica.getState() != ReplicaState.SCHEMA_CHANGE) { + -- healthNum; + continue; + } if (!this.backendIdToReplicaIds.get(replica.getBackendId()).contains(replica.getId())) { // replica is dead, skip it LOG.warn("schema change job[{}] find dead replica[{}]. skip it", tableId, replica.getId()); + -- healthNum; continue; } - - // if replica is still in backendIdToReplicaIds - // we think this replica is still alive. - // BackendEvent will handle dead replica - ++aliveReplica; - if (!checkBackendState(replica)) { + LOG.warn("backend {} state is abnormal, set replica {} as bad", + replica.getBackendId(), replica.getId()); + errorReplicas.add(replica); + --healthNum; continue; } - - // check version and versionHash - if (!replica.checkVersionCatchUp(committedVersion, committedVersionHash)) { - continue; + if (replica.getLastFailedVersion() > 0) { + -- healthNum; } } - - if (aliveReplica < (replicationNum / 2 + 1)) { - cancelMsg = String.format("schema change job[%d] cancelled." - + " tablet[%d] has few replica. num: %d.", - tableId, tablet.getId(), aliveReplica); + if (healthNum < (expectReplicationNum / 2 + 1)) { + cancelMsg = String.format("schema change job[%d] cancelled. " + + "tablet[%d] has few health replica." 
+ + " num: %d", tableId, tablet.getId(), healthNum); LOG.warn(cancelMsg); return -1; } + + for (Replica errReplica : errorReplicas) { + // For now, err replicas are those replicas which the backends they belong to is dead. + // We need to set these replicas as finished to let the schema change job + // finished. + setReplicaFinished(indexId, errReplica.getId()); + // remove the replica from backend to replica map + backendIdToReplicaIds.get(errReplica.getBackendId()).remove(errReplica.getId()); + // remove error replica related task + AgentTaskQueue.removeTask(errReplica.getBackendId(), TTaskType.SCHEMA_CHANGE, + tablet.getId()); + } } // end for tablets // check if index is finished @@ -640,7 +709,9 @@ public class SchemaChangeJob extends AlterJob { if (finishedReplicaNum < totalReplicaNum) { LOG.debug("index[{}] has unfinished replica. {}/{}", indexId, finishedReplicaNum, totalReplicaNum); - return 0; + hasUnfinishedIndex = true; + // return 0; + continue; } } @@ -662,13 +733,22 @@ public class SchemaChangeJob extends AlterJob { } } // end for indices + if (hasUnfinishedIndex) { + hasUnfinishedPartition = true; + } + // all table finished in this partition LOG.debug("schema change finished in partition[" + partition.getId() + "]"); } // end for partitions + if (hasUnfinishedPartition) { + return 0; + } + Preconditions.checkState(unfinishedReplicaIds.isEmpty()); + // all partitions are finished // update state and save replica info Preconditions.checkState(olapTable.getState() == OlapTableState.SCHEMA_CHANGE); @@ -680,8 +760,11 @@ public class SchemaChangeJob extends AlterJob { Preconditions.checkState(materializedIndex.getState() == IndexState.SCHEMA_CHANGE); for (Tablet tablet : materializedIndex.getTablets()) { long tabletId = tablet.getId(); + ArrayList errorBackendIds = new ArrayList<>(); for (Replica replica : tablet.getReplicas()) { - if (replica.getState() == ReplicaState.SCHEMA_CHANGE) { + // the replica should in schema change and the replica's backend should contains this replica + if (replica.getState() == ReplicaState.SCHEMA_CHANGE + && this.backendIdToReplicaIds.get(replica.getBackendId()).contains(replica.getId())) { replica.setState(ReplicaState.NORMAL); ReplicaPersistInfo replicaInfo = ReplicaPersistInfo.createForSchemaChange(partitionId, indexId, tabletId, @@ -689,14 +772,30 @@ public class SchemaChangeJob extends AlterJob { replica.getVersion(), replica.getVersionHash(), replica.getDataSize(), - replica.getRowCount()); + replica.getRowCount(), + replica.getLastFailedVersion(), + replica.getLastFailedVersionHash(), + replica.getLastSuccessVersion(), + replica.getLastSuccessVersionHash()); this.replicaInfos.put(partitionId, replicaInfo); // remove tasks for safety AgentTaskQueue.removeTask(replica.getBackendId(), TTaskType.SCHEMA_CHANGE, tabletId); + } else { + // if the replcia is not under schema change state, then the replica has not done schema change + // its schema is invalid, should remove it from replica group + // could only remove it here, because we should ensure that the health replica > quorum + // fe followers shoud check the replica info and remove the unhealthy replicas + ReplicaPersistInfo info = ReplicaPersistInfo.createForDelete(dbId, tableId, partitionId, + indexId, tabletId, replica.getBackendId()); + this.replicaInfos.put(partitionId, info); + errorBackendIds.add(info.getBackendId()); } } // end for replicas + for (Long errorBackend : errorBackendIds) { + tablet.deleteReplicaByBackendId(errorBackend); + } } // end for tablets // update schema hash @@ 
-732,14 +831,15 @@ public class SchemaChangeJob extends AlterJob { } this.finishedTime = System.currentTimeMillis(); - this.state = JobState.FINISHED; + this.state = JobState.FINISHING; + this.transactionId = Catalog.getCurrentGlobalTransactionMgr().getTransactionIDGenerator().getNextTransactionId(); } } finally { db.writeUnlock(); } // log schema change done operation - Catalog.getInstance().getEditLog().logFinishSchemaChange(this); + Catalog.getInstance().getEditLog().logFinishingSchemaChange(this); LOG.info("schema change job done. table [{}]", tableId); return 1; } @@ -755,140 +855,239 @@ public class SchemaChangeJob extends AlterJob { unfinishedReplicaIds = null; indexIdToTotalReplicaNum = null; indexIdToFinishedReplicaNum = null; - backendIdToReplicaIds = null; partitionIdToFinishedIndexIds = null; + backendIdToReplicaIds = null; } @Override - public void unprotectedReplayInitJob(Database db) { - OlapTable olapTable = (OlapTable) db.getTable(tableId); + public void replayInitJob(Database db) { + db.writeLock(); + try { + OlapTable olapTable = (OlapTable) db.getTable(tableId); - // change the state of table/partition and replica, then add object to related List and Set - for (Partition partition : olapTable.getPartitions()) { - for (Map.Entry entry : changedIndexIdToSchemaHash.entrySet()) { - MaterializedIndex index = partition.getIndex(entry.getKey()); - // set state to SCHEMA_CHANGE - for (Tablet tablet : index.getTablets()) { - for (Replica replica : tablet.getReplicas()) { - if (replica.getState() == ReplicaState.CLONE) { - // just skip it (old schema clone will be deleted) - continue; + // change the state of table/partition and replica, then add object to related List and Set + for (Partition partition : olapTable.getPartitions()) { + for (Map.Entry entry : changedIndexIdToSchemaHash.entrySet()) { + MaterializedIndex index = partition.getIndex(entry.getKey()); + // set state to SCHEMA_CHANGE + for (Tablet tablet : index.getTablets()) { + for (Replica replica : tablet.getReplicas()) { + if (replica.getState() == ReplicaState.CLONE) { + // just skip it (old schema clone will be deleted) + continue; + } + replica.setState(ReplicaState.SCHEMA_CHANGE); } - replica.setState(ReplicaState.SCHEMA_CHANGE); } + index.setState(IndexState.SCHEMA_CHANGE); + + Catalog.getCurrentInvertedIndex().setNewSchemaHash(partition.getId(), entry.getKey(), + entry.getValue()); } - index.setState(IndexState.SCHEMA_CHANGE); - Catalog.getCurrentInvertedIndex().setNewSchemaHash(partition.getId(), entry.getKey(), entry.getValue()); - } + partition.setState(PartitionState.SCHEMA_CHANGE); + } // end for partitions - partition.setState(PartitionState.SCHEMA_CHANGE); - } // end for partitions + olapTable.setState(OlapTableState.SCHEMA_CHANGE); - olapTable.setState(OlapTableState.SCHEMA_CHANGE); - - // reset status to PENDING for resending the tasks in polling thread - this.state = JobState.PENDING; + // reset status to PENDING for resending the tasks in polling thread + this.state = JobState.PENDING; + LOG.info("just trace", new Exception()); + } finally { + db.writeUnlock(); + } } @Override - public void unprotectedReplayFinish(Database db) { - OlapTable olapTable = (OlapTable) db.getTable(tableId); + public void replayFinishing(Database db) { + db.writeLock(); + try { + OlapTable olapTable = (OlapTable) db.getTable(tableId); - // set the status to normal - for (Partition partition : olapTable.getPartitions()) { - long partitionId = partition.getId(); - for (Map.Entry entry : 
changedIndexIdToSchemaHash.entrySet()) { - MaterializedIndex index = partition.getIndex(entry.getKey()); - for (Tablet tablet : index.getTablets()) { - for (Replica replica : tablet.getReplicas()) { - replica.setState(ReplicaState.NORMAL); + // set the status to normal + for (Partition partition : olapTable.getPartitions()) { + long partitionId = partition.getId(); + for (Map.Entry entry : changedIndexIdToSchemaHash.entrySet()) { + MaterializedIndex index = partition.getIndex(entry.getKey()); + for (Tablet tablet : index.getTablets()) { + for (Replica replica : tablet.getReplicas()) { + replica.setState(ReplicaState.NORMAL); + } + } + + index.setState(IndexState.NORMAL); + + // update to new schema hash in inverted index + Catalog.getCurrentInvertedIndex().updateToNewSchemaHash(partitionId, index.getId()); + Catalog.getCurrentInvertedIndex().deleteNewSchemaHash(partitionId, index.getId()); + } + partition.setState(PartitionState.NORMAL); + + // update replica info + Collection replicaInfo = replicaInfos.get(partition.getId()); + if (replicaInfo != null) { + for (ReplicaPersistInfo info : replicaInfo) { + MaterializedIndex mIndex = (MaterializedIndex) partition.getIndex(info.getIndexId()); + Tablet tablet = mIndex.getTablet(info.getTabletId()); + if (info.getOpType() == ReplicaOperationType.SCHEMA_CHANGE) { + Replica replica = tablet.getReplicaByBackendId(info.getBackendId()); + replica.updateVersionInfo(info.getVersion(), info.getVersionHash(), + info.getLastFailedVersion(), + info.getLastFailedVersionHash(), + info.getLastSuccessVersion(), + info.getLastSuccessVersionHash()); + } else if (info.getOpType() == ReplicaOperationType.DELETE) { + // remove the replica from replica group + tablet.deleteReplicaByBackendId(info.getBackendId()); + } } } + } // end for partitions - index.setState(IndexState.NORMAL); + // update schema + for (Map.Entry> entry : changedIndexIdToSchema.entrySet()) { + long indexId = entry.getKey(); + int schemaVersion = getSchemaVersionByIndexId(indexId); + int schemaHash = getSchemaHashByIndexId(indexId); + short shortKeyColumnCount = getShortKeyColumnCountByIndexId(indexId); + olapTable.setIndexSchemaInfo(indexId, null, entry.getValue(), schemaVersion, schemaHash, + shortKeyColumnCount); - // update to new schema hash in inverted index - Catalog.getCurrentInvertedIndex().updateToNewSchemaHash(partitionId, index.getId()); - Catalog.getCurrentInvertedIndex().deleteNewSchemaHash(partitionId, index.getId()); - } - partition.setState(PartitionState.NORMAL); - - // update replica info - Collection replicaInfo = replicaInfos.get(partition.getId()); - if (replicaInfo != null) { - for (ReplicaPersistInfo info : replicaInfo) { - MaterializedIndex mIndex = (MaterializedIndex) partition.getIndex(info.getIndexId()); - Tablet tablet = mIndex.getTablet(info.getTabletId()); - Replica replica = tablet.getReplicaByBackendId(info.getBackendId()); - replica.updateInfo(info.getVersion(), info.getVersionHash(), - info.getDataSize(), info.getRowCount()); + if (newStorageType != null) { + olapTable.setIndexStorageType(indexId, newStorageType); + } + if (indexId == olapTable.getId()) { + olapTable.setNewBaseSchema(entry.getValue()); } } - } // end for partitions - // update schema - for (Map.Entry> entry : changedIndexIdToSchema.entrySet()) { - long indexId = entry.getKey(); - int schemaVersion = getSchemaVersionByIndexId(indexId); - int schemaHash = getSchemaHashByIndexId(indexId); - short shortKeyColumnCount = getShortKeyColumnCountByIndexId(indexId); - 
olapTable.setIndexSchemaInfo(indexId, null, entry.getValue(), schemaVersion, schemaHash, - shortKeyColumnCount); - - if (newStorageType != null) { - olapTable.setIndexStorageType(indexId, newStorageType); - } - - if (indexId == olapTable.getId()) { - olapTable.setNewBaseSchema(entry.getValue()); - } + // bloom filter columns + if (hasBfChange) { + olapTable.setBloomFilterInfo(bfColumns, bfFpp); + } // end for partitions + olapTable.setState(OlapTableState.NORMAL); + } finally { + db.writeUnlock(); } - - // bloom filter columns - if (hasBfChange) { - olapTable.setBloomFilterInfo(bfColumns, bfFpp); + } + + @Override + public void replayFinish(Database db) { + // if this is an old job, then should also update table or replica state + if (transactionId < 0) { + replayFinishing(db); } - - olapTable.setState(OlapTableState.NORMAL); } @Override - public void unprotectedReplayCancel(Database db) { - // restore partition's state - OlapTable olapTable = (OlapTable) db.getTable(tableId); - if (olapTable == null) { + public void replayCancel(Database db) { + db.writeLock(); + try { + // restore partition's state + OlapTable olapTable = (OlapTable) db.getTable(tableId); + if (olapTable == null) { + return; + } + for (Partition partition : olapTable.getPartitions()) { + long partitionId = partition.getId(); + for (Long indexId : this.changedIndexIdToSchema.keySet()) { + MaterializedIndex index = partition.getIndex(indexId); + if (index == null) { + continue; + } + for (Tablet tablet : index.getTablets()) { + for (Replica replica : tablet.getReplicas()) { + if (replica.getState() == ReplicaState.CLONE || replica.getState() == ReplicaState.NORMAL) { + continue; + } + replica.setState(ReplicaState.NORMAL); + } // end for replicas + } // end for tablets + + Preconditions.checkState(index.getState() == IndexState.SCHEMA_CHANGE, index.getState()); + index.setState(IndexState.NORMAL); + + // delete new schema hash in invered index + Catalog.getCurrentInvertedIndex().deleteNewSchemaHash(partitionId, indexId); + } // end for indices + + Preconditions.checkState(partition.getState() == PartitionState.SCHEMA_CHANGE, + partition.getState()); + partition.setState(PartitionState.NORMAL); + } // end for partitions + + olapTable.setState(OlapTableState.NORMAL); + } finally { + db.writeUnlock(); + } + } + + @Override + public void getJobInfo(List> jobInfos, OlapTable tbl) { + if (state == JobState.FINISHED || state == JobState.CANCELLED) { + List jobInfo = new ArrayList(); + jobInfo.add(tableId); + jobInfo.add(tbl.getName()); + jobInfo.add(transactionId); + jobInfo.add(TimeUtils.longToTimeString(createTime)); + jobInfo.add(TimeUtils.longToTimeString(finishedTime)); + jobInfo.add("N/A"); + jobInfo.add("N/A"); + jobInfo.add(state.name()); + jobInfo.add(cancelMsg); + jobInfo.add("N/A"); + + jobInfos.add(jobInfo); return; } - for (Partition partition : olapTable.getPartitions()) { - long partitionId = partition.getId(); - for (Long indexId : this.changedIndexIdToSchema.keySet()) { + + // calc progress and state for each table + Map indexProgress = new HashMap(); + Map indexState = new HashMap(); + for (Long indexId : getChangedIndexToSchema().keySet()) { + int totalReplicaNum = 0; + int finishedReplicaNum = 0; + String idxState = IndexState.NORMAL.name(); + for (Partition partition : tbl.getPartitions()) { MaterializedIndex index = partition.getIndex(indexId); - if (index == null) { - continue; + int tableReplicaNum = getTotalReplicaNumByIndexId(indexId); + int tableFinishedReplicaNum = 
getFinishedReplicaNumByIndexId(indexId); + Preconditions.checkState(!(tableReplicaNum == 0 && tableFinishedReplicaNum == -1)); + Preconditions.checkState(tableFinishedReplicaNum <= tableReplicaNum, + tableFinishedReplicaNum + "/" + tableReplicaNum); + totalReplicaNum += tableReplicaNum; + finishedReplicaNum += tableFinishedReplicaNum; + + if (index.getState() != IndexState.NORMAL) { + idxState = index.getState().name(); } - for (Tablet tablet : index.getTablets()) { - for (Replica replica : tablet.getReplicas()) { - if (replica.getState() == ReplicaState.CLONE || replica.getState() == ReplicaState.NORMAL) { - continue; - } - replica.setState(ReplicaState.NORMAL); - } // end for replicas - } // end for tablets + } + if (Catalog.getInstance().isMaster() + && (state == JobState.RUNNING || state == JobState.FINISHED)) { + indexProgress.put(indexId, (finishedReplicaNum * 100 / totalReplicaNum) + "%"); + indexState.put(indexId, idxState); + } else { + indexProgress.put(indexId, "0%"); + indexState.put(indexId, idxState); + } + } - Preconditions.checkState(index.getState() == IndexState.SCHEMA_CHANGE, index.getState()); - index.setState(IndexState.NORMAL); + for (Long indexId : getChangedIndexToSchema().keySet()) { + List jobInfo = new ArrayList(); - // delete new schema hash in invered index - Catalog.getCurrentInvertedIndex().deleteNewSchemaHash(partitionId, indexId); - } // end for indices + jobInfo.add(tableId); + jobInfo.add(tbl.getName()); + jobInfo.add(transactionId); + jobInfo.add(TimeUtils.longToTimeString(createTime)); + jobInfo.add(TimeUtils.longToTimeString(finishedTime)); + jobInfo.add(tbl.getIndexNameById(indexId)); + jobInfo.add(indexState.get(indexId)); + jobInfo.add(state.name()); + jobInfo.add(cancelMsg); + jobInfo.add(indexProgress.get(indexId)); - Preconditions.checkState(partition.getState() == PartitionState.SCHEMA_CHANGE, - partition.getState()); - partition.setState(PartitionState.NORMAL); - } // end for partitions - - olapTable.setState(OlapTableState.NORMAL); + jobInfos.add(jobInfo); + } // end for indexIds } @Override @@ -1034,4 +1233,11 @@ public class SchemaChangeJob extends AlterJob { public boolean equals(Object obj) { return true; } + + @Override + public String toString() { + return "SchemaChangeJob [tableName=" + tableName + ", type=" + type + ", state=" + state + ", dbId=" + dbId + + ", tableId=" + tableId + ", transactionId=" + transactionId + ", hasPreviousLoadFinished=" + + hasPreviousLoadFinished + ", createTime=" + createTime + ", finishedTime=" + finishedTime + "]"; + } } diff --git a/fe/src/main/java/com/baidu/palo/alter/SystemHandler.java b/fe/src/main/java/com/baidu/palo/alter/SystemHandler.java index 84cbcf4ead..5edf99c498 100644 --- a/fe/src/main/java/com/baidu/palo/alter/SystemHandler.java +++ b/fe/src/main/java/com/baidu/palo/alter/SystemHandler.java @@ -18,16 +18,16 @@ package com.baidu.palo.alter; import com.baidu.palo.alter.AlterJob.JobState; import com.baidu.palo.alter.DecommissionBackendJob.DecommissionType; import com.baidu.palo.analysis.AddBackendClause; -import com.baidu.palo.analysis.AddObserverClause; import com.baidu.palo.analysis.AddFollowerClause; +import com.baidu.palo.analysis.AddObserverClause; import com.baidu.palo.analysis.AlterClause; import com.baidu.palo.analysis.AlterLoadErrorUrlClause; import com.baidu.palo.analysis.CancelAlterSystemStmt; import com.baidu.palo.analysis.CancelStmt; import com.baidu.palo.analysis.DecommissionBackendClause; import com.baidu.palo.analysis.DropBackendClause; -import 
com.baidu.palo.analysis.DropObserverClause; import com.baidu.palo.analysis.DropFollowerClause; +import com.baidu.palo.analysis.DropObserverClause; import com.baidu.palo.analysis.ModifyBrokerClause; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Database; @@ -50,12 +50,11 @@ import com.google.common.base.Preconditions; import com.google.common.base.Strings; import com.google.common.collect.Lists; import com.google.common.collect.Maps; + import org.apache.commons.lang.NotImplementedException; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -81,54 +80,51 @@ public class SystemHandler extends AlterHandler { protected void runOneCycle() { super.runOneCycle(); - List cancelledJobs = new LinkedList(); - this.jobsLock.writeLock().lock(); - try { - Iterator> iterator = this.alterJobs.entrySet().iterator(); - while (iterator.hasNext()) { - Entry entry = iterator.next(); - AlterJob decommissionBackendJob = entry.getValue(); + List cancelledJobs = Lists.newArrayList(); + List finishedJobs = Lists.newArrayList(); - JobState state = decommissionBackendJob.getState(); - switch (state) { - case PENDING: { - // send tasks - decommissionBackendJob.sendTasks(); - break; - } - case RUNNING: { - // no timeout - - // send tasks - decommissionBackendJob.sendTasks(); - - // try finish job - decommissionBackendJob.tryFinishJob(); - - break; - } - case FINISHED: { - // remove from alterJobs - iterator.remove(); - addFinishedOrCancelledAlterJob(decommissionBackendJob); - break; - } - case CANCELLED: { - Preconditions.checkState(false); - break; - } - default: - Preconditions.checkState(false); - break; + for (AlterJob alterJob : alterJobs.values()) { + AlterJob decommissionBackendJob = (DecommissionBackendJob) alterJob; + JobState state = decommissionBackendJob.getState(); + switch (state) { + case PENDING: { + // send tasks + decommissionBackendJob.sendTasks(); + break; } - } // end for jobs - } finally { - this.jobsLock.writeLock().unlock(); - } + case RUNNING: { + // no timeout + + // send tasks + decommissionBackendJob.sendTasks(); + // try finish job + decommissionBackendJob.tryFinishJob(); + break; + } + case FINISHED: { + // remove from alterJobs + finishedJobs.add(decommissionBackendJob); + break; + } + case CANCELLED: { + Preconditions.checkState(false); + break; + } + default: + Preconditions.checkState(false); + break; + } + } // end for jobs // handle cancelled jobs for (AlterJob dropBackendJob : cancelledJobs) { - cancelInternal(dropBackendJob, null, null); + dropBackendJob.cancel(null, "cancelled"); + jobDone(dropBackendJob); + } + + // handle finished jobs + for (AlterJob dropBackendJob : finishedJobs) { + jobDone(dropBackendJob); } } diff --git a/fe/src/main/java/com/baidu/palo/analysis/AbstractBackupStmt.java b/fe/src/main/java/com/baidu/palo/analysis/AbstractBackupStmt.java index 49b16294cb..d7be26d9de 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/AbstractBackupStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/AbstractBackupStmt.java @@ -25,7 +25,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.Config; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import 
com.baidu.palo.qe.ConnectContext; @@ -66,7 +66,7 @@ public class AbstractBackupStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { labelName.analyze(analyzer); // check auth diff --git a/fe/src/main/java/com/baidu/palo/analysis/AlterClusterStmt.java b/fe/src/main/java/com/baidu/palo/analysis/AlterClusterStmt.java index 8a88903238..84e110a7d5 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/AlterClusterStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/AlterClusterStmt.java @@ -24,7 +24,6 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -43,7 +42,7 @@ public class AlterClusterStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.OPERATOR)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_CLUSTER_NO_AUTHORITY, "NODE"); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/AlterDatabaseQuotaStmt.java b/fe/src/main/java/com/baidu/palo/analysis/AlterDatabaseQuotaStmt.java index f30a1efcc6..ddd2d8b403 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/AlterDatabaseQuotaStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/AlterDatabaseQuotaStmt.java @@ -25,7 +25,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -49,7 +49,7 @@ public class AlterDatabaseQuotaStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN)) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/AlterDatabaseRename.java b/fe/src/main/java/com/baidu/palo/analysis/AlterDatabaseRename.java index 3a089e336a..3306317fa6 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/AlterDatabaseRename.java +++ b/fe/src/main/java/com/baidu/palo/analysis/AlterDatabaseRename.java @@ -27,7 +27,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloPrivilege; import com.baidu.palo.mysql.privilege.PrivBitSet; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -53,7 +53,7 @@ public class AlterDatabaseRename extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if 
(Strings.isNullOrEmpty(dbName)) { throw new AnalysisException("Database name is not set"); diff --git a/fe/src/main/java/com/baidu/palo/analysis/AlterSystemStmt.java b/fe/src/main/java/com/baidu/palo/analysis/AlterSystemStmt.java index de5d3ff019..1419649ccb 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/AlterSystemStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/AlterSystemStmt.java @@ -19,7 +19,6 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -38,7 +37,7 @@ public class AlterSystemStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.OPERATOR)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, diff --git a/fe/src/main/java/com/baidu/palo/analysis/AlterTableStmt.java b/fe/src/main/java/com/baidu/palo/analysis/AlterTableStmt.java index 2485e31b1a..400f5e93b1 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/AlterTableStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/AlterTableStmt.java @@ -24,7 +24,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.io.Writable; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -65,7 +65,7 @@ public class AlterTableStmt extends DdlStmt implements Writable { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (tbl == null) { ErrorReport.reportAnalysisException(ErrorCode.ERR_NO_TABLES_USED); diff --git a/fe/src/main/java/com/baidu/palo/analysis/AlterUserStmt.java b/fe/src/main/java/com/baidu/palo/analysis/AlterUserStmt.java deleted file mode 100644 index b1b234b8f2..0000000000 --- a/fe/src/main/java/com/baidu/palo/analysis/AlterUserStmt.java +++ /dev/null @@ -1,90 +0,0 @@ -// Modifications copyright (C) 2017, Baidu.com, Inc. -// Copyright 2017 The Apache Software Foundation - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -package com.baidu.palo.analysis; - -import com.baidu.palo.catalog.Catalog; -import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.ErrorCode; -import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; -import com.baidu.palo.mysql.privilege.PrivPredicate; -import com.baidu.palo.qe.ConnectContext; - -import org.apache.commons.lang.NotImplementedException; - -import java.util.List; - -@Deprecated -public class AlterUserStmt extends DdlStmt { - private UserIdentity userIdent; - private AlterUserClause clause; - - public AlterUserStmt(UserIdentity userIdent, AlterClause clause) { - this.userIdent = userIdent; - this.clause = (AlterUserClause) clause; - } - - @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { - super.analyze(analyzer); - - userIdent.analyze(analyzer.getClusterName()); - - if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.GRANT)) { - ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "ALTER USER"); - } - - // alter clause analysis - clause.analyze(analyzer); - } - - public UserIdentity getUserIdent() { - return userIdent; - } - - public List getHosts() { - return clause.getHosts(); - } - - public List getIps() { - return clause.getIps(); - } - - public List getStarIps() { - return clause.getStarIps(); - } - - public AlterUserType getAlterUserType() { - return clause.getAlterUserType(); - } - - @Override - public String toSql() { - throw new NotImplementedException(); - } - - @Override - public String toString() { - throw new NotImplementedException(); - } - - -} diff --git a/fe/src/main/java/com/baidu/palo/analysis/BackupStmt.java b/fe/src/main/java/com/baidu/palo/analysis/BackupStmt.java index feda2627d9..ef3a6a49f2 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/BackupStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/BackupStmt.java @@ -23,7 +23,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.PrintableMap; import com.google.common.base.Joiner; @@ -54,7 +54,7 @@ public class BackupStmt extends AbstractBackupStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); // tbl refs can not set alias in backup diff --git a/fe/src/main/java/com/baidu/palo/analysis/BinaryPredicate.java b/fe/src/main/java/com/baidu/palo/analysis/BinaryPredicate.java index bb3d72ea99..73426321d8 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/BinaryPredicate.java +++ b/fe/src/main/java/com/baidu/palo/analysis/BinaryPredicate.java @@ -23,8 +23,8 @@ package com.baidu.palo.analysis; import com.baidu.palo.catalog.Function; import com.baidu.palo.catalog.FunctionSet; import com.baidu.palo.catalog.PrimitiveType; -import com.baidu.palo.catalog.Type; import com.baidu.palo.catalog.ScalarFunction; +import com.baidu.palo.catalog.Type; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.Pair; import com.baidu.palo.common.Reference; @@ -35,10 +35,10 @@ import com.baidu.palo.thrift.TExprNodeType; import com.baidu.palo.thrift.TExprOpcode; import com.google.common.base.Preconditions; - import com.google.common.collect.Lists; 
-import org.apache.logging.log4j.Logger; + import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.io.DataInput; import java.io.DataOutput; diff --git a/fe/src/main/java/com/baidu/palo/analysis/CancelBackupStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CancelBackupStmt.java index c45398edc7..6c47125bb9 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CancelBackupStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CancelBackupStmt.java @@ -20,7 +20,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -45,7 +45,7 @@ public class CancelBackupStmt extends CancelStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/CancelLoadStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CancelLoadStmt.java index 9483335d55..6c8ea90efb 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CancelLoadStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CancelLoadStmt.java @@ -18,7 +18,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.analysis.BinaryPredicate.Operator; import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.base.Strings; @@ -46,7 +46,7 @@ public class CancelLoadStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ColumnSeparator.java b/fe/src/main/java/com/baidu/palo/analysis/ColumnSeparator.java index 217590eceb..9ed9f5e1c4 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ColumnSeparator.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ColumnSeparator.java @@ -22,6 +22,8 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; +import com.google.common.base.Strings; + import java.io.StringWriter; public class ColumnSeparator { @@ -60,8 +62,8 @@ public class ColumnSeparator { } public static String convertSeparator(String originStr) throws AnalysisException { - if (originStr == null) { - throw new AnalysisException("Column separator is null"); + if (Strings.isNullOrEmpty(originStr)) { + throw new AnalysisException("Column separator is null or empty"); } if (originStr.toUpperCase().startsWith("\\X")) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/CreateClusterStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CreateClusterStmt.java index 40cdc18f7b..0472cb27ef 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CreateClusterStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CreateClusterStmt.java @@ -20,7 +20,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; 
import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.MysqlPassword; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -64,7 +64,7 @@ public class CreateClusterStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { FeNameFormat.checkDbName(clusterName); if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.OPERATOR)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_CLUSTER_NO_AUTHORITY, analyzer.getQualifiedUser()); diff --git a/fe/src/main/java/com/baidu/palo/analysis/CreateDbStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CreateDbStmt.java index c3efafcbe7..d58b892b11 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CreateDbStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CreateDbStmt.java @@ -26,7 +26,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -51,7 +51,7 @@ public class CreateDbStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(analyzer.getClusterName())) { ErrorReport.reportAnalysisException(ErrorCode.ERR_CLUSTER_NO_SELECT_CLUSTER); diff --git a/fe/src/main/java/com/baidu/palo/analysis/CreateFunctionStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CreateFunctionStmt.java index f8bae7dd08..d2c982969a 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CreateFunctionStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CreateFunctionStmt.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import java.util.List; import java.util.Map; @@ -50,7 +50,7 @@ public class CreateFunctionStmt extends StatementBase { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/CreateRepositoryStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CreateRepositoryStmt.java index 952d4cc275..f2b5b2c015 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CreateRepositoryStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CreateRepositoryStmt.java @@ -24,7 +24,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.PrintableMap; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -70,7 +70,7 @@ public class CreateRepositoryStmt extends DdlStmt { } @Override - public void 
analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); // check auth diff --git a/fe/src/main/java/com/baidu/palo/analysis/CreateRoleStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CreateRoleStmt.java index a536d7f28d..6d77e437cf 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CreateRoleStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CreateRoleStmt.java @@ -16,9 +16,8 @@ package com.baidu.palo.analysis; import com.baidu.palo.cluster.ClusterNamespace; -import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; public class CreateRoleStmt extends DdlStmt { @@ -33,7 +32,7 @@ public class CreateRoleStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); FeNameFormat.checkRoleName(role, false /* can not be admin */, "Can not create role"); role = ClusterNamespace.getFullName(analyzer.getClusterName(), role); diff --git a/fe/src/main/java/com/baidu/palo/analysis/CreateTableAsSelectStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CreateTableAsSelectStmt.java index 684e36a106..c2258d63ec 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CreateTableAsSelectStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CreateTableAsSelectStmt.java @@ -27,7 +27,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.DdlException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import java.util.List; @@ -53,7 +53,7 @@ public class CreateTableAsSelectStmt extends StatementBase { } @Override - public void analyze(Analyzer analyzer) throws InternalException, AnalysisException { + public void analyze(Analyzer analyzer) throws UserException, AnalysisException { // first: we analyze queryStmt before create table. // To avoid duplicate registrations of table/colRefs, // create a new root analyzer and clone the query statement for this initial pass. diff --git a/fe/src/main/java/com/baidu/palo/analysis/CreateTableStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CreateTableStmt.java index 73bca8d1f7..5f4a8d2b0c 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CreateTableStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CreateTableStmt.java @@ -31,11 +31,12 @@ import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.io.Writable; import com.baidu.palo.common.util.KuduUtil; import com.baidu.palo.common.util.PrintableMap; +import com.baidu.palo.external.EsUtil; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -78,6 +79,7 @@ public class CreateTableStmt extends DdlStmt implements Writable { engineNames.add("mysql"); engineNames.add("kudu"); engineNames.add("broker"); + engineNames.add("elasticsearch"); } // for backup. 
set to -1 for normal use @@ -187,7 +189,7 @@ public class CreateTableStmt extends DdlStmt implements Writable { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); tableName.analyze(analyzer); FeNameFormat.checkTableName(tableName.getTbl()); @@ -305,6 +307,8 @@ public class CreateTableStmt extends DdlStmt implements Writable { distributionDesc.analyze(columnSet); } else if (engineName.equals("kudu")) { KuduUtil.analyzePartitionAndDistributionDesc(keysDesc, partitionDesc, distributionDesc); + } else if (engineName.equalsIgnoreCase("elasticsearch")) { + EsUtil.analyzePartitionAndDistributionDesc(partitionDesc, distributionDesc); } else { if (partitionDesc != null || distributionDesc != null) { throw new AnalysisException("Create " + engineName @@ -323,7 +327,8 @@ public class CreateTableStmt extends DdlStmt implements Writable { throw new AnalysisException("Unknown engine name: " + engineName); } - if (engineName.equals("mysql") || engineName.equals("broker")) { + if (engineName.equals("mysql") || engineName.equals("broker") + || engineName.equals("elasticsearch")) { if (!isExternal) { // this is for compatibility isExternal = true; diff --git a/fe/src/main/java/com/baidu/palo/analysis/CreateUserStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CreateUserStmt.java index 293e9ce33d..385fe74c76 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CreateUserStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CreateUserStmt.java @@ -26,7 +26,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.MysqlPassword; import com.baidu.palo.mysql.privilege.PaloRole; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -102,12 +102,13 @@ public class CreateUserStmt extends DdlStmt { return userIdent; } + @Override public boolean needAuditEncryption() { return true; } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); userIdent.analyze(analyzer.getClusterName()); // convert plain password to hashed password diff --git a/fe/src/main/java/com/baidu/palo/analysis/CreateViewStmt.java b/fe/src/main/java/com/baidu/palo/analysis/CreateViewStmt.java index 1dc2501688..9106659bd0 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/CreateViewStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/CreateViewStmt.java @@ -24,15 +24,13 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.catalog.PrimitiveType; -import com.baidu.palo.catalog.ScalarType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; -import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Sets; @@ -85,41 +83,32 @@ public class CreateViewStmt extends DdlStmt { return inlineViewDef; 
} - private Column createScalarColumn(String name, ScalarType scalarType) { - final ColumnType columnType = ColumnType.createType(scalarType.getPrimitiveType()); - final Integer precision = scalarType.getPrecision(); - if (precision != null) { - columnType.setPrecision(precision); - } - final Integer digits = scalarType.getDecimalDigits(); - if (digits != null) { - columnType.setScale(digits); - } - return new Column(name, columnType); - } - /** * Sets the originalViewDef and the expanded inlineViewDef based on viewDefStmt. * If columnNames were given, checks that they do not contain duplicate column names * and throws an exception if they do. */ - private void createColumnAndViewDefs(Analyzer analyzer) throws AnalysisException, InternalException { - - List newColumnNames = columnNames; - if (newColumnNames != null) { - if (newColumnNames.size() != viewDefStmt.getColLabels().size()) { + private void createColumnAndViewDefs(Analyzer analyzer) throws AnalysisException, UserException { + if (columnNames != null) { + if (columnNames.size() != viewDefStmt.getColLabels().size()) { ErrorReport.reportAnalysisException(ErrorCode.ERR_VIEW_WRONG_LIST); } + // TODO(zc): type + for (int i = 0; i < columnNames.size(); ++i) { + PrimitiveType type = viewDefStmt.getBaseTblResultExprs().get(i).getType().getPrimitiveType(); + finalCols.add(new Column( + columnNames.get(i), + ColumnType.createType(type))); + } } else { - newColumnNames = viewDefStmt.getColLabels(); + // TODO(zc): type + for (int i = 0; i < viewDefStmt.getBaseTblResultExprs().size(); ++i) { + PrimitiveType type = viewDefStmt.getBaseTblResultExprs().get(i).getType().getPrimitiveType(); + finalCols.add(new Column( + viewDefStmt.getColLabels().get(i), + ColumnType.createType(type))); + } } - - for (int i = 0; i < viewDefStmt.getBaseTblResultExprs().size(); ++i) { - Preconditions.checkState(viewDefStmt.getBaseTblResultExprs().get(i).getType() instanceof ScalarType); - final ScalarType scalarType = (ScalarType)viewDefStmt.getBaseTblResultExprs().get(i).getType(); - finalCols.add(createScalarColumn(newColumnNames.get(i), scalarType)); - } - // Set for duplicate columns Set colSets = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER); for (Column col : finalCols) { @@ -159,7 +148,7 @@ public class CreateViewStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (columnNames != null) { cloneStmt = viewDefStmt.clone(); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/DeleteStmt.java b/fe/src/main/java/com/baidu/palo/analysis/DeleteStmt.java index b0aa53ad04..4de8febcf8 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/DeleteStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/DeleteStmt.java @@ -25,7 +25,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.PrintableMap; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -74,7 +74,7 @@ public class DeleteStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (tbl == null) { diff 
--git a/fe/src/main/java/com/baidu/palo/analysis/DescribeStmt.java b/fe/src/main/java/com/baidu/palo/analysis/DescribeStmt.java index 498be872a0..94db5f4ee5 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/DescribeStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/DescribeStmt.java @@ -31,7 +31,7 @@ import com.baidu.palo.catalog.Table.TableType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.proc.ProcNodeInterface; import com.baidu.palo.common.proc.ProcResult; import com.baidu.palo.common.proc.ProcService; @@ -95,7 +95,7 @@ public class DescribeStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { dbTableName.analyze(analyzer); if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbTableName.getDb(), diff --git a/fe/src/main/java/com/baidu/palo/analysis/DropClusterStmt.java b/fe/src/main/java/com/baidu/palo/analysis/DropClusterStmt.java index f6ed0bcdf0..fa101ca544 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/DropClusterStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/DropClusterStmt.java @@ -19,7 +19,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; import com.baidu.palo.system.SystemInfoService; @@ -36,7 +36,7 @@ public class DropClusterStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (Strings.isNullOrEmpty(name)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_CLUSTER_NAME_NULL); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/DropDbStmt.java b/fe/src/main/java/com/baidu/palo/analysis/DropDbStmt.java index 2921243a76..f3e689bc00 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/DropDbStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/DropDbStmt.java @@ -21,7 +21,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -46,7 +46,7 @@ public class DropDbStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_WRONG_DB_NAME, dbName); diff --git a/fe/src/main/java/com/baidu/palo/analysis/DropFunctionStmt.java b/fe/src/main/java/com/baidu/palo/analysis/DropFunctionStmt.java index 0a459bedf8..ec21a55611 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/DropFunctionStmt.java +++ 
b/fe/src/main/java/com/baidu/palo/analysis/DropFunctionStmt.java @@ -16,7 +16,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; /** * Created by zhaochun on 14-7-30. @@ -29,7 +29,7 @@ public class DropFunctionStmt extends StatementBase { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/DropRepositoryStmt.java b/fe/src/main/java/com/baidu/palo/analysis/DropRepositoryStmt.java index 0658ecb9d1..63a6cf327a 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/DropRepositoryStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/DropRepositoryStmt.java @@ -21,11 +21,10 @@ package com.baidu.palo.analysis; import com.baidu.palo.catalog.Catalog; -import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -42,7 +41,7 @@ public class DropRepositoryStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); // check auth diff --git a/fe/src/main/java/com/baidu/palo/analysis/DropRoleStmt.java b/fe/src/main/java/com/baidu/palo/analysis/DropRoleStmt.java index 795d023262..758adcd26b 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/DropRoleStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/DropRoleStmt.java @@ -16,9 +16,8 @@ package com.baidu.palo.analysis; import com.baidu.palo.cluster.ClusterNamespace; -import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; public class DropRoleStmt extends DdlStmt { @@ -33,7 +32,7 @@ public class DropRoleStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); FeNameFormat.checkRoleName(role, false /* can not be superuser */, "Can not drop role"); role = ClusterNamespace.getFullName(analyzer.getClusterName(), role); diff --git a/fe/src/main/java/com/baidu/palo/analysis/DropTableStmt.java b/fe/src/main/java/com/baidu/palo/analysis/DropTableStmt.java index cfc96dee7f..f6dc35727f 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/DropTableStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/DropTableStmt.java @@ -19,7 +19,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -60,7 +60,7 @@ public class DropTableStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws 
AnalysisException, UserException { if (Strings.isNullOrEmpty(tableName.getDb())) { tableName.setDb(analyzer.getDefaultDb()); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/DropUserStmt.java b/fe/src/main/java/com/baidu/palo/analysis/DropUserStmt.java index 80ec770091..32f4e3fd67 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/DropUserStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/DropUserStmt.java @@ -19,7 +19,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -36,7 +36,7 @@ public class DropUserStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); userIdent.analyze(analyzer.getClusterName()); diff --git a/fe/src/main/java/com/baidu/palo/analysis/EnterStmt.java b/fe/src/main/java/com/baidu/palo/analysis/EnterStmt.java index fa624a7280..a3ee7af432 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/EnterStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/EnterStmt.java @@ -23,7 +23,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.base.Strings; public class EnterStmt extends DdlStmt { @@ -35,7 +35,7 @@ public class EnterStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (Strings.isNullOrEmpty(name)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_CLUSTER_NAME_NULL); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/ExportStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ExportStmt.java index 2846d61083..b3a7719f2a 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ExportStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ExportStmt.java @@ -23,7 +23,7 @@ import com.baidu.palo.catalog.Table; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.PrintableMap; import com.baidu.palo.common.util.PropertyAnalyzer; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -104,6 +104,7 @@ public class ExportStmt extends StatementBase { return this.lineDelimiter; } + @Override public boolean needAuditEncryption() { if (brokerDesc != null) { return true; @@ -112,7 +113,7 @@ public class ExportStmt extends StatementBase { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); tableRef = analyzer.resolveTableRef(tableRef); diff --git a/fe/src/main/java/com/baidu/palo/analysis/Expr.java b/fe/src/main/java/com/baidu/palo/analysis/Expr.java index 90d90b2f5f..a3d5293577 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/Expr.java +++ 
b/fe/src/main/java/com/baidu/palo/analysis/Expr.java @@ -1147,14 +1147,17 @@ abstract public class Expr extends TreeNode implements ParseNode, Cloneabl return this; } - if ((targetType.isStringType() || targetType.isHllType()) && (this.type.isStringType() || this.type.isHllType())) { return this; } // Preconditions.checkState(PrimitiveType.isImplicitCast(type, targetType), "cast %s to %s", this.type, targetType); // TODO(zc): use implicit cast - Preconditions.checkState(Type.canCastTo(this.type, targetType), "cast %s to %s", this.type, targetType); + if (!Type.canCastTo(this.type, targetType)) { + throw new AnalysisException("type not match, originType=" + this.type + + ", targeType=" + targetType); + + } return uncheckedCastTo(targetType); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/FromClause.java b/fe/src/main/java/com/baidu/palo/analysis/FromClause.java index cc5cac70d6..286c47a462 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/FromClause.java +++ b/fe/src/main/java/com/baidu/palo/analysis/FromClause.java @@ -27,7 +27,7 @@ import java.util.Iterator; import java.util.List; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; @@ -76,7 +76,7 @@ public class FromClause implements ParseNode, Iterable { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (analyzed_) return; if (tableRefs_.isEmpty()) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/FunctionCallExpr.java b/fe/src/main/java/com/baidu/palo/analysis/FunctionCallExpr.java index d89e51dec9..e485932f9d 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/FunctionCallExpr.java +++ b/fe/src/main/java/com/baidu/palo/analysis/FunctionCallExpr.java @@ -390,6 +390,40 @@ public class FunctionCallExpr extends Expr { return; } + // Provide better error message for some aggregate builtins. These can be + // a bit more user friendly than a generic function not found. + // TODO: should we bother to do this? We could also improve the general + // error messages. For example, listing the alternatives. + protected String getFunctionNotFoundError(Type[] argTypes) { + // Some custom error message for builtins + if (fnParams.isStar()) { + return "'*' can only be used in conjunction with COUNT"; + } + + if (fnName.getFunction().equalsIgnoreCase("count")) { + if (!fnParams.isDistinct() && argTypes.length > 1) { + return "COUNT must have DISTINCT for multiple arguments: " + toSql(); + } + } + + if (fnName.getFunction().equalsIgnoreCase("sum")) { + return "SUM requires a numeric parameter: " + toSql(); + } + + if (fnName.getFunction().equalsIgnoreCase("avg")) { + return "AVG requires a numeric or timestamp parameter: " + toSql(); + } + + String[] argTypesSql = new String[argTypes.length]; + for (int i = 0; i < argTypes.length; ++i) { + argTypesSql[i] = argTypes[i].toSql(); + } + + return String.format( + "No matching function with signature: %s(%s).", + fnName, fnParams.isStar() ? 
"*" : Joiner.on(", ").join(argTypesSql)); + } + @Override public void analyzeImpl(Analyzer analyzer) throws AnalysisException { if (isMergeAggFn) { @@ -455,7 +489,7 @@ public class FunctionCallExpr extends Expr { if (fn == null) { LOG.warn("fn {} not exists", fnName.getFunction()); - throw new AnalysisException(fnName.getFunction() + "can't support"); + throw new AnalysisException(getFunctionNotFoundError(collectChildReturnTypes())); } if (fnName.getFunction().equals("from_unixtime")) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/GrantStmt.java b/fe/src/main/java/com/baidu/palo/analysis/GrantStmt.java index b9fb92d6a9..80373fbca4 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/GrantStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/GrantStmt.java @@ -27,7 +27,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.FeNameFormat; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloAuth.PrivLevel; import com.baidu.palo.mysql.privilege.PaloPrivilege; import com.baidu.palo.mysql.privilege.PrivBitSet; @@ -81,7 +81,7 @@ public class GrantStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (userIdent != null) { userIdent.analyze(analyzer.getClusterName()); diff --git a/fe/src/main/java/com/baidu/palo/analysis/InlineViewRef.java b/fe/src/main/java/com/baidu/palo/analysis/InlineViewRef.java index 87a0b67dc7..8bc328c803 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/InlineViewRef.java +++ b/fe/src/main/java/com/baidu/palo/analysis/InlineViewRef.java @@ -26,7 +26,7 @@ import com.baidu.palo.catalog.View; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.rewrite.ExprRewriter; import com.google.common.base.Preconditions; @@ -165,7 +165,7 @@ public class InlineViewRef extends TableRef { * then performs join clause analysis. */ @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (isAnalyzed) { return; } @@ -316,7 +316,7 @@ public class InlineViewRef extends TableRef { // } // } - protected void makeOutputNullable(Analyzer analyzer) throws AnalysisException, InternalException { + protected void makeOutputNullable(Analyzer analyzer) throws AnalysisException, UserException { try { makeOutputNullableHelper(analyzer, sMap); makeOutputNullableHelper(analyzer, baseTblSmap); @@ -359,7 +359,7 @@ public class InlineViewRef extends TableRef { * false otherwise. */ private boolean requiresNullWrapping(Analyzer analyzer, Expr expr, ExprSubstitutionMap nullSMap) - throws InternalException { + throws UserException { // If the expr is already wrapped in an IF(TupleIsNull(), NULL, expr) // then do not try to execute it. 
if (expr.contains(TupleIsNullPredicate.class)) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/InsertStmt.java b/fe/src/main/java/com/baidu/palo/analysis/InsertStmt.java index 438e0d850c..99b77f1c2f 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/InsertStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/InsertStmt.java @@ -33,13 +33,16 @@ import com.baidu.palo.catalog.Type; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.planner.DataPartition; import com.baidu.palo.planner.DataSink; import com.baidu.palo.planner.DataSplitSink; import com.baidu.palo.planner.ExportSink; +import com.baidu.palo.planner.OlapTableSink; import com.baidu.palo.qe.ConnectContext; +import com.baidu.palo.thrift.TUniqueId; +import com.baidu.palo.transaction.TransactionState.LoadJobSourceType; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -52,6 +55,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.UUID; // InsertStmt used to public class InsertStmt extends DdlStmt { @@ -59,6 +63,7 @@ public class InsertStmt extends DdlStmt { public static final String SHUFFLE_HINT = "SHUFFLE"; public static final String NOSHUFFLE_HINT = "NOSHUFFLE"; + public static final String STREAMING = "STREAMING"; private final TableName tblName; private final Set targetPartitions; @@ -66,6 +71,9 @@ public class InsertStmt extends DdlStmt { private final QueryStmt queryStmt; private final List planHints; private Boolean isRepartition; + private boolean isStreaming = false; + + private Map indexIdToSchemaHash = null; // set after parse all columns and expr in query statement // this result expr in the order of target table's columns @@ -75,6 +83,9 @@ public class InsertStmt extends DdlStmt { private Table targetTable; + private Database db; + private long transactionId; + // we need a new TupleDesc for olap table. 
private TupleDescriptor olapTuple; @@ -116,6 +127,14 @@ public class InsertStmt extends DdlStmt { this.targetTable = targetTable; } + public Map getIndexIdToSchemaHash() { + return this.indexIdToSchemaHash; + } + + public long getTransactionId() { + return this.transactionId; + } + public Boolean isRepartition() { return isRepartition; } @@ -153,8 +172,21 @@ public class InsertStmt extends DdlStmt { return queryStmt; } + public boolean isStreaming() { + return isStreaming; + } + + // Only valid when this statement is streaming + public OlapTableSink getOlapTableSink() { + return (OlapTableSink) dataSink; + } + + public Database getDbObj() { + return db; + } + @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); if (targetTable == null) { @@ -183,6 +215,27 @@ public class InsertStmt extends DdlStmt { // create data sink createDataSink(); + + if (targetTable instanceof OlapTable) { + String dbName = tblName.getDb(); + // check exist + db = analyzer.getCatalog().getDb(dbName); + // although the insert stmt maybe failed at next stage, but has to begin transaction here + // if get transactionid at add job stage, the transaction id maybe a little larger, it maybe error at alter job to check + // if all previous job finished + UUID uuid = UUID.randomUUID(); + String jobLabel = "insert_" + uuid; + LoadJobSourceType sourceType = isStreaming ? LoadJobSourceType.INSERT_STREAMING + : LoadJobSourceType.FRONTEND; + transactionId = Catalog.getCurrentGlobalTransactionMgr().beginTransaction(db.getId(), + jobLabel, + "fe", sourceType); + if (isStreaming) { + OlapTableSink sink = (OlapTableSink) dataSink; + TUniqueId loadId = new TUniqueId(uuid.getMostSignificantBits(), uuid.getLeastSignificantBits()); + sink.init(loadId, transactionId, db.getId()); + } + } LOG.info("analyzer is ", analyzer.getDescTbl().debugString()); } @@ -229,6 +282,8 @@ public class InsertStmt extends DdlStmt { slotDesc.setIsNullable(false); } } + // will use it during create load job + indexIdToSchemaHash = olapTable.getIndexIdToSchemaHash(); } else if (targetTable instanceof MysqlTable) { if (targetPartitions != null) { ErrorReport.reportAnalysisException(ErrorCode.ERR_PARTITION_CLAUSE_NO_ALLOWED); @@ -270,7 +325,7 @@ public class InsertStmt extends DdlStmt { } } - public void analyzeSubquery(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyzeSubquery(Analyzer analyzer) throws UserException { queryStmt.setFromInsert(true); // parse query statement queryStmt.analyze(analyzer); @@ -307,20 +362,25 @@ public class InsertStmt extends DdlStmt { if (planHints == null) { return; } - if (!planHints.isEmpty() && !targetTable.isPartitioned()) { - ErrorReport.reportAnalysisException(ErrorCode.ERR_INSERT_HINT_NOT_SUPPORT); - } for (String hint : planHints) { if (SHUFFLE_HINT.equalsIgnoreCase(hint)) { + if (!targetTable.isPartitioned()) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_INSERT_HINT_NOT_SUPPORT); + } if (isRepartition != null && !isRepartition) { ErrorReport.reportAnalysisException(ErrorCode.ERR_PLAN_HINT_CONFILT, hint); } isRepartition = Boolean.TRUE; } else if (NOSHUFFLE_HINT.equalsIgnoreCase(hint)) { + if (!targetTable.isPartitioned()) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_INSERT_HINT_NOT_SUPPORT); + } if (isRepartition != null && isRepartition) { ErrorReport.reportAnalysisException(ErrorCode.ERR_PLAN_HINT_CONFILT, hint); } isRepartition = 
Boolean.FALSE; + } else if (STREAMING.equalsIgnoreCase(hint)) { + isStreaming = true; } else { ErrorReport.reportAnalysisException(ErrorCode.ERR_UNKNOWN_PLAN_HINT, hint); } @@ -345,6 +405,7 @@ public class InsertStmt extends DdlStmt { } else { throw new AnalysisException(hllMismatchLog); } + } if (col.getDataType().equals(expr.getType())) { @@ -354,7 +415,7 @@ public class InsertStmt extends DdlStmt { } private void prepareExpressions(List targetCols, List selectList, Analyzer analyzer) - throws AnalysisException { + throws UserException { // check type compatibility int numCols = targetCols.size(); for (int i = 0; i < numCols; ++i) { @@ -378,8 +439,13 @@ public class InsertStmt extends DdlStmt { return dataSink; } if (targetTable instanceof OlapTable) { - dataSink = new DataSplitSink((OlapTable) targetTable, olapTuple); - dataPartition = dataSink.getOutputPartition(); + if (isStreaming) { + dataSink = new OlapTableSink((OlapTable) targetTable, olapTuple); + dataPartition = dataSink.getOutputPartition(); + } else { + dataSink = new DataSplitSink((OlapTable) targetTable, olapTuple); + dataPartition = dataSink.getOutputPartition(); + } } else if (targetTable instanceof BrokerTable) { BrokerTable table = (BrokerTable) targetTable; // TODO(lingbin): think use which one if have more than one path @@ -399,6 +465,12 @@ public class InsertStmt extends DdlStmt { return dataSink; } + public void finalize() throws UserException { + if (isStreaming) { + ((OlapTableSink) dataSink).finalize(); + } + } + public ArrayList getResultExprs() { return resultExprs; } diff --git a/fe/src/main/java/com/baidu/palo/analysis/LargeIntLiteral.java b/fe/src/main/java/com/baidu/palo/analysis/LargeIntLiteral.java index 9438593e8d..fa8849d28f 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/LargeIntLiteral.java +++ b/fe/src/main/java/com/baidu/palo/analysis/LargeIntLiteral.java @@ -192,11 +192,14 @@ public class LargeIntLiteral extends LiteralExpr { return new FloatLiteral(new Double(value.doubleValue()), targetType); } else if (targetType.isDecimal()) { return new DecimalLiteral(new BigDecimal(value)); - } else if (!targetType.isNumericType()) { - return super.uncheckedCastTo(targetType); + } else if (targetType.isNumericType()) { + try { + return new IntLiteral(value.longValueExact(), targetType); + } catch (ArithmeticException e) { + throw new AnalysisException("Number out of range[" + value + "]. 
type: " + targetType); + } } - - return this; + return super.uncheckedCastTo(targetType); } @Override diff --git a/fe/src/main/java/com/baidu/palo/analysis/LinkDbStmt.java b/fe/src/main/java/com/baidu/palo/analysis/LinkDbStmt.java index be504df7ea..ace139b7c7 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/LinkDbStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/LinkDbStmt.java @@ -25,7 +25,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -62,7 +62,7 @@ public class LinkDbStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { src.analyze(analyzer); dest.analyze(analyzer); diff --git a/fe/src/main/java/com/baidu/palo/analysis/LoadStmt.java b/fe/src/main/java/com/baidu/palo/analysis/LoadStmt.java index 6bf18012b4..3359c5c2ee 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/LoadStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/LoadStmt.java @@ -22,7 +22,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.DdlException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.PrintableMap; import com.baidu.palo.qe.ConnectContext; @@ -41,7 +41,7 @@ import java.util.Map.Entry; // LOAD LABEL load_label // (data_desc, ...) // [BY cluster] -// [PROPERTIES (key1=value1, )] +// [PROPERTIES (key1=value1, )] // // load_label: // db_name.label_name @@ -179,7 +179,7 @@ public class LoadStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); label.analyze(analyzer); if (dataDescriptions == null || dataDescriptions.isEmpty()) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/MigrateDbStmt.java b/fe/src/main/java/com/baidu/palo/analysis/MigrateDbStmt.java index 09c0d16b89..62d6e594ac 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/MigrateDbStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/MigrateDbStmt.java @@ -25,7 +25,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -60,7 +60,7 @@ public class MigrateDbStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { src.analyze(analyzer); dest.analyze(analyzer); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ParseNode.java b/fe/src/main/java/com/baidu/palo/analysis/ParseNode.java index 8f1ccbc1bd..a4a524585b 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ParseNode.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ParseNode.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import 
com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; public interface ParseNode { @@ -32,7 +32,7 @@ public interface ParseNode { * @param analyzer * @throws AnalysisException, InternalException */ - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException; + public void analyze(Analyzer analyzer) throws AnalysisException, UserException; /** * @return SQL syntax corresponding to this node. diff --git a/fe/src/main/java/com/baidu/palo/analysis/PartitionDesc.java b/fe/src/main/java/com/baidu/palo/analysis/PartitionDesc.java index f984648193..f79cb6a905 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/PartitionDesc.java +++ b/fe/src/main/java/com/baidu/palo/analysis/PartitionDesc.java @@ -35,7 +35,6 @@ import java.io.DataOutput; import java.io.IOException; import java.util.List; import java.util.Map; -import java.util.Set; public class PartitionDesc implements Writable { diff --git a/fe/src/main/java/com/baidu/palo/analysis/QueryStmt.java b/fe/src/main/java/com/baidu/palo/analysis/QueryStmt.java index 5b0a8d6231..bd66535a21 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/QueryStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/QueryStmt.java @@ -24,7 +24,7 @@ import com.baidu.palo.catalog.Database; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; @@ -111,7 +111,7 @@ public abstract class QueryStmt extends StatementBase { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (isAnalyzed()) return; super.analyze(analyzer); analyzeLimit(analyzer); @@ -520,6 +520,7 @@ public abstract class QueryStmt extends StatementBase { ambiguousAliasList.clear(); sortInfo = null; evaluateOrderBy = false; + fromInsert = false; } public void setFromInsert(boolean value) { @@ -530,5 +531,5 @@ public abstract class QueryStmt extends StatementBase { public abstract QueryStmt clone(); public abstract void substituteSelectList(Analyzer analyzer, List newColLabels) - throws AnalysisException, InternalException; + throws AnalysisException, UserException; } diff --git a/fe/src/main/java/com/baidu/palo/analysis/RecoverDbStmt.java b/fe/src/main/java/com/baidu/palo/analysis/RecoverDbStmt.java index dddbc32377..eace7d4068 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/RecoverDbStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/RecoverDbStmt.java @@ -26,7 +26,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloPrivilege; import com.baidu.palo.mysql.privilege.PrivBitSet; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -46,7 +46,7 @@ public class RecoverDbStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { 
ErrorReport.reportAnalysisException(ErrorCode.ERR_WRONG_DB_NAME, dbName); diff --git a/fe/src/main/java/com/baidu/palo/analysis/RecoverPartitionStmt.java b/fe/src/main/java/com/baidu/palo/analysis/RecoverPartitionStmt.java index b4d753be8d..ca4fcb9769 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/RecoverPartitionStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/RecoverPartitionStmt.java @@ -25,7 +25,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloPrivilege; import com.baidu.palo.mysql.privilege.PrivBitSet; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -55,7 +55,7 @@ public class RecoverPartitionStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { dbTblName.analyze(analyzer); if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbTblName.getDb(), dbTblName.getTbl(), diff --git a/fe/src/main/java/com/baidu/palo/analysis/RecoverTableStmt.java b/fe/src/main/java/com/baidu/palo/analysis/RecoverTableStmt.java index 00e1bcc660..b57a7caad0 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/RecoverTableStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/RecoverTableStmt.java @@ -25,7 +25,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloPrivilege; import com.baidu.palo.mysql.privilege.PrivBitSet; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -49,7 +49,7 @@ public class RecoverTableStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { dbTblName.analyze(analyzer); if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbTblName.getDb(), diff --git a/fe/src/main/java/com/baidu/palo/analysis/RestoreStmt.java b/fe/src/main/java/com/baidu/palo/analysis/RestoreStmt.java index 0cc4d495ad..abe4eae7af 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/RestoreStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/RestoreStmt.java @@ -24,7 +24,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.FeConstants; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.PrintableMap; import com.google.common.base.Joiner; @@ -61,7 +61,7 @@ public class RestoreStmt extends AbstractBackupStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); // check if alias is duplicated diff --git a/fe/src/main/java/com/baidu/palo/analysis/SelectStmt.java b/fe/src/main/java/com/baidu/palo/analysis/SelectStmt.java index 6db1b917b8..be6f62a5aa 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/SelectStmt.java +++ 
b/fe/src/main/java/com/baidu/palo/analysis/SelectStmt.java @@ -32,7 +32,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ColumnAliasGenerator; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.Pair; import com.baidu.palo.common.TableAliasGenerator; import com.baidu.palo.common.TreeNode; @@ -266,7 +266,7 @@ public class SelectStmt extends QueryStmt { return tableAliasGenerator; } - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (isAnalyzed()) return; super.analyze(analyzer); @@ -1254,7 +1254,7 @@ public class SelectStmt extends QueryStmt { @Override public void substituteSelectList(Analyzer analyzer, List newColLabels) - throws AnalysisException, InternalException { + throws AnalysisException, UserException { // start out with table refs to establish aliases TableRef leftTblRef = null; // the one to the left of tblRef for (int i = 0; i < fromClause_.size(); ++i) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/SetStmt.java b/fe/src/main/java/com/baidu/palo/analysis/SetStmt.java index 5a75077132..b237212db0 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/SetStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/SetStmt.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import java.util.List; @@ -49,7 +49,7 @@ public class SetStmt extends StatementBase { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { if (setVars == null || setVars.isEmpty()) { throw new AnalysisException("Empty set statement."); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/SetUserPropertyStmt.java b/fe/src/main/java/com/baidu/palo/analysis/SetUserPropertyStmt.java index d5a056a8a8..bc65877a7f 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/SetUserPropertyStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/SetUserPropertyStmt.java @@ -22,8 +22,8 @@ package com.baidu.palo.analysis; import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.Pair; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -60,7 +60,7 @@ public class SetUserPropertyStmt extends DdlStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(user)) { // If param 'user' is not set, use the login user name. 
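The analyzer changes above are largely mechanical: analyze() signatures drop InternalException in favor of UserException, and several narrow to a single UserException. Below is a minimal, self-contained sketch of the hierarchy this presupposes; only the names UserException and AnalysisException come from the patch, the surrounding class and method are illustrative stand-ins.

    // Sketch of the exception-hierarchy assumption behind the throws-clause changes.
    // Assumption: AnalysisException is (or becomes) a subclass of UserException, so a
    // single "throws UserException" covers both analysis and former internal failures.
    public class ExceptionMigrationSketch {

        static class UserException extends Exception {
            UserException(String msg) { super(msg); }
        }

        static class AnalysisException extends UserException {
            AnalysisException(String msg) { super(msg); }
        }

        // Old shape: void analyze(...) throws AnalysisException, InternalException
        // New shape: declaring UserException is sufficient once AnalysisException extends it.
        static void analyze(String dbName) throws UserException {
            if (dbName == null || dbName.isEmpty()) {
                throw new AnalysisException("No database selected");
            }
        }

        public static void main(String[] args) {
            try {
                analyze("");
            } catch (UserException e) { // one handler now covers both failure kinds
                System.out.println("analyze failed: " + e.getMessage());
            }
        }
    }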
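A few hunks earlier, LargeIntLiteral.uncheckedCastTo() is rewritten so that an integral target type goes through an explicit range check instead of falling through and returning the literal unchanged. The stand-alone sketch below shows that check using BigInteger.longValueExact(), which throws ArithmeticException on overflow; the wrapper class and the IllegalArgumentException used here are stand-ins for the real FE types.

    import java.math.BigInteger;

    // Stand-alone sketch of the overflow check: longValueExact() throws ArithmeticException
    // when the value does not fit in 64 bits, which the patch converts to an analysis error.
    public class LargeIntCastSketch {

        static long castToLong(BigInteger value) {
            try {
                return value.longValueExact();
            } catch (ArithmeticException e) {
                // The real code throws AnalysisException("Number out of range[...]. type: ...")
                throw new IllegalArgumentException("Number out of range[" + value + "]", e);
            }
        }

        public static void main(String[] args) {
            System.out.println(castToLong(BigInteger.valueOf(42))); // fits: prints 42
            try {
                castToLong(new BigInteger("170141183460469231731687303715884105727")); // 2^127 - 1
            } catch (IllegalArgumentException e) {
                System.out.println(e.getMessage()); // reported as out of range
            }
        }
    }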
diff --git a/fe/src/main/java/com/baidu/palo/analysis/SetVar.java b/fe/src/main/java/com/baidu/palo/analysis/SetVar.java index a7084fa5cd..a83cd4f030 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/SetVar.java +++ b/fe/src/main/java/com/baidu/palo/analysis/SetVar.java @@ -24,7 +24,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.mysql.privilege.UserResource; import com.baidu.palo.qe.ConnectContext; @@ -70,7 +70,7 @@ public class SetVar { } // Value can be null. When value is null, means to set variable to DEFAULT. - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (type == null) { type = SetType.DEFAULT; } diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowAlterStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowAlterStmt.java index 0597aa558b..2fc4448e1b 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowAlterStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowAlterStmt.java @@ -22,7 +22,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.proc.ProcNodeInterface; import com.baidu.palo.common.proc.ProcService; import com.baidu.palo.common.proc.RollupProcDir; @@ -71,7 +71,7 @@ public class ShowAlterStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); @@ -89,7 +89,7 @@ public class ShowAlterStmt extends ShowStmt { handleShowAlterTable(analyzer); } - private void handleShowAlterTable(Analyzer analyzer) throws AnalysisException, InternalException { + private void handleShowAlterTable(Analyzer analyzer) throws AnalysisException, UserException { final String dbNameWithoutPrefix = ClusterNamespace.getNameFromFullName(dbName); Database db = analyzer.getCatalog().getDb(dbName); if (db == null) { @@ -105,7 +105,7 @@ public class ShowAlterStmt extends ShowStmt { } else if (type == AlterType.ROLLUP) { sb.append("/rollup"); } else { - throw new InternalException("SHOW " + type.name() + " does not implement yet"); + throw new UserException("SHOW " + type.name() + " does not implement yet"); } LOG.debug("process SHOW PROC '{}';", sb.toString()); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowBackendsStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowBackendsStmt.java index 0099772e10..663ad08712 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowBackendsStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowBackendsStmt.java @@ -21,7 +21,6 @@ import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.proc.BackendsProcDir; import 
com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -33,7 +32,7 @@ public class ShowBackendsStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN) && !Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.OPERATOR)) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowBackupStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowBackupStmt.java index 1684b54fcc..1c94df1211 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowBackupStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowBackupStmt.java @@ -19,10 +19,9 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.cluster.ClusterNamespace; -import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; import com.baidu.palo.qe.ShowResultSetMetaData; @@ -48,7 +47,7 @@ public class ShowBackupStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowBrokerStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowBrokerStmt.java index 0277d588d7..6d16b6cdf4 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowBrokerStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowBrokerStmt.java @@ -21,7 +21,6 @@ import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; import com.baidu.palo.qe.ShowResultSetMetaData; @@ -38,7 +37,7 @@ public class ShowBrokerStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN) && !Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.OPERATOR)) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowClusterStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowClusterStmt.java index 3b0a9bec22..8d39a26588 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowClusterStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowClusterStmt.java @@ -22,7 +22,6 @@ import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.mysql.privilege.PaloPrivilege; import com.baidu.palo.mysql.privilege.PrivBitSet; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ 
-56,7 +55,7 @@ public class ShowClusterStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.of(PrivBitSet.of(PaloPrivilege.ADMIN_PRIV, PaloPrivilege.NODE_PRIV), diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowCreateDbStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowCreateDbStmt.java index ba87d6280e..4e0e2d691d 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowCreateDbStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowCreateDbStmt.java @@ -23,7 +23,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloPrivilege; import com.baidu.palo.mysql.privilege.PrivBitSet; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -53,7 +53,7 @@ public class ShowCreateDbStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(db)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_WRONG_DB_NAME, db); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowDataStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowDataStmt.java index bccb892ffe..dec7eebd93 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowDataStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowDataStmt.java @@ -31,7 +31,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.Pair; import com.baidu.palo.common.util.DebugUtil; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -76,7 +76,7 @@ public class ShowDataStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowDbStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowDbStmt.java index bd318b34c0..a4e5b85628 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowDbStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowDbStmt.java @@ -19,7 +19,7 @@ import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.catalog.InfoSchemaDb; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.qe.ShowResultSetMetaData; import com.google.common.collect.Lists; @@ -50,7 +50,7 @@ public class ShowDbStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); } diff --git 
a/fe/src/main/java/com/baidu/palo/analysis/ShowDeleteStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowDeleteStmt.java index 1fb0af34ff..0f032bb065 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowDeleteStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowDeleteStmt.java @@ -21,7 +21,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.proc.DeleteInfoProcDir; import com.baidu.palo.qe.ShowResultSetMetaData; @@ -40,7 +40,7 @@ public class ShowDeleteStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowExportStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowExportStmt.java index 758683ffec..0a0fdbd460 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowExportStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowExportStmt.java @@ -22,7 +22,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.proc.ExportProcNode; import com.baidu.palo.common.util.OrderByPair; import com.baidu.palo.load.ExportJob.JobState; @@ -90,7 +90,7 @@ public class ShowExportStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowFrontendsStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowFrontendsStmt.java index 2c99f0ca98..e1650cde6c 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowFrontendsStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowFrontendsStmt.java @@ -21,7 +21,6 @@ import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.proc.FrontendsProcNode; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -33,7 +32,7 @@ public class ShowFrontendsStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN) && !Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.OPERATOR)) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowGrantsStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowGrantsStmt.java index 331328e3cc..7bc07085cd 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowGrantsStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowGrantsStmt.java @@ -21,7 +21,6 @@ 
import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.proc.AuthProcDir; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -68,7 +67,7 @@ public class ShowGrantsStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (userIdent != null) { if (isAll) { throw new AnalysisException("Can not specified keyword ALL when specified user"); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowLoadStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowLoadStmt.java index 26012ed987..799fe6c9e0 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowLoadStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowLoadStmt.java @@ -22,7 +22,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.proc.LoadProcDir; import com.baidu.palo.common.util.OrderByPair; import com.baidu.palo.load.LoadJob.JobState; @@ -106,7 +106,7 @@ public class ShowLoadStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowLoadWarningsStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowLoadWarningsStmt.java index 8fee298109..6203b5c5c1 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowLoadWarningsStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowLoadWarningsStmt.java @@ -21,7 +21,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.qe.ShowResultSetMetaData; import com.google.common.base.Strings; @@ -85,7 +85,7 @@ public class ShowLoadWarningsStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowMigrationsStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowMigrationsStmt.java index 44537c8822..86c490aec4 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowMigrationsStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowMigrationsStmt.java @@ -21,7 +21,6 @@ import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; import com.baidu.palo.qe.ShowResultSetMetaData; @@ -56,7 
+55,7 @@ public class ShowMigrationsStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "ADMIN"); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowPartitionsStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowPartitionsStmt.java index 928911434e..facf23c9d0 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowPartitionsStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowPartitionsStmt.java @@ -25,7 +25,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.proc.ProcNodeInterface; import com.baidu.palo.common.proc.ProcResult; import com.baidu.palo.common.proc.ProcService; @@ -70,7 +70,7 @@ public class ShowPartitionsStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowProcStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowProcStmt.java index fc37c34726..2dd1f4b5a1 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowProcStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowProcStmt.java @@ -21,7 +21,6 @@ import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.proc.ProcNodeInterface; import com.baidu.palo.common.proc.ProcResult; import com.baidu.palo.common.proc.ProcService; @@ -43,7 +42,7 @@ public class ShowProcStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "ADMIN"); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowRestoreStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowRestoreStmt.java index d8bb7f1fe8..e0ff4550b9 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowRestoreStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowRestoreStmt.java @@ -19,10 +19,9 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.cluster.ClusterNamespace; -import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; import com.baidu.palo.qe.ShowResultSetMetaData; @@ -57,7 +56,7 @@ public class ShowRestoreStmt extends ShowStmt { } @Override - public 
void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowRolesStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowRolesStmt.java index 1e86f04aa6..44884be54a 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowRolesStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowRolesStmt.java @@ -21,7 +21,6 @@ import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; import com.baidu.palo.qe.ShowResultSetMetaData; @@ -45,7 +44,7 @@ public class ShowRolesStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { if (!Catalog.getCurrentCatalog().getAuth().checkGlobalPriv(ConnectContext.get(), PrivPredicate.GRANT)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "GRANT"); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowSnapshotStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowSnapshotStmt.java index 531a628164..9655ff04cf 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowSnapshotStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowSnapshotStmt.java @@ -24,7 +24,7 @@ import com.baidu.palo.analysis.CompoundPredicate.Operator; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.qe.ShowResultSetMetaData; import com.google.common.base.Strings; @@ -49,7 +49,7 @@ public class ShowSnapshotStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); // analyze where clause if not null diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowTabletStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowTabletStmt.java index c646e14651..042c390901 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowTabletStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowTabletStmt.java @@ -22,7 +22,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.proc.TabletsProcDir; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -68,7 +68,7 @@ public class ShowTabletStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (!isShowSingleTablet && Strings.isNullOrEmpty(dbName)) { dbName = analyzer.getDefaultDb(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowUserPropertyStmt.java 
b/fe/src/main/java/com/baidu/palo/analysis/ShowUserPropertyStmt.java index ce588b78e7..d748f3ecb8 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowUserPropertyStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowUserPropertyStmt.java @@ -23,7 +23,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.CaseSensibility; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.PatternMatcher; import com.baidu.palo.common.proc.UserPropertyProcNode; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -53,7 +53,7 @@ public class ShowUserPropertyStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(user)) { user = analyzer.getQualifiedUser(); diff --git a/fe/src/main/java/com/baidu/palo/analysis/ShowUserStmt.java b/fe/src/main/java/com/baidu/palo/analysis/ShowUserStmt.java index c855d0d5aa..634929f390 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/ShowUserStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/ShowUserStmt.java @@ -3,7 +3,6 @@ package com.baidu.palo.analysis; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.proc.AuthProcDir; import com.baidu.palo.qe.ShowResultSetMetaData; @@ -29,7 +28,7 @@ public class ShowUserStmt extends ShowStmt { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException { user = analyzer.getQualifiedUser(); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/SlotDescriptor.java b/fe/src/main/java/com/baidu/palo/analysis/SlotDescriptor.java index 98806769bf..25fb04fb7c 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/SlotDescriptor.java +++ b/fe/src/main/java/com/baidu/palo/analysis/SlotDescriptor.java @@ -62,6 +62,8 @@ public class SlotDescriptor { private ColumnStats stats; // only set if 'column' isn't set private boolean isAgg; private boolean isMultiRef; + // used for load to get more information of varchar and decimal + private Type originType; public SlotDescriptor(SlotId id, TupleDescriptor parent) { this.id = id; @@ -144,6 +146,7 @@ public class SlotDescriptor { public void setColumn(Column column) { this.column = column; this.type = column.getType(); + this.originType = column.getOriginType(); } public boolean isMaterialized() { @@ -241,9 +244,15 @@ public class SlotDescriptor { // TODO public TSlotDescriptor toThrift() { - return new TSlotDescriptor(id.asInt(), parent.getId().asInt(), type.toThrift(), -1, - byteOffset, nullIndicatorByte, - nullIndicatorBit, ((column != null) ? column.getName() : ""), slotIdx, isMaterialized); + if (originType != null) { + return new TSlotDescriptor(id.asInt(), parent.getId().asInt(), originType.toThrift(), -1, + byteOffset, nullIndicatorByte, + nullIndicatorBit, ((column != null) ? column.getName() : ""), slotIdx, isMaterialized); + } else { + return new TSlotDescriptor(id.asInt(), parent.getId().asInt(), type.toThrift(), -1, + byteOffset, nullIndicatorByte, + nullIndicatorBit, ((column != null) ? 
column.getName() : ""), slotIdx, isMaterialized); + } } public String debugString() { diff --git a/fe/src/main/java/com/baidu/palo/analysis/StatementBase.java b/fe/src/main/java/com/baidu/palo/analysis/StatementBase.java index 8dda309e89..7e3be21e8c 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/StatementBase.java +++ b/fe/src/main/java/com/baidu/palo/analysis/StatementBase.java @@ -24,7 +24,7 @@ import com.baidu.palo.catalog.Type; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.rewrite.ExprRewriter; import com.google.common.base.Preconditions; @@ -66,7 +66,7 @@ public abstract class StatementBase implements ParseNode { * It is up to the analysis() implementation to ensure the maximum number of missing * tables/views get collected in the Analyzer before failing analyze(). */ - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (isAnalyzed()) return; if (isExplain) analyzer.setIsExplain(); this.analyzer = analyzer; diff --git a/fe/src/main/java/com/baidu/palo/analysis/StmtRewriter.java b/fe/src/main/java/com/baidu/palo/analysis/StmtRewriter.java index 7f598c539e..512b180340 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/StmtRewriter.java +++ b/fe/src/main/java/com/baidu/palo/analysis/StmtRewriter.java @@ -29,7 +29,7 @@ import org.slf4j.LoggerFactory; import com.baidu.palo.catalog.Type; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.base.Preconditions; import com.google.common.base.Predicates; import com.google.common.collect.Lists; @@ -373,7 +373,7 @@ public class StmtRewriter { inlineView.reset(); try { inlineView.analyze(analyzer); - } catch (InternalException e) { + } catch (UserException e) { throw new AnalysisException(e.getMessage()); } inlineView.setLeftTblRef(stmt.fromClause_.get(stmt.fromClause_.size() - 1)); diff --git a/fe/src/main/java/com/baidu/palo/analysis/Subquery.java b/fe/src/main/java/com/baidu/palo/analysis/Subquery.java index 46aab83ea7..d02cbf5d31 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/Subquery.java +++ b/fe/src/main/java/com/baidu/palo/analysis/Subquery.java @@ -30,7 +30,7 @@ import com.baidu.palo.catalog.ArrayType; import com.baidu.palo.catalog.StructField; import com.baidu.palo.catalog.StructType; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TExprNode; @@ -89,7 +89,7 @@ public class Subquery extends Expr { analyzer.setIsSubquery(); try { stmt.analyze(analyzer); - } catch (InternalException e) { + } catch (UserException e) { throw new AnalysisException(e.getMessage()); } // Check whether the stmt_ contains an illegal mix of un/correlated table refs. 
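The SlotDescriptor hunk above adds an originType captured in setColumn() and prefers it in toThrift(), so the load path sees the column's declared varchar/decimal type rather than the widened analysis type. A toy illustration of that fallback pattern follows; all names are illustrative stand-ins, not the real descriptor classes.

    // Toy version of the "use originType when set, otherwise fall back to type" branch
    // added to SlotDescriptor.toThrift(). Names below are stand-ins.
    public class OriginTypeSketch {

        static final class ColumnType {
            final String description;
            ColumnType(String description) { this.description = description; }
            @Override public String toString() { return description; }
        }

        static final class SlotLike {
            private ColumnType type;       // type used during analysis (may be widened)
            private ColumnType originType; // declared column type, captured in setColumn()

            void setColumn(ColumnType analysisType, ColumnType declaredType) {
                this.type = analysisType;
                this.originType = declaredType;
            }

            // Mirrors the new branching: serialize originType when it is available.
            ColumnType typeForSerialization() {
                return originType != null ? originType : type;
            }
        }

        public static void main(String[] args) {
            SlotLike slot = new SlotLike();
            slot.setColumn(new ColumnType("VARCHAR"), new ColumnType("VARCHAR(32)"));
            System.out.println(slot.typeForSerialization()); // prints VARCHAR(32)
        }
    }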
diff --git a/fe/src/main/java/com/baidu/palo/analysis/SyncStmt.java b/fe/src/main/java/com/baidu/palo/analysis/SyncStmt.java index fd64c879fa..5f2777aa7d 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/SyncStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/SyncStmt.java @@ -21,11 +21,11 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; public class SyncStmt extends DdlStmt { @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { // if (analyzer.getCatalog().isMaster()) { // throw new AnalysisException("No need to Sync, for you are master"); // } diff --git a/fe/src/main/java/com/baidu/palo/analysis/TableRef.java b/fe/src/main/java/com/baidu/palo/analysis/TableRef.java index 6fa466db9d..73e1f304d3 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/TableRef.java +++ b/fe/src/main/java/com/baidu/palo/analysis/TableRef.java @@ -24,7 +24,7 @@ import com.baidu.palo.catalog.Table; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.io.Writable; import com.baidu.palo.rewrite.ExprRewriter; @@ -184,7 +184,7 @@ public class TableRef implements ParseNode, Writable { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { ErrorReport.reportAnalysisException(ErrorCode.ERR_UNRESOLVED_TABLE_REF, tableRefToSql()); } diff --git a/fe/src/main/java/com/baidu/palo/analysis/TupleIsNullPredicate.java b/fe/src/main/java/com/baidu/palo/analysis/TupleIsNullPredicate.java index 68221f973a..eb3af097b2 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/TupleIsNullPredicate.java +++ b/fe/src/main/java/com/baidu/palo/analysis/TupleIsNullPredicate.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TExprNode; import com.baidu.palo.thrift.TExprNodeType; import com.baidu.palo.thrift.TTupleIsNullPredicate; @@ -103,7 +103,7 @@ public class TupleIsNullPredicate extends Predicate { * Returns a new list with the nullable exprs. */ public static List wrapExprs(List inputExprs, - List tids, Analyzer analyzer) throws InternalException { + List tids, Analyzer analyzer) throws UserException { // Assert that all tids are materialized. for (TupleId tid: tids) { TupleDescriptor tupleDesc = analyzer.getTupleDesc(tid); @@ -122,7 +122,7 @@ public class TupleIsNullPredicate extends Predicate { * if required to make expr nullable. Otherwise, returns expr. */ public static Expr wrapExpr(Expr expr, List tids, Analyzer analyzer) - throws InternalException { + throws UserException { if (!requiresNullWrapping(expr, analyzer)) { return expr; } @@ -140,8 +140,7 @@ public class TupleIsNullPredicate extends Predicate { * SlotRefs evaluate to NULL, false otherwise. * Throws an InternalException if expr evaluation in the BE failed. 
*/ - private static boolean requiresNullWrapping(Expr expr, Analyzer analyzer) - throws InternalException { + private static boolean requiresNullWrapping(Expr expr, Analyzer analyzer) { if (expr.isConstant()) { return false; } diff --git a/fe/src/main/java/com/baidu/palo/analysis/UnionStmt.java b/fe/src/main/java/com/baidu/palo/analysis/UnionStmt.java index 1a0520c571..0835f35f21 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/UnionStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/UnionStmt.java @@ -21,24 +21,18 @@ package com.baidu.palo.analysis; import com.baidu.palo.catalog.Database; -import com.baidu.palo.catalog.Type; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.rewrite.ExprRewriter; import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager; -import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - import java.util.ArrayList; import java.util.List; -import java.util.ListIterator; import java.util.Map; /** @@ -190,7 +184,7 @@ public class UnionStmt extends QueryStmt { * union operands are union compatible, adding implicit casts if necessary. */ @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { if (isAnalyzed()) return; super.analyze(analyzer); Preconditions.checkState(operands.size() > 0); @@ -258,7 +252,7 @@ public class UnionStmt extends QueryStmt { * Throws an AnalysisException if that is not the case, or if analyzing * an operand fails. 
*/ - private void analyzeOperands(Analyzer analyzer) throws AnalysisException, InternalException { + private void analyzeOperands(Analyzer analyzer) throws AnalysisException, UserException { for (int i = 0; i < operands.size(); ++i) { operands.get(i).analyze(analyzer); QueryStmt firstQuery = operands.get(0).getQueryStmt(); @@ -599,7 +593,7 @@ public class UnionStmt extends QueryStmt { @Override public void substituteSelectList(Analyzer analyzer, List newColLabels) - throws AnalysisException, InternalException { + throws AnalysisException, UserException { QueryStmt firstQuery = operands.get(0).getQueryStmt(); firstQuery.substituteSelectList(analyzer, newColLabels); // substitute order by @@ -638,7 +632,7 @@ public class UnionStmt extends QueryStmt { smap_ = new ExprSubstitutionMap(); } - public void analyze(Analyzer parent) throws AnalysisException, InternalException { + public void analyze(Analyzer parent) throws AnalysisException, UserException { if (isAnalyzed()) return; analyzer = new Analyzer(parent); queryStmt.analyze(analyzer); diff --git a/fe/src/main/java/com/baidu/palo/analysis/UnsupportedStmt.java b/fe/src/main/java/com/baidu/palo/analysis/UnsupportedStmt.java index 378eeb7b7b..f37f4156df 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/UnsupportedStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/UnsupportedStmt.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; public class UnsupportedStmt extends StatementBase { @@ -30,7 +30,7 @@ public class UnsupportedStmt extends StatementBase { } @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { // do nothing } diff --git a/fe/src/main/java/com/baidu/palo/analysis/UseStmt.java b/fe/src/main/java/com/baidu/palo/analysis/UseStmt.java index c4513ee38a..20d2386f9c 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/UseStmt.java +++ b/fe/src/main/java/com/baidu/palo/analysis/UseStmt.java @@ -25,7 +25,7 @@ import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -59,7 +59,7 @@ public class UseStmt extends StatementBase { return toSql(); } - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { super.analyze(analyzer); if (Strings.isNullOrEmpty(database)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_NO_DB_ERROR); diff --git a/fe/src/main/java/com/baidu/palo/analysis/WithClause.java b/fe/src/main/java/com/baidu/palo/analysis/WithClause.java index 6621f2c07c..bf9072453f 100644 --- a/fe/src/main/java/com/baidu/palo/analysis/WithClause.java +++ b/fe/src/main/java/com/baidu/palo/analysis/WithClause.java @@ -27,7 +27,7 @@ import java.util.Map; import com.baidu.palo.catalog.Database; import com.baidu.palo.catalog.View; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; @@ 
-72,7 +72,7 @@ public class WithClause implements ParseNode { * TableRefs to simplify the analysis of view references. */ @Override - public void analyze(Analyzer analyzer) throws AnalysisException, InternalException { + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { // Create a new analyzer for the WITH clause with a new global state (IMPALA-1357) // but a child of 'analyzer' so that the global state for 'analyzer' is not polluted // during analysis of the WITH clause. withClauseAnalyzer is a child of 'analyzer' so diff --git a/fe/src/main/java/com/baidu/palo/backup/BackupHandler.java b/fe/src/main/java/com/baidu/palo/backup/BackupHandler.java index 7f16ee5b51..f7242d2ace 100644 --- a/fe/src/main/java/com/baidu/palo/backup/BackupHandler.java +++ b/fe/src/main/java/com/baidu/palo/backup/BackupHandler.java @@ -88,7 +88,7 @@ public class BackupHandler extends Daemon implements Writable { private Catalog catalog; - private BackupHandler() { + public BackupHandler() { // for persist } diff --git a/fe/src/main/java/com/baidu/palo/backup/MetaDownloadTask.java b/fe/src/main/java/com/baidu/palo/backup/MetaDownloadTask.java index 6936c7552b..05956e343a 100644 --- a/fe/src/main/java/com/baidu/palo/backup/MetaDownloadTask.java +++ b/fe/src/main/java/com/baidu/palo/backup/MetaDownloadTask.java @@ -29,7 +29,7 @@ import com.baidu.palo.catalog.Table.TableType; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.Config; import com.baidu.palo.common.DdlException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.CommandResult; import com.baidu.palo.common.util.Util; @@ -122,11 +122,11 @@ public class MetaDownloadTask extends ResultfulTask { return null; } - private void checkRestoreObjs() throws InternalException { + private void checkRestoreObjs() throws UserException { try { Database db = Catalog.getInstance().getDb(dbName); if (db == null) { - throw new InternalException("Database[" + dbName + "] does not exist"); + throw new UserException("Database[" + dbName + "] does not exist"); } // 1.1 check restored objs exist @@ -148,7 +148,7 @@ public class MetaDownloadTask extends ResultfulTask { Preconditions.checkState(tableRenameMap.isEmpty()); DirSaver dbDir = (DirSaver) pathBuilder.getRoot().getChild(dbName); if (dbDir == null) { - throw new InternalException("Backup path does not contains database[" + dbName + "]"); + throw new UserException("Backup path does not contains database[" + dbName + "]"); } Collection tableDirs = dbDir.getChildren(); for (FileSaverI child : tableDirs) { @@ -213,7 +213,7 @@ public class MetaDownloadTask extends ResultfulTask { // get all existed partitions FileSaverI tableSaver = pathBuilder.getRoot().getChild(dbName).getChild(tableName); if (!(tableSaver instanceof DirSaver)) { - throw new InternalException("Table[" + tableName + "] dir does not exist"); + throw new UserException("Table[" + tableName + "] dir does not exist"); } List partitionStmts = Lists.newArrayList(); @@ -257,7 +257,7 @@ public class MetaDownloadTask extends ResultfulTask { continue; } else { if (partitionNames.isEmpty()) { - throw new InternalException("Table[" + newTableName + "]' already exist. " + throw new UserException("Table[" + newTableName + "]' already exist. 
" + "Drop table first or restore to another table"); } } @@ -265,7 +265,7 @@ public class MetaDownloadTask extends ResultfulTask { // table CreateTableStmt stmt = tableToCreateTableStmt.get(newTableName); if (table.getSignature(BackupVersion.VERSION_1) != stmt.getTableSignature()) { - throw new InternalException("Table[" + newTableName + "]'s struct is not same"); + throw new UserException("Table[" + newTableName + "]'s struct is not same"); } // partition @@ -279,7 +279,7 @@ public class MetaDownloadTask extends ResultfulTask { checkRangeValid(olapTable, partitionName); } else { // do not allow overwrite a partition - throw new InternalException("Partition[" + partitionName + "]' already exist in table[" + throw new UserException("Partition[" + partitionName + "]' already exist in table[" + newTableName + "]. Drop partition first or restore to another table"); } } @@ -289,7 +289,7 @@ public class MetaDownloadTask extends ResultfulTask { db.readUnlock(); } } catch (Exception e) { - throw new InternalException(e.getMessage(), e); + throw new UserException(e.getMessage(), e); } } diff --git a/fe/src/main/java/com/baidu/palo/backup/Repository.java b/fe/src/main/java/com/baidu/palo/backup/Repository.java index 61256e5cc3..fabedaa5a5 100644 --- a/fe/src/main/java/com/baidu/palo/backup/Repository.java +++ b/fe/src/main/java/com/baidu/palo/backup/Repository.java @@ -361,7 +361,7 @@ public class Repository implements Writable { // upload the local file to specified remote file with checksum // remoteFilePath should be FULL path public Status upload(String localFilePath, String remoteFilePath) { - Preconditions.checkArgument(remoteFilePath.startsWith(location), remoteFilePath); + // Preconditions.checkArgument(remoteFilePath.startsWith(location), remoteFilePath); // get md5usm of local file File file = new File(localFilePath); String md5sum = null; diff --git a/fe/src/main/java/com/baidu/palo/backup/RestoreJob.java b/fe/src/main/java/com/baidu/palo/backup/RestoreJob.java index ac2b3a9ebc..01d971415d 100644 --- a/fe/src/main/java/com/baidu/palo/backup/RestoreJob.java +++ b/fe/src/main/java/com/baidu/palo/backup/RestoreJob.java @@ -1144,8 +1144,7 @@ public class RestoreJob extends AbstractJob { } // update partition committed version - part.setCommittedVersion(entry.getValue().first); - part.setCommittedVersionHash(entry.getValue().second); + part.updateCommitVersionAndVersionHash(entry.getValue().first, entry.getValue().second); // we also need to update the replica version of these overwritten restored partitions for (MaterializedIndex idx : part.getMaterializedIndices()) { diff --git a/fe/src/main/java/com/baidu/palo/backup/SaveManifestTask.java b/fe/src/main/java/com/baidu/palo/backup/SaveManifestTask.java index f58214676c..a39094919c 100644 --- a/fe/src/main/java/com/baidu/palo/backup/SaveManifestTask.java +++ b/fe/src/main/java/com/baidu/palo/backup/SaveManifestTask.java @@ -16,7 +16,7 @@ package com.baidu.palo.backup; import com.baidu.palo.common.Config; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.io.Writable; import com.baidu.palo.common.util.CommandResult; import com.baidu.palo.common.util.Util; @@ -73,7 +73,7 @@ public class SaveManifestTask extends ResultfulTask { return null; } - private void saveAndUploadManifest() throws InternalException { + private void saveAndUploadManifest() throws UserException { String manifestPath = pathBuilder.manifest(); DirSaver labelDir = pathBuilder.getRoot(); 
Preconditions.checkState(labelDir.getName().equals(localDirName)); @@ -99,14 +99,14 @@ public class SaveManifestTask extends ResultfulTask { } if (!succeed) { - throw new InternalException(msg); + throw new UserException(msg); } uploadManifest(manifestPath, PathBuilder.MANIFEST_NAME); } - private void saveAndUploadReadableManifest() throws InternalException { + private void saveAndUploadReadableManifest() throws UserException { String localReadableManifest = pathBuilder.readableManifest(); // get list @@ -147,7 +147,7 @@ public class SaveManifestTask extends ResultfulTask { } // end for labels if (backupedObjs.isEmpty()) { - throw new InternalException("nothing backuped??!!, job: " + jobId); + throw new UserException("nothing backuped??!!, job: " + jobId); } // add last load label and last delete info @@ -175,13 +175,13 @@ public class SaveManifestTask extends ResultfulTask { if (!succeed) { LOG.warn("Failed to save readable manifest. job: {}", jobId); - throw new InternalException(msg); + throw new UserException(msg); } uploadManifest(localReadableManifest, PathBuilder.READABLE_MANIFEST_NAME); } - private void uploadManifest(String manifestFile, String fileName) throws InternalException { + private void uploadManifest(String manifestFile, String fileName) throws UserException { String uploadCmd = null; String msg = null; boolean succeed = false; @@ -219,7 +219,7 @@ public class SaveManifestTask extends ResultfulTask { } if (!succeed) { - throw new InternalException(msg); + throw new UserException(msg); } } } diff --git a/fe/src/main/java/com/baidu/palo/catalog/Catalog.java b/fe/src/main/java/com/baidu/palo/catalog/Catalog.java index 561899639d..91a4495fa0 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/Catalog.java +++ b/fe/src/main/java/com/baidu/palo/catalog/Catalog.java @@ -91,9 +91,9 @@ import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.FeConstants; import com.baidu.palo.common.FeMetaVersion; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.MarkedCountDownLatch; import com.baidu.palo.common.Pair; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.util.Daemon; import com.baidu.palo.common.util.KuduUtil; @@ -105,6 +105,7 @@ import com.baidu.palo.deploy.DeployManager; import com.baidu.palo.deploy.impl.AmbariDeployManager; import com.baidu.palo.deploy.impl.K8sDeployManager; import com.baidu.palo.deploy.impl.LocalFileDeployManager; +import com.baidu.palo.external.EsStateStore; import com.baidu.palo.ha.BDBHA; import com.baidu.palo.ha.FrontendNodeType; import com.baidu.palo.ha.HAProtocol; @@ -160,6 +161,8 @@ import com.baidu.palo.task.PullLoadJobMgr; import com.baidu.palo.thrift.TStorageMedium; import com.baidu.palo.thrift.TStorageType; import com.baidu.palo.thrift.TTaskType; +import com.baidu.palo.transaction.GlobalTransactionMgr; +import com.baidu.palo.transaction.PublishVersionDaemon; import com.google.common.base.Joiner; import com.google.common.base.Joiner.MapJoiner; @@ -204,9 +207,11 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.concurrent.locks.ReentrantLock; public class Catalog { private static final Logger LOG = 
LogManager.getLogger(Catalog.class); @@ -224,11 +229,18 @@ public class Catalog { private int journalVersion = 0; private long epoch = 0; - private Map idToDb; - private Map fullNameToDb; + // Lock to perform atomic modification on map like 'idToDb' and 'fullNameToDb'. + // These maps are all thread safe, we only use lock to perform atomic operations. + // Operations like Get or Put do not need lock. + // We use fair ReentrantLock to avoid starvation. Do not use this lock in critical code pass + // because fair lock has poor performance. + private ReentrantLock lock; - private Map idToCluster; - private Map nameToCluster; + private ConcurrentHashMap idToDb; + private ConcurrentHashMap fullNameToDb; + + private ConcurrentHashMap idToCluster; + private ConcurrentHashMap nameToCluster; private Load load; private ExportMgr exportMgr; @@ -236,16 +248,18 @@ public class Catalog { private Alter alter; private ConsistencyChecker consistencyChecker; private BackupHandler backupHandler; + private PublishVersionDaemon publishVersionDaemon; @Deprecated private UserPropertyMgr userPropertyMgr; private Daemon cleaner; // To clean old LabelInfo, ExportJobInfos + private Daemon txnCleaner; // To clean aborted or timeout txns private Daemon replayer; private Daemon timePrinter; private Daemon listener; + private EsStateStore esStateStore; // it is a daemon, so add it here - private ReentrantReadWriteLock lock; private boolean isFirstTimeStartUp = false; private boolean isMaster; private boolean isElectable; @@ -280,12 +294,11 @@ public class Catalog { private Checkpoint checkpointer; private List> helperNodes = Lists.newArrayList(); private Pair selfNode = null; - private Pair selfHostname = null; // node name -> Frontend - private Map frontends; + private ConcurrentHashMap frontends; // removed frontends' name. 
used for checking if name is duplicated in bdbje - private List removedFrontends; + private ConcurrentLinkedQueue removedFrontends; private HAProtocol haProtocol = null; @@ -301,40 +314,35 @@ public class Catalog { private PullLoadJobMgr pullLoadJobMgr; private BrokerMgr brokerMgr; + + private GlobalTransactionMgr globalTransactionMgr; private DeployManager deployManager; + private TabletStatMgr tabletStatMgr; + private PaloAuth auth; private DomainResolver domainResolver; public List getFrontends(FrontendNodeType nodeType) { + if (nodeType == null) { + // get all + return Lists.newArrayList(frontends.values()); + } + List result = Lists.newArrayList(); - readLock(); - try { - if (nodeType == null) { - // get all - return Lists.newArrayList(frontends.values()); + for (Frontend frontend : frontends.values()) { + if (frontend.getRole() == nodeType) { + result.add(frontend); } - for (Frontend frontend : frontends.values()) { - if (frontend.getRole() == nodeType) { - result.add(frontend); - } - } - } finally { - readUnlock(); } return result; } public List getRemovedFrontendNames() { - readLock(); - try { - return Lists.newArrayList(removedFrontends); - } finally { - readUnlock(); - } + return Lists.newArrayList(removedFrontends); } public JournalObservable getJournalObservable() { @@ -345,7 +353,7 @@ public class Catalog { return this.systemInfo; } - private TabletInvertedIndex getTabletInvertedIndex() { + public TabletInvertedIndex getTabletInvertedIndex() { return this.tabletInvertedIndex; } @@ -362,17 +370,18 @@ public class Catalog { } private Catalog() { - this.idToDb = new HashMap(); - this.fullNameToDb = new HashMap(); + this.idToDb = new ConcurrentHashMap<>(); + this.fullNameToDb = new ConcurrentHashMap<>(); this.load = new Load(); this.exportMgr = new ExportMgr(); this.clone = new Clone(); this.alter = new Alter(); this.consistencyChecker = new ConsistencyChecker(); + this.lock = new ReentrantLock(true); this.backupHandler = new BackupHandler(this); - this.lock = new ReentrantReadWriteLock(true); this.metaDir = Config.meta_dir; this.userPropertyMgr = new UserPropertyMgr(); + this.publishVersionDaemon = new PublishVersionDaemon(); this.canWrite = false; this.canRead = false; @@ -383,8 +392,8 @@ public class Catalog { this.feType = FrontendNodeType.INIT; this.role = FrontendNodeType.UNKNOWN; - this.frontends = Maps.newHashMap(); - this.removedFrontends = Lists.newArrayList(); + this.frontends = new ConcurrentHashMap<>(); + this.removedFrontends = new ConcurrentLinkedQueue<>(); this.journalObservable = new JournalObservable(); this.formerFeType = FrontendNodeType.INIT; @@ -400,16 +409,21 @@ public class Catalog { this.metaReplayState = new MetaReplayState(); - this.idToCluster = new HashMap(); - this.nameToCluster = new HashMap(); + this.idToCluster = new ConcurrentHashMap<>(); + this.nameToCluster = new ConcurrentHashMap<>(); this.isDefaultClusterCreated = false; this.pullLoadJobMgr = new PullLoadJobMgr(); this.brokerMgr = new BrokerMgr(); + this.globalTransactionMgr = new GlobalTransactionMgr(this); + this.tabletStatMgr = new TabletStatMgr(); + this.auth = new PaloAuth(); this.domainResolver = new DomainResolver(auth); + + this.esStateStore = new EsStateStore(); } public static void destroyCheckpoint() { @@ -446,6 +460,14 @@ public class Catalog { public BrokerMgr getBrokerMgr() { return brokerMgr; } + + public static GlobalTransactionMgr getCurrentGlobalTransactionMgr() { + return getCurrentCatalog().globalTransactionMgr; + } + + public GlobalTransactionMgr getGlobalTransactionMgr() 
{ + return globalTransactionMgr; + } public PaloAuth getAuth() { return auth; @@ -474,20 +496,33 @@ public class Catalog { return Thread.currentThread().getId() == checkpointThreadId; } - public void readLock() { - this.lock.readLock().lock(); + // Use tryLock to avoid potential dead lock + private boolean tryLock(boolean mustLock) { + while (true) { + try { + if (!lock.tryLock(Config.catalog_try_lock_timeout_ms, TimeUnit.MILLISECONDS)) { + if (mustLock) { + continue; + } else { + return false; + } + } + return true; + } catch (InterruptedException e) { + LOG.warn("got exception while getting catalog lock", e); + if (mustLock) { + continue; + } else { + return lock.isHeldByCurrentThread(); + } + } + } } - public void readUnlock() { - this.lock.readLock().unlock(); - } - - private void writeLock() { - this.lock.writeLock().lock(); - } - - private void writeUnlock() { - this.lock.writeLock().unlock(); + private void unlock() { + if (lock.isHeldByCurrentThread()) { + this.lock.unlock(); + } } public void initialize(String[] args) throws Exception { @@ -521,6 +556,7 @@ public class Catalog { this.editLog = new EditLog(nodeName); loadImage(IMAGE_DIR); // load image file editLog.open(); // open bdb env or local output stream + this.globalTransactionMgr.setEditLog(editLog); // 4. start load label cleaner thread createCleaner(); @@ -528,11 +564,23 @@ public class Catalog { cleaner.setInterval(Config.label_clean_interval_second * 1000L); cleaner.start(); - // 5. start state listener thread + // 5. create es state store + esStateStore.loadTableFromCatalog(); + esStateStore.start(); + + // 6. start state listener thread createStateListener(); listener.setName("stateListener"); listener.setInterval(STATE_CHANGE_CHECK_INTERVAL_MS); listener.start(); + + // 7. start txn cleaner thread + createTxnCleaner(); + txnCleaner.setName("txnCleaner"); + // the clear threads runs every min(transaction_clean_interval_second,stream_load_default_timeout_second)/10 + txnCleaner.setInterval(Math.min(Config.transaction_clean_interval_second, + Config.stream_load_default_timeout_second) * 100L); + } private void getClusterIdAndRole() throws IOException { @@ -791,8 +839,7 @@ public class Catalog { private void getSelfHostPort() { selfNode = new Pair(FrontendOptions.getLocalHostAddress(), Config.edit_log_port); - selfHostname = new Pair(FrontendOptions.getHostname(), Config.edit_log_port); - LOG.debug("get self node: {}, self hostname: {}", selfNode, selfHostname); + LOG.debug("get self node: {}", selfNode); } private void getHelperNodes(String[] args) throws AnalysisException { @@ -950,6 +997,12 @@ public class Catalog { // Clone checker CloneChecker.getInstance().setInterval(Config.clone_checker_interval_second * 1000L); CloneChecker.getInstance().start(); + + // Publish Version Daemon + publishVersionDaemon.start(); + + // Start txn cleaner + txnCleaner.start(); // Alter getAlterInstance().start(); @@ -979,6 +1032,8 @@ public class Catalog { } domainResolver.start(); + + tabletStatMgr.start(); MetricRepo.init(); } @@ -1014,7 +1069,10 @@ public class Catalog { } formerFeType = feType; + domainResolver.start(); + + tabletStatMgr.start(); MetricRepo.init(); } @@ -1158,6 +1216,9 @@ public class Catalog { checksum = loadExportJob(dis, checksum); checksum = loadBackupHandler(dis, checksum); checksum = loadPaloAuth(dis, checksum); + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_45) { + checksum = loadTransactionState(dis, checksum); + } long remoteChecksum = dis.readLong(); 
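The Catalog's ReentrantReadWriteLock is replaced here by concurrent collections plus a single fair ReentrantLock that is taken only for compound modifications, through the tryLock()/unlock() pair shown above. The sketch below condenses the calling convention the rest of this patch follows: user DDL uses tryLock(false) and reports "try again" on timeout, journal replay uses tryLock(true) and insists, and plain reads never take the lock at all. The timeout field stands in for Config.catalog_try_lock_timeout_ms and the map value type is simplified.

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;

class CatalogLockSketch {
    // Fair lock: waiters are served in arrival order, avoiding starvation at some cost in throughput.
    private final ReentrantLock lock = new ReentrantLock(true);
    private final long tryLockTimeoutMs = 5_000L; // stand-in for Config.catalog_try_lock_timeout_ms
    private final ConcurrentHashMap<String, Long> fullNameToDb = new ConcurrentHashMap<>();

    private boolean tryLock(boolean mustLock) {
        while (true) {
            try {
                if (lock.tryLock(tryLockTimeoutMs, TimeUnit.MILLISECONDS)) {
                    return true;
                }
                if (!mustLock) {
                    return false; // let the caller surface "try again" instead of blocking forever
                }
            } catch (InterruptedException e) {
                if (!mustLock) {
                    return lock.isHeldByCurrentThread();
                }
            }
        }
    }

    private void unlock() {
        if (lock.isHeldByCurrentThread()) {
            lock.unlock();
        }
    }

    // User-issued DDL: give up after the timeout and report the failure.
    void createDb(String name, long id) throws Exception {
        if (!tryLock(false)) {
            throw new Exception("Failed to acquire catalog lock. Try again");
        }
        try {
            if (fullNameToDb.putIfAbsent(name, id) != null) {
                throw new Exception("Database already exists: " + name);
            }
        } finally {
            unlock();
        }
    }

    // Journal replay: must not fail, so keep retrying until the lock is held.
    void replayCreateDb(String name, long id) {
        tryLock(true);
        try {
            fullNameToDb.put(name, id);
        } finally {
            unlock();
        }
    }

    // Point reads go straight to the concurrent map and never touch the lock.
    Long getDb(String name) {
        return fullNameToDb.get(name);
    }
}

As an aside, the txn cleaner interval set in initialize(), min(transaction_clean_interval_second, stream_load_default_timeout_second) * 100L, equals the smaller setting in seconds times 1000 / 10, which matches the comment about running every tenth of that interval.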
Preconditions.checkState(remoteChecksum == checksum, remoteChecksum + " vs. " + checksum); @@ -1168,7 +1229,7 @@ public class Catalog { long loadImageEndTime = System.currentTimeMillis(); LOG.info("finished load image in " + (loadImageEndTime - loadImageStartTime) + " ms"); } - + private void recreateTabletInvertIndex() { if (isCheckpointThread()) { return; @@ -1324,6 +1385,32 @@ public class Catalog { param.readFields(dis); load.setLoadErrorHubInfo(param); } + + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_45) { + // 4. load delete jobs + int deleteJobSize = dis.readInt(); + newChecksum ^= deleteJobSize; + for (int i = 0; i < deleteJobSize; i++) { + long dbId = dis.readLong(); + newChecksum ^= dbId; + + int deleteJobCount = dis.readInt(); + newChecksum ^= deleteJobCount; + for (int j = 0; j < deleteJobCount; j++) { + LoadJob job = new LoadJob(); + job.readFields(dis); + long currentTimeMs = System.currentTimeMillis(); + + // Delete the history load jobs that are older than + // LABEL_KEEP_MAX_MS + // This job must be FINISHED or CANCELLED + if ((currentTimeMs - job.getCreateTimeMs()) / 1000 <= Config.label_keep_max_second + || (job.getState() != JobState.FINISHED && job.getState() != JobState.CANCELLED)) { + load.unprotectAddLoadJob(job, true /* replay */); + } + } + } + } return newChecksum; } @@ -1361,7 +1448,7 @@ public class Catalog { public long loadAlterJob(DataInputStream dis, long checksum, JobType type) throws IOException { Map alterJobs = null; - List finishedOrCancelledAlterJobs = null; + ConcurrentLinkedQueue finishedOrCancelledAlterJobs = null; if (type == JobType.ROLLUP) { alterJobs = this.getRollupHandler().unprotectedGetAlterJobs(); finishedOrCancelledAlterJobs = this.getRollupHandler().unprotectedGetFinishedOrCancelledAlterJobs(); @@ -1385,8 +1472,10 @@ public class Catalog { // init job Database db = getDb(job.getDbId()); - if (db != null) { - job.unprotectedReplayInitJob(db); + // should check job state here because the job is finished but not removed from alter jobs list + if (db != null && (job.getState() == com.baidu.palo.alter.AlterJob.JobState.PENDING + || job.getState() == com.baidu.palo.alter.AlterJob.JobState.RUNNING)) { + job.replayInitJob(db); } } @@ -1505,6 +1594,13 @@ public class Catalog { } return checksum; } + + public long loadTransactionState(DataInputStream dis, long checksum) throws IOException { + int size = dis.readInt(); + long newChecksum = checksum ^ size; + globalTransactionMgr.readFields(dis); + return newChecksum; + } public long loadRecycleBin(DataInputStream dis, long checksum) throws IOException { if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_10) { @@ -1539,38 +1635,36 @@ public class Catalog { curFile.createNewFile(); } + // save image does not need any lock. because only checkpoint thread will call this method. LOG.info("start save image to {}. 
is ckpt: {}", curFile.getAbsolutePath(), Catalog.isCheckpointThread()); long checksum = 0; long saveImageStartTime = System.currentTimeMillis(); - readLock(); + DataOutputStream dos = new DataOutputStream(new FileOutputStream(curFile)); try { - DataOutputStream dos = new DataOutputStream(new FileOutputStream(curFile)); - try { - checksum = saveHeader(dos, replayedJournalId, checksum); - checksum = saveMasterInfo(dos, checksum); - checksum = saveFrontends(dos, checksum); - checksum = Catalog.getCurrentSystemInfo().saveBackends(dos, checksum); - checksum = saveDb(dos, checksum); - checksum = saveLoadJob(dos, checksum); - checksum = saveAlterJob(dos, checksum); - checksum = saveRecycleBin(dos, checksum); - checksum = saveGlobalVariable(dos, checksum); - checksum = saveCluster(dos, checksum); - checksum = saveBrokers(dos, checksum); - checksum = saveExportJob(dos, checksum); - checksum = saveBackupHandler(dos, checksum); - checksum = savePaloAuth(dos, checksum); - dos.writeLong(checksum); - } finally { - dos.close(); - } + checksum = saveHeader(dos, replayedJournalId, checksum); + checksum = saveMasterInfo(dos, checksum); + checksum = saveFrontends(dos, checksum); + checksum = Catalog.getCurrentSystemInfo().saveBackends(dos, checksum); + checksum = saveDb(dos, checksum); + checksum = saveLoadJob(dos, checksum); + checksum = saveAlterJob(dos, checksum); + checksum = saveRecycleBin(dos, checksum); + checksum = saveGlobalVariable(dos, checksum); + checksum = saveCluster(dos, checksum); + checksum = saveBrokers(dos, checksum); + checksum = saveExportJob(dos, checksum); + checksum = saveBackupHandler(dos, checksum); + checksum = savePaloAuth(dos, checksum); + checksum = saveTransactionState(dos, checksum); + dos.writeLong(checksum); } finally { - readUnlock(); + dos.close(); } long saveImageEndTime = System.currentTimeMillis(); - LOG.info("finished save image in {} ms. checksum is {}", (saveImageEndTime - saveImageStartTime), checksum); + LOG.info("finished save image {} in {} ms. checksum is {}", + curFile.getAbsolutePath(), (saveImageEndTime - saveImageStartTime), checksum); } public long saveHeader(DataOutputStream dos, long replayedJournalId, long checksum) throws IOException { @@ -1686,6 +1780,25 @@ public class Catalog { // 3. load error hub info LoadErrorHub.Param param = load.getLoadErrorHubInfo(); param.write(dos); + + // 4. 
save delete load job info + Map> dbToDeleteJobs = load.getDbToDeleteJobs(); + int deleteJobSize = dbToDeleteJobs.size(); + checksum ^= deleteJobSize; + dos.writeInt(deleteJobSize); + for (Entry> entry : dbToDeleteJobs.entrySet()) { + long dbId = entry.getKey(); + checksum ^= dbId; + dos.writeLong(dbId); + + List deleteJobs = entry.getValue(); + int deleteJobCount = deleteJobs.size(); + checksum ^= deleteJobCount; + dos.writeInt(deleteJobCount); + for (LoadJob job : deleteJobs) { + job.write(dos); + } + } return checksum; } @@ -1714,7 +1827,7 @@ public class Catalog { public long saveAlterJob(DataOutputStream dos, long checksum, JobType type) throws IOException { Map alterJobs = null; - List finishedOrCancelledAlterJobs = null; + ConcurrentLinkedQueue finishedOrCancelledAlterJobs = null; if (type == JobType.ROLLUP) { alterJobs = this.getRollupHandler().unprotectedGetAlterJobs(); finishedOrCancelledAlterJobs = this.getRollupHandler().unprotectedGetFinishedOrCancelledAlterJobs(); @@ -1755,6 +1868,14 @@ public class Catalog { auth.write(dos); return checksum; } + + public long saveTransactionState(DataOutputStream dos, long checksum) throws IOException { + int size = globalTransactionMgr.getTransactionNum(); + checksum ^= size; + dos.writeInt(size); + globalTransactionMgr.write(dos); + return checksum; + } public long saveRecycleBin(DataOutputStream dos, long checksum) throws IOException { CatalogRecycleBin recycleBin = Catalog.getCurrentRecycleBin(); @@ -1788,6 +1909,14 @@ public class Catalog { } }; } + + public void createTxnCleaner() { + txnCleaner = new Daemon() { + protected void runOneCycle() { + globalTransactionMgr.removeOldTransactions(); + } + }; + } public void createReplayer() { if (isMaster) { @@ -2019,7 +2148,9 @@ public class Catalog { } public void addFrontend(FrontendNodeType role, String host, int editLogPort) throws DdlException { - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. Try again"); + } try { Frontend fe = checkFeExist(host, editLogPort); if (fe != null) { @@ -2039,7 +2170,7 @@ public class Catalog { } editLog.logAddFrontend(fe); } finally { - writeUnlock(); + unlock(); } } @@ -2047,7 +2178,9 @@ public class Catalog { if (host.equals(selfNode.first) && port == selfNode.second && isMaster) { throw new DdlException("can not drop current master node."); } - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. Try again"); + } try { Frontend fe = checkFeExist(host, port); if (fe == null) { @@ -2064,44 +2197,37 @@ public class Catalog { } editLog.logRemoveFrontend(fe); } finally { - writeUnlock(); + unlock(); } } public Frontend checkFeExist(String host, int port) { - readLock(); - try { - for (Frontend fe : frontends.values()) { - if (fe.getHost().equals(host) && fe.getEditLogPort() == port) { - return fe; - } + for (Frontend fe : frontends.values()) { + if (fe.getHost().equals(host) && fe.getEditLogPort() == port) { + return fe; } - } finally { - readUnlock(); } return null; } public Frontend getFeByHost(String host) { - readLock(); - try { - for (Frontend fe : frontends.values()) { - if (fe.getHost().equals(host)) { - return fe; - } + for (Frontend fe : frontends.values()) { + if (fe.getHost().equals(host)) { + return fe; } - } finally { - readUnlock(); } return null; } + // The interface which DdlExecutor needs. 
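The new delete-job and transaction-state image sections follow the existing checksum idiom visible above: every count or id written is also XOR-folded into a running checksum, the writer ends the image with that checksum, and the loader recomputes it in the same order and compares it against the stored value. A small self-contained sketch of that symmetry, with the section contents invented for illustration:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

class ImageChecksumSketch {
    // Writer: fold each value into the checksum in the same order it is written.
    static long saveSection(DataOutputStream dos, long checksum, long[] ids) throws IOException {
        int size = ids.length;
        checksum ^= size;
        dos.writeInt(size);
        for (long id : ids) {
            checksum ^= id;
            dos.writeLong(id);
        }
        return checksum;
    }

    // Loader: read in the same order and fold the same values, so the checksums must match.
    static long loadSection(DataInputStream dis, long checksum) throws IOException {
        int size = dis.readInt();
        checksum ^= size;
        for (int i = 0; i < size; i++) {
            checksum ^= dis.readLong();
        }
        return checksum;
    }

    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        DataOutputStream dos = new DataOutputStream(buf);
        long saveChecksum = saveSection(dos, 0L, new long[] {10001L, 10002L});
        dos.writeLong(saveChecksum); // the image ends with the accumulated checksum

        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(buf.toByteArray()));
        long loadChecksum = loadSection(dis, 0L);
        long remoteChecksum = dis.readLong();
        if (remoteChecksum != loadChecksum) {
            throw new IllegalStateException(remoteChecksum + " vs. " + loadChecksum);
        }
    }
}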
public void createDb(CreateDbStmt stmt) throws DdlException { final String clusterName = stmt.getClusterName(); String fullDbName = stmt.getFullDbName(); long id = 0L; - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. Try again"); + } try { if (!nameToCluster.containsKey(clusterName)) { ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_NO_SELECT_CLUSTER, clusterName); @@ -2121,7 +2247,7 @@ public class Catalog { editLog.logCreateDb(db); } } finally { - writeUnlock(); + unlock(); } LOG.info("createDb dbName = " + fullDbName + ", id = " + id); } @@ -2140,13 +2266,12 @@ public class Catalog { idToCluster.put(cluster.getId(), cluster); } - public void replayCreateDb(Database db) { - writeLock(); + tryLock(true); try { unprotectCreateDb(db); } finally { - writeUnlock(); + unlock(); } } @@ -2154,7 +2279,9 @@ public class Catalog { String dbName = stmt.getDbName(); // 1. check if database exists - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. Try again"); + } try { if (!fullNameToDb.containsKey(dbName)) { if (stmt.isSetIfExists()) { @@ -2170,6 +2297,7 @@ public class Catalog { db.writeLock(); try { if (db.getDbState() == DbState.LINK && dbName.equals(db.getAttachDb())) { + // We try to drop a hard link. final DropLinkDbAndUpdateDbInfo info = new DropLinkDbAndUpdateDbInfo(); fullNameToDb.remove(db.getAttachDb()); db.setDbState(DbState.NORMAL); @@ -2187,7 +2315,9 @@ public class Catalog { return; } - if (dbName.equals(db.getFullName()) && db.getDbState() == DbState.LINK) { + if (db.getDbState() == DbState.LINK && dbName.equals(db.getFullName())) { + // We try to drop a db which other dbs attach to it, + // which is not allowed. ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_DB_STATE_LINK_OR_MIGRATE, ClusterNamespace.getNameFromFullName(dbName)); return; @@ -2214,7 +2344,7 @@ public class Catalog { cluster.removeDb(dbName, db.getId()); editLog.logDropDb(dbName); } finally { - writeUnlock(); + unlock(); } LOG.info("finish drop database[{}]", dbName); @@ -2227,7 +2357,7 @@ public class Catalog { } public void replayDropLinkDb(DropLinkDbAndUpdateDbInfo info) { - writeLock(); + tryLock(true); try { final Database db = this.fullNameToDb.remove(info.getDropDbName()); db.setDbState(info.getUpdateDbState()); @@ -2238,12 +2368,12 @@ public class Catalog { param.addLongParam(db.getId()); cluster.removeLinkDb(param); } finally { - writeUnlock(); + unlock(); } } public void replayDropDb(String dbName) throws DdlException { - writeLock(); + tryLock(true); try { Database db = fullNameToDb.get(dbName); db.writeLock(); @@ -2260,7 +2390,7 @@ public class Catalog { final Cluster cluster = nameToCluster.get(db.getClusterName()); cluster.removeDb(dbName, db.getId()); } finally { - writeUnlock(); + unlock(); } } @@ -2273,7 +2403,9 @@ public class Catalog { Database db = Catalog.getCurrentRecycleBin().recoverDatabase(recoverStmt.getDbName()); // add db to catalog - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. 
Try again"); + } try { if (fullNameToDb.containsKey(db.getFullName())) { throw new DdlException("Database[" + db.getFullName() + "] already exist."); @@ -2283,13 +2415,14 @@ public class Catalog { fullNameToDb.put(db.getFullName(), db); idToDb.put(db.getId(), db); + + // log + RecoverInfo recoverInfo = new RecoverInfo(db.getId(), -1L, -1L); + Catalog.getInstance().getEditLog().logRecoverDb(recoverInfo); } finally { - writeUnlock(); + unlock(); } - // log - RecoverInfo recoverInfo = new RecoverInfo(db.getId(), -1L, -1L); - Catalog.getInstance().getEditLog().logRecoverDb(recoverInfo); LOG.info("recover database[{}]", db.getId()); } @@ -2383,58 +2516,58 @@ public class Catalog { } public void renameDatabase(AlterDatabaseRename stmt) throws DdlException { - String dbName = stmt.getDbName(); - String newDbName = stmt.getNewDbName(); + String fullDbName = stmt.getDbName(); + String newFullDbName = stmt.getNewDbName(); String clusterName = stmt.getClusterName(); - if (dbName.equals(newDbName)) { + if (fullDbName.equals(newFullDbName)) { throw new DdlException("Same database name"); } Database db = null; Cluster cluster = null; - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. Try again"); + } try { cluster = nameToCluster.get(clusterName); if (cluster == null) { ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_NO_EXISTS, clusterName); } - final String dbNameWithoutPrefix = ClusterNamespace.getNameFromFullName(dbName); // check if db exists - db = fullNameToDb.get(dbName); + db = fullNameToDb.get(fullDbName); if (db == null) { - ErrorReport.reportDdlException(ErrorCode.ERR_BAD_DB_ERROR, dbNameWithoutPrefix); + ErrorReport.reportDdlException(ErrorCode.ERR_BAD_DB_ERROR, fullDbName); } if (db.getDbState() == DbState.LINK || db.getDbState() == DbState.MOVE) { - ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_RENAME_DB_ERR, dbNameWithoutPrefix); + ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_RENAME_DB_ERR, fullDbName); } - final String newDbNameWithoutPrefix = ClusterNamespace.getNameFromFullName(newDbName); // check if name is already used - if (fullNameToDb.get(newDbName) != null) { - throw new DdlException("Database name[" + newDbNameWithoutPrefix + "] is already used"); + if (fullNameToDb.get(newFullDbName) != null) { + throw new DdlException("Database name[" + newFullDbName + "] is already used"); } cluster.removeDb(db.getFullName(), db.getId()); - cluster.addDb(newDbName, db.getId()); + cluster.addDb(newFullDbName, db.getId()); // 1. rename db - db.setNameWithLock(newDbName); + db.setNameWithLock(newFullDbName); // 2. add to meta. 
check again - fullNameToDb.remove(dbName); - fullNameToDb.put(newDbName, db); + fullNameToDb.remove(fullDbName); + fullNameToDb.put(newFullDbName, db); - DatabaseInfo dbInfo = new DatabaseInfo(dbName, newDbName, -1L); + DatabaseInfo dbInfo = new DatabaseInfo(fullDbName, newFullDbName, -1L); editLog.logDatabaseRename(dbInfo); } finally { - writeUnlock(); + unlock(); } - LOG.info("rename database[{}] to [{}]", dbName, newDbName); + LOG.info("rename database[{}] to [{}]", fullDbName, newFullDbName); } public void replayRenameDatabase(String dbName, String newDbName) { - writeLock(); + tryLock(true); try { Database db = fullNameToDb.get(dbName); Cluster cluster = nameToCluster.get(db.getClusterName()); @@ -2444,7 +2577,7 @@ public class Catalog { fullNameToDb.remove(dbName); fullNameToDb.put(newDbName, db); } finally { - writeUnlock(); + unlock(); } LOG.info("replay rename database {} to {}", dbName, newDbName); @@ -2465,10 +2598,13 @@ public class Catalog { ErrorReport.reportDdlException(ErrorCode.ERR_BAD_DB_ERROR, dbName); } - // check cluster capacity - Catalog.getCurrentSystemInfo().checkClusterCapacity(stmt.getClusterName()); - // check db quota - db.checkQuota(); + // only internal table should check quota and cluster capacity + if (!stmt.isExternal()) { + // check cluster capacity + Catalog.getCurrentSystemInfo().checkClusterCapacity(stmt.getClusterName()); + // check db quota + db.checkQuota(); + } // check if table exists in db if (!isRestore) { @@ -2495,6 +2631,8 @@ public class Catalog { return createKuduTable(db, stmt); } else if (engineName.equals("broker")) { return createBrokerTable(db, stmt, isRestore); + } else if (engineName.equalsIgnoreCase("elasticsearch") || engineName.equalsIgnoreCase("es")) { + return createEsTable(db, stmt); } else { ErrorReport.reportDdlException(ErrorCode.ERR_UNKNOWN_STORAGE_ENGINE, engineName); } @@ -2697,7 +2835,6 @@ public class Catalog { rangePartitionInfo.handleNewSinglePartitionDesc(singlePartitionDesc, partitionId); olapTable.addPartition(partition); - // log PartitionPersistInfo info = new PartitionPersistInfo(db.getId(), olapTable.getId(), partition, rangePartitionInfo.getRange(partitionId), dataProperty, @@ -2951,8 +3088,7 @@ public class Catalog { // version and version hash if (versionInfo != null) { - partition.setCommittedVersion(versionInfo.first); - partition.setCommittedVersionHash(versionInfo.second); + partition.updateCommitVersionAndVersionHash(versionInfo.first, versionInfo.second); } long version = partition.getCommittedVersion(); long versionHash = partition.getCommittedVersionHash(); @@ -3033,7 +3169,6 @@ public class Catalog { partition.createRollupIndex(index); } } // end for indexMap - return partition; } @@ -3269,6 +3404,36 @@ public class Catalog { return returnTable; } + + private Table createEsTable(Database db, CreateTableStmt stmt) throws DdlException { + String tableName = stmt.getTableName(); + + // create columns + List baseSchema = stmt.getColumns(); + validateColumns(baseSchema); + + // create partition info + PartitionDesc partitionDesc = stmt.getPartitionDesc(); + PartitionInfo partitionInfo = null; + Map partitionNameToId = Maps.newHashMap(); + if (partitionDesc != null) { + partitionInfo = partitionDesc.toPartitionInfo(baseSchema, partitionNameToId); + } else { + long partitionId = getNextId(); + // use table name as single partition name + partitionNameToId.put(tableName, partitionId); + partitionInfo = new SinglePartitionInfo(); + } + + long tableId = Catalog.getInstance().getNextId(); + EsTable esTable = 
new EsTable(tableId, tableName, baseSchema, stmt.getProperties(), partitionInfo); + + if (!db.createTableWithLock(esTable, false, stmt.isSetIfNotExists())) { + ErrorReport.reportDdlException(ErrorCode.ERR_CANT_CREATE_TABLE, tableName, "table already exist"); + } + LOG.info("successfully create table{} with id {}", tableName, tableId); + return esTable; + } private Table createKuduTable(Database db, CreateTableStmt stmt) throws DdlException { String tableName = stmt.getTableName(); @@ -3395,7 +3560,8 @@ public class Catalog { // 1.2 other table type sb.append("CREATE "); - if (table.getType() == TableType.KUDU || table.getType() == TableType.MYSQL) { + if (table.getType() == TableType.KUDU || table.getType() == TableType.MYSQL + || table.getType() == TableType.ELASTICSEARCH) { sb.append("EXTERNAL "); } sb.append("TABLE "); @@ -3535,6 +3701,33 @@ public class Catalog { } sb.append(";"); + } else if (table.getType() == TableType.ELASTICSEARCH) { + EsTable esTable = (EsTable) table; + + // partition + PartitionInfo partitionInfo = esTable.getPartitionInfo(); + if (partitionInfo.getType() == PartitionType.RANGE) { + sb.append("\n"); + sb.append("PARTITION BY RANGE("); + idx = 0; + RangePartitionInfo rangePartitionInfo = (RangePartitionInfo) partitionInfo; + for (Column column : rangePartitionInfo.getPartitionColumns()) { + if (idx != 0) { + sb.append(", "); + } + sb.append("`").append(column.getName()).append("`"); + } + sb.append(")\n()"); + } + + // properties + sb.append("\nPROPERTIES (\n"); + sb.append("\"host\" = \"").append(esTable.getHosts()).append("\",\n"); + sb.append("\"user\" = \"").append(esTable.getUserName()).append("\",\n"); + sb.append("\"password\" = \"").append(hidePassword ? "" : esTable.getPasswd()).append("\",\n"); + sb.append("\"index\" = \"").append(esTable.getIndexName()).append("\"\n"); + sb.append("\"type\" = \"").append(esTable.getMappingType()).append("\"\n"); + sb.append(");"); } createTableStmt.add(sb.toString()); @@ -3662,19 +3855,13 @@ public class Catalog { String dbName = stmt.getDbName(); String tableName = stmt.getTableName(); - Database db = null; - Table table = null; - readLock(); - try { - // check database - db = this.fullNameToDb.get(dbName); - if (db == null) { - ErrorReport.reportDdlException(ErrorCode.ERR_BAD_DB_ERROR, dbName); - } - } finally { - readUnlock(); + // check database + Database db = this.fullNameToDb.get(dbName); + if (fullNameToDb.get(dbName) == null) { + ErrorReport.reportDdlException(ErrorCode.ERR_BAD_DB_ERROR, dbName); } - + + Table table = null; db.writeLock(); try { table = db.getTable(tableName); @@ -3699,14 +3886,14 @@ public class Catalog { } unprotectDropTable(db, table.getId()); + + DropInfo info = new DropInfo(db.getId(), table.getId(), -1L); + editLog.logDropTable(info); } finally { db.writeUnlock(); } - DropInfo info = new DropInfo(db.getId(), table.getId(), -1L); - editLog.logDropTable(info); - - LOG.info("finish drop table[{}] from db[{}]", tableName, dbName); + LOG.info("finished dropping table: {} from db: {}", tableName, dbName); } public boolean unprotectDropTable(Database db, long tableId) { @@ -3716,10 +3903,7 @@ public class Catalog { return false; } - if (table.getType() == TableType.OLAP) { - OlapTable olapTable = (OlapTable) table; - cancelJobsWithTable(db, olapTable); - } else if (table.getType() == TableType.KUDU) { + if (table.getType() == TableType.KUDU) { KuduTable kuduTable = (KuduTable) table; KuduClient client = KuduUtil.createKuduClient(kuduTable.getMasterAddrs()); try { @@ -3733,6 +3917,8 @@ public 
class Catalog { } catch (KuduException e) { LOG.warn("failed to delete kudu table {} when replay", kuduTable.getName(), e); } + } else if (table.getType() == TableType.ELASTICSEARCH) { + esStateStore.deRegisterTable(tableId); } db.dropTable(table.getName()); @@ -3766,16 +3952,6 @@ public class Catalog { } } - private void cancelJobsWithTable(Database db, OlapTable olapTable) { - // remove related jobs - RollupHandler rollupHandler = Catalog.getInstance().getRollupHandler(); - rollupHandler.cancelWithTable(olapTable); - SchemaChangeHandler schemaChangeHandler = Catalog.getInstance().getSchemaChangeHandler(); - schemaChangeHandler.cancelWithTable(olapTable); - Clone clone = Catalog.getInstance().getCloneInstance(); - clone.cancelCloneJob(olapTable); - } - public void handleJobsWhenDeleteReplica(long tableId, long partitionId, long indexId, long tabletId, long replicaId, long backendId) { // rollup @@ -3789,13 +3965,18 @@ public class Catalog { } public void unprotectAddReplica(ReplicaPersistInfo info) { + LOG.debug("replay add a replica {}", info); Database db = getDb(info.getDbId()); OlapTable olapTable = (OlapTable) db.getTable(info.getTableId()); Partition partition = olapTable.getPartition(info.getPartitionId()); MaterializedIndex materializedIndex = partition.getIndex(info.getIndexId()); Tablet tablet = materializedIndex.getTablet(info.getTabletId()); Replica replica = new Replica(info.getReplicaId(), info.getBackendId(), info.getVersion(), - info.getVersionHash(), info.getDataSize(), info.getRowCount(), ReplicaState.NORMAL); + info.getVersionHash(), info.getDataSize(), info.getRowCount(), ReplicaState.NORMAL, + info.getLastFailedVersion(), + info.getLastFailedVersionHash(), + info.getLastSuccessVersion(), + info.getLastSuccessVersionHash()); tablet.addReplica(replica); } @@ -3829,7 +4010,7 @@ public class Catalog { } public void addFrontendWithCheck(Frontend fe) { - writeLock(); + tryLock(true); try { Frontend existFe = checkFeExist(fe.getHost(), fe.getEditLogPort()); if (existFe != null) { @@ -3855,12 +4036,12 @@ public class Catalog { // helper sockets will be added after start BDBHA } } finally { - writeUnlock(); + unlock(); } } public void replayDropFrontend(Frontend frontend) { - writeLock(); + tryLock(true); try { Frontend removedFe = frontends.remove(frontend.getNodeName()); if (removedFe == null) { @@ -3869,7 +4050,7 @@ public class Catalog { } removedFrontends.add(removedFe.getNodeName()); } finally { - writeUnlock(); + unlock(); } } @@ -3878,37 +4059,24 @@ public class Catalog { } public Database getDb(String name) { - readLock(); - try { - if (fullNameToDb.containsKey(name)) { - return fullNameToDb.get(name); - } else { - // This maybe a information_schema db request, and information_schema db name is case insensitive. - // So, we first extract db name to check if it is information_schema. - // Then we reassemble the origin cluster name with lower case db name, - // and finally get information_schema db from the name map. - String dbName = ClusterNamespace.getNameFromFullName(name); - if (dbName.equalsIgnoreCase(InfoSchemaDb.DATABASE_NAME)) { - String clusterName = ClusterNamespace.getClusterNameFromFullName(name); - return fullNameToDb.get(ClusterNamespace.getFullName(clusterName, dbName.toLowerCase())); - } - } - return null; - } finally { - readUnlock(); - } + if (fullNameToDb.containsKey(name)) { + return fullNameToDb.get(name); + } else { + // This maybe a information_schema db request, and information_schema db name is case insensitive. 
+ // So, we first extract db name to check if it is information_schema. + // Then we reassemble the origin cluster name with lower case db name, + // and finally get information_schema db from the name map. + String dbName = ClusterNamespace.getNameFromFullName(name); + if (dbName.equalsIgnoreCase(InfoSchemaDb.DATABASE_NAME)) { + String clusterName = ClusterNamespace.getClusterNameFromFullName(name); + return fullNameToDb.get(ClusterNamespace.getFullName(clusterName, dbName.toLowerCase())); + } + } + return null; } public Database getDb(long dbId) { - readLock(); - try { - if (idToDb.containsKey(dbId)) { - return idToDb.get(dbId); - } - return null; - } finally { - readUnlock(); - } + return idToDb.get(dbId); } public EditLog getEditLog() { @@ -3923,37 +4091,19 @@ public class Catalog { } public List getDbNames() { - readLock(); - try { - List dbNames = Lists.newArrayList(fullNameToDb.keySet()); - return dbNames; - } finally { - readUnlock(); - } + return Lists.newArrayList(fullNameToDb.keySet()); } public List getClusterDbNames(String clusterName) throws AnalysisException { - readLock(); - try { - final Cluster cluster = nameToCluster.get(clusterName); - if (cluster == null) { - throw new AnalysisException("No cluster selected"); - } - List dbNames = Lists.newArrayList(cluster.getDbNames()); - return dbNames; - } finally { - readUnlock(); + final Cluster cluster = nameToCluster.get(clusterName); + if (cluster == null) { + throw new AnalysisException("No cluster selected"); } + return Lists.newArrayList(cluster.getDbNames()); } public List getDbIds() { - readLock(); - try { - List dbIds = Lists.newArrayList(idToDb.keySet()); - return dbIds; - } finally { - readUnlock(); - } + return Lists.newArrayList(idToDb.keySet()); } public HashMap getPartitionIdToStorageMediumMap() { @@ -4133,10 +4283,6 @@ public class Catalog { return this.selfNode; } - public Pair getSelfHostname() { - return this.selfHostname; - } - public String getNodeName() { return this.nodeName; } @@ -4169,6 +4315,10 @@ public class Catalog { } return this.masterIp; } + + public EsStateStore getEsStateStore() { + return this.esStateStore; + } public void setMaster(MasterInfo info) { this.masterIp = info.getIp(); @@ -4308,7 +4458,7 @@ public class Catalog { * used for handling AlterTableStmt (for client is the ALTER TABLE command). * including SchemaChangeHandler and RollupHandler */ - public void alterTable(AlterTableStmt stmt) throws DdlException, InternalException { + public void alterTable(AlterTableStmt stmt) throws DdlException, UserException { this.alter.processAlterTable(stmt); } @@ -4509,7 +4659,7 @@ public class Catalog { * used for handling AlterClusterStmt * (for client is the ALTER CLUSTER command). */ - public void alterCluster(AlterSystemStmt stmt) throws DdlException, InternalException { + public void alterCluster(AlterSystemStmt stmt) throws DdlException, UserException { this.alter.processAlterCluster(stmt); } @@ -4607,7 +4757,7 @@ public class Catalog { newView.setOriginalViewDef(stmt.getInlineViewDef()); try { newView.init(); - } catch (InternalException e) { + } catch (UserException e) { throw new DdlException(e.getMessage()); } @@ -4639,7 +4789,9 @@ public class Catalog { */ public void createCluster(CreateClusterStmt stmt) throws DdlException { final String clusterName = stmt.getClusterName(); - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. 
Try again"); + } try { if (nameToCluster.containsKey(clusterName)) { ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_HAS_EXIST, clusterName); @@ -4649,9 +4801,7 @@ public class Catalog { // 2: BE returned is more than or equal to 0, succeeds. if (backendList != null || stmt.getInstanceNum() == 0) { final long id = getNextId(); - final Cluster cluster = new Cluster(); - cluster.setName(clusterName); - cluster.setId(id); + final Cluster cluster = new Cluster(clusterName, id); cluster.setBackendIdList(backendList); unprotectCreateCluster(cluster); if (clusterName.equals(SystemInfoService.DEFAULT_CLUSTER)) { @@ -4668,7 +4818,7 @@ public class Catalog { } } } finally { - writeUnlock(); + unlock(); } // create super user for this cluster @@ -4710,11 +4860,11 @@ public class Catalog { * @param cluster */ public void replayCreateCluster(Cluster cluster) { - writeLock(); + tryLock(true); try { unprotectCreateCluster(cluster); } finally { - writeUnlock(); + unlock(); } } @@ -4725,7 +4875,9 @@ public class Catalog { * @throws DdlException */ public void dropCluster(DropClusterStmt stmt) throws DdlException { - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. Try again"); + } try { final String clusterName = stmt.getClusterName(); final Cluster cluster = nameToCluster.get(clusterName); @@ -4749,7 +4901,7 @@ public class Catalog { unprotectDropCluster(info, false /* is not replay */); editLog.logDropCluster(info); } finally { - writeUnlock(); + unlock(); } // drop user of this cluster @@ -4767,18 +4919,18 @@ public class Catalog { } public void replayDropCluster(ClusterInfo info) { - writeLock(); + tryLock(true); try { unprotectDropCluster(info, true/* is replay */); } finally { - writeUnlock(); + unlock(); } auth.dropUserOfCluster(info.getClusterName(), true /* is replay */); } public void replayExpandCluster(ClusterInfo info) { - writeLock(); + tryLock(true); try { final Cluster cluster = nameToCluster.get(info.getClusterName()); cluster.addBackends(info.getBackendIdList()); @@ -4792,7 +4944,7 @@ public class Catalog { be.setBackendState(BackendState.using); } } finally { - writeUnlock(); + unlock(); } } @@ -4805,7 +4957,9 @@ public class Catalog { public void processModifyCluster(AlterClusterStmt stmt) throws DdlException { final String clusterName = stmt.getAlterClusterName(); final int newInstanceNum = stmt.getInstanceNum(); - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. Try again"); + } try { Cluster cluster = nameToCluster.get(clusterName); if (cluster == null) { @@ -4865,7 +5019,7 @@ public class Catalog { } } finally { - writeUnlock(); + unlock(); } } @@ -4889,7 +5043,7 @@ public class Catalog { } /** - * migrate db to link des cluster + * migrate db to link dest cluster * * @param stmt * @throws DdlException @@ -4900,7 +5054,9 @@ public class Catalog { final String srcDbName = stmt.getSrcDb(); final String destDbName = stmt.getDestDb(); - writeLock(); + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. Try again"); + } try { if (!nameToCluster.containsKey(srcClusterName)) { ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_SRC_CLUSTER_NOT_EXIST, srcClusterName); @@ -4926,7 +5082,7 @@ public class Catalog { // if the max replication num of the src db is larger then the backends num of the dest cluster, // the migration will not be processed. 
- final int maxReplicationNum = getDbMaxReplicationNum(db); + final int maxReplicationNum = db.getMaxReplicationNum(); if (maxReplicationNum > destCluster.getBackendIdList().size()) { ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_MIGRATE_BE_NOT_ENOUGH, destClusterName); } @@ -4958,48 +5114,17 @@ public class Catalog { ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_MIGRATION_NO_LINK, srcDbName, destDbName); } } finally { - writeUnlock(); + unlock(); } } - /** - * return max replicationNum of a db - * - * @param db - * @return - */ - private int getDbMaxReplicationNum(Database db) { - int ret = 0; - final Set tableNames = db.getTableNamesWithLock(); - db.readLock(); - try { - for (String tableName : tableNames) { - Table table = db.getTable(tableName); - if (table == null || table.getType() != TableType.OLAP) { - continue; - } - OlapTable olapTable = (OlapTable) table; - for (Partition partition : olapTable.getPartitions()) { - short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partition.getId()); - if (ret < replicationNum) { - ret = replicationNum; - } - } - - } - } finally { - db.readUnlock(); - } - return ret; - } - public void replayMigrateDb(BaseParam param) { final String desDbName = param.getStringParam(); final String srcDbName = param.getStringParam(1); final String desClusterName = param.getStringParam(2); final String srcClusterName = param.getStringParam(3); + tryLock(true); try { - writeLock(); final Cluster desCluster = this.nameToCluster.get(desClusterName); final Cluster srcCluster = this.nameToCluster.get(srcClusterName); final Database db = fullNameToDb.get(srcDbName); @@ -5017,7 +5142,7 @@ public class Catalog { db.writeUnlock(); } } finally { - writeUnlock(); + unlock(); } } @@ -5026,7 +5151,7 @@ public class Catalog { final String srcDbName = param.getStringParam(1); final String desDbName = param.getStringParam(); - writeLock(); + tryLock(true); try { final Cluster desCluster = this.nameToCluster.get(desClusterName); final Database srcDb = fullNameToDb.get(srcDbName); @@ -5037,7 +5162,7 @@ public class Catalog { desCluster.addLinkDb(param); fullNameToDb.put(desDbName, srcDb); } finally { - writeUnlock(); + unlock(); } } @@ -5052,7 +5177,10 @@ public class Catalog { final String destClusterName = stmt.getDestCluster(); final String srcDbName = stmt.getSrcDb(); final String destDbName = stmt.getDestDb(); - writeLock(); + + if (!tryLock(false)) { + throw new DdlException("Failed to acquire catalog lock. 
Try again"); + } try { if (!nameToCluster.containsKey(srcClusterName)) { ErrorReport.reportDdlException(ErrorCode.ERR_CLUSTER_SRC_CLUSTER_NOT_EXIST, srcClusterName); @@ -5103,17 +5231,12 @@ public class Catalog { fullNameToDb.put(destDbName, srcDb); editLog.logLinkCluster(param); } finally { - writeUnlock(); + unlock(); } } public Cluster getCluster(String clusterName) { - readLock(); - try { - return nameToCluster.get(clusterName); - } finally { - readUnlock(); - } + return nameToCluster.get(clusterName); } public List getClusterNames() { @@ -5127,14 +5250,15 @@ public class Catalog { public Set getMigrations() { final Set infos = Sets.newHashSet(); for (Database db : fullNameToDb.values()) { - if (db.getDbState() == DbState.MOVE) { - int tabletTotal = 0; - int tabletQuorum = 0; - final Set ids = Sets.newHashSet(systemInfo.getClusterBackendIds(db.getClusterName())); - final Set tableNames = db.getTableNamesWithLock(); - for (String tableName : tableNames) { - db.readLock(); - try { + db.readLock(); + try { + if (db.getDbState() == DbState.MOVE) { + int tabletTotal = 0; + int tabletQuorum = 0; + final Set beIds = Sets.newHashSet(systemInfo.getClusterBackendIds(db.getClusterName())); + final Set tableNames = db.getTableNamesWithLock(); + for (String tableName : tableNames) { + Table table = db.getTable(tableName); if (table == null || table.getType() != TableType.OLAP) { continue; @@ -5153,7 +5277,7 @@ public class Catalog { int quorum = replicationNum / 2 + 1; for (Replica replica : tablet.getReplicas()) { if (replica.getState() != ReplicaState.CLONE - && ids.contains(replica.getBackendId())) { + && beIds.contains(replica.getBackendId())) { replicaNum++; } } @@ -5166,18 +5290,17 @@ public class Catalog { } } } - } finally { - db.readUnlock(); } + final BaseParam info = new BaseParam(); + info.addStringParam(db.getClusterName()); + info.addStringParam(db.getAttachDb()); + info.addStringParam(db.getFullName()); + final float percentage = tabletTotal > 0 ? (float) tabletQuorum / (float) tabletTotal : 0f; + info.addFloatParam(percentage); + infos.add(info); } - final BaseParam info = new BaseParam(); - info.addStringParam(db.getClusterName()); - info.addStringParam(db.getAttachDb()); - info.addStringParam(db.getFullName()); - final float percentage = tabletTotal > 0 ? (float) tabletQuorum / (float) tabletTotal : 0f; - info.addFloatParam(percentage); - infos.add(info); - db.getDbState(); + } finally { + db.readUnlock(); } } @@ -5189,8 +5312,7 @@ public class Catalog { int clusterCount = dis.readInt(); checksum ^= clusterCount; for (long i = 0; i < clusterCount; ++i) { - final Cluster cluster = new Cluster(); - cluster.readFields(dis); + final Cluster cluster = Cluster.read(dis); checksum ^= cluster.getId(); List latestBackendIds = systemInfo.getClusterBackendIds(cluster.getName()); @@ -5216,7 +5338,7 @@ public class Catalog { return checksum; } - private void initDefaultCluster() { + public void initDefaultCluster() { final List backendList = Lists.newArrayList(); final List defaultClusterBackends = systemInfo.getClusterBackends(SystemInfoService.DEFAULT_CLUSTER); for (Backend backend : defaultClusterBackends) { @@ -5224,9 +5346,7 @@ public class Catalog { } final long id = getNextId(); - final Cluster cluster = new Cluster(); - cluster.setName(SystemInfoService.DEFAULT_CLUSTER); - cluster.setId(id); + final Cluster cluster = new Cluster(SystemInfoService.DEFAULT_CLUSTER, id); // make sure one host hold only one backend. 
Set beHost = Sets.newHashSet(); @@ -5319,17 +5439,57 @@ public class Catalog { public void replayUpdateClusterAndBackends(BackendIdsUpdateInfo info) { for (long id : info.getBackendList()) { final Backend backend = systemInfo.getBackend(id); - writeLock(); - try { - final Cluster cluster = nameToCluster.get(backend.getOwnerClusterName()); - cluster.removeBackend(id); - } finally { - writeUnlock(); - } + final Cluster cluster = nameToCluster.get(backend.getOwnerClusterName()); + cluster.removeBackend(id); backend.setDecommissioned(false); backend.clearClusterName(); backend.setBackendState(BackendState.free); } } + + public String dumpImage() { + LOG.info("begin to dump meta data"); + String dumpFilePath; + Map lockedDbMap = Maps.newTreeMap(); + tryLock(true); + try { + // sort all dbs + for (long dbId : getDbIds()) { + Database db = getDb(dbId); + Preconditions.checkNotNull(db); + lockedDbMap.put(dbId, db); + } + + // lock all dbs + for (Database db : lockedDbMap.values()) { + db.readLock(); + } + LOG.info("acquired all the dbs' read lock."); + + load.readLock(); + + LOG.info("acquired all jobs' read lock."); + long journalId = getMaxJournalId(); + File dumpFile = new File(Config.meta_dir, "image." + journalId); + dumpFilePath = dumpFile.getAbsolutePath(); + try { + LOG.info("begin to dump {}", dumpFilePath); + saveImage(dumpFile, journalId); + } catch (IOException e) { + LOG.error("failed to dump image to {}", dumpFilePath, e); + } + } finally { + // unlock all + load.readUnlock(); + for (Database db : lockedDbMap.values()) { + db.readUnlock(); + } + + unlock(); + } + + LOG.info("finished dumpping image to {}", dumpFilePath); + return dumpFilePath; + } } diff --git a/fe/src/main/java/com/baidu/palo/catalog/Column.java b/fe/src/main/java/com/baidu/palo/catalog/Column.java index 93d827ea22..ce2f0c3baf 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/Column.java +++ b/fe/src/main/java/com/baidu/palo/catalog/Column.java @@ -152,11 +152,14 @@ public class Column implements Writable { return this.columnType.getType(); } - // TODO(zc): public Type getType() { return ScalarType.createType(columnType.getType()); } + public Type getOriginType() { + return columnType.getTypeDesc(); + } + public int getStrLen() { return this.columnType.getLen(); } @@ -186,6 +189,10 @@ public class Column implements Writable { return isAllowNull; } + public void setIsAllowNull(boolean isAllowNull) { + this.isAllowNull = isAllowNull; + } + public String getDefaultValue() { return this.defaultValue; } diff --git a/fe/src/main/java/com/baidu/palo/catalog/Database.java b/fe/src/main/java/com/baidu/palo/catalog/Database.java index d4eb271a43..9ac3794862 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/Database.java +++ b/fe/src/main/java/com/baidu/palo/catalog/Database.java @@ -20,6 +20,26 @@ package com.baidu.palo.catalog; +import com.baidu.palo.catalog.MaterializedIndex.IndexState; +import com.baidu.palo.catalog.Replica.ReplicaState; +import com.baidu.palo.catalog.Table.TableType; +import com.baidu.palo.cluster.ClusterNamespace; +import com.baidu.palo.common.DdlException; +import com.baidu.palo.common.FeConstants; +import com.baidu.palo.common.FeMetaVersion; +import com.baidu.palo.common.Pair; +import com.baidu.palo.common.io.Text; +import com.baidu.palo.common.io.Writable; +import com.baidu.palo.common.util.DebugUtil; +import com.baidu.palo.persist.CreateTableInfo; +import com.baidu.palo.system.SystemInfoService; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Maps; + +import 
org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; @@ -35,24 +55,6 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.zip.Adler32; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import com.baidu.palo.catalog.MaterializedIndex.IndexState; -import com.baidu.palo.catalog.Replica.ReplicaState; -import com.baidu.palo.catalog.Table.TableType; -import com.baidu.palo.cluster.ClusterNamespace; -import com.baidu.palo.common.DdlException; -import com.baidu.palo.common.FeConstants; -import com.baidu.palo.common.FeMetaVersion; -import com.baidu.palo.common.Pair; -import com.baidu.palo.common.io.Text; -import com.baidu.palo.common.io.Writable; -import com.baidu.palo.common.util.DebugUtil; -import com.baidu.palo.persist.CreateTableInfo; -import com.baidu.palo.system.SystemInfoService; -import com.google.common.base.Preconditions; - /** * Internal representation of db-related metadata. Owned by Catalog instance. * Not thread safe. @@ -77,6 +79,9 @@ public class Database extends MetaObject implements Writable { private String fullQualifiedName; private String clusterName; private ReentrantReadWriteLock rwLock; + + // temp for trace + private Map allLocks = Maps.newConcurrentMap(); // table family group map private Map idToTable; @@ -112,28 +117,32 @@ public class Database extends MetaObject implements Writable { public void readLock() { this.rwLock.readLock().lock(); + allLocks.put(Long.toString(Thread.currentThread().getId()), new Exception()); } - public boolean tryReadLock(long timeout, TimeUnit unit) { - try { - return this.rwLock.readLock().tryLock(timeout, unit); - } catch (InterruptedException e) { - LOG.warn("failed to try read lock at db[" + id + "]", e); - return false; + public void printLocks() { + for (Throwable exception: allLocks.values()) { + LOG.debug("a lock in db [{}]", fullQualifiedName, exception); } } public void readUnlock() { this.rwLock.readLock().unlock(); + allLocks.remove(Long.toString(Thread.currentThread().getId())); } public void writeLock() { this.rwLock.writeLock().lock(); + allLocks.put(Long.toString(Thread.currentThread().getId()), new Exception()); } public boolean tryWriteLock(long timeout, TimeUnit unit) { try { - return this.rwLock.writeLock().tryLock(timeout, unit); + boolean result = this.rwLock.writeLock().tryLock(timeout, unit); + if (result) { + allLocks.put(Long.toString(Thread.currentThread().getId()), new Exception()); + } + return result; } catch (InterruptedException e) { LOG.warn("failed to try write lock at db[" + id + "]", e); return false; @@ -142,6 +151,7 @@ public class Database extends MetaObject implements Writable { public void writeUnlock() { this.rwLock.writeLock().unlock(); + allLocks.remove(Long.toString(Thread.currentThread().getId())); } public boolean isWriteLockHeldByCurrentThread() { @@ -251,6 +261,9 @@ public class Database extends MetaObject implements Writable { CreateTableInfo info = new CreateTableInfo(fullQualifiedName, table); Catalog.getInstance().getEditLog().logCreateTable(info); } + if (table.getType() == TableType.ELASTICSEARCH) { + Catalog.getCurrentCatalog().getEsStateStore().registerTable((EsTable)table); + } } return result; } finally { @@ -320,6 +333,28 @@ public class Database extends MetaObject implements Writable { return idToTable.get(tableId); } + public int getMaxReplicationNum() { + int ret = 0; + 
readLock(); + try { + for (Table table : idToTable.values()) { + if (table.getType() != TableType.OLAP) { + continue; + } + OlapTable olapTable = (OlapTable) table; + for (Partition partition : olapTable.getPartitions()) { + short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partition.getId()); + if (ret < replicationNum) { + ret = replicationNum; + } + } + } + } finally { + readUnlock(); + } + return ret; + } + public static Database read(DataInput in) throws IOException { Database db = new Database(); db.readFields(in); diff --git a/fe/src/main/java/com/baidu/palo/catalog/OlapTable.java b/fe/src/main/java/com/baidu/palo/catalog/OlapTable.java index b94f9ffb99..4455c8b065 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/OlapTable.java +++ b/fe/src/main/java/com/baidu/palo/catalog/OlapTable.java @@ -1,5 +1,4 @@ // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -12,9 +11,10 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - package com.baidu.palo.catalog; +import com.baidu.palo.alter.AlterJob.JobState; +import com.baidu.palo.alter.RollupJob; import com.baidu.palo.analysis.AddPartitionClause; import com.baidu.palo.analysis.AddRollupClause; import com.baidu.palo.analysis.AlterClause; @@ -194,7 +194,6 @@ public class OlapTable extends Table { indexIdToSchemaHash.put(indexId, schemaHash); indexIdToShortKeyColumnCount.put(indexId, shortKeyColumnCount); } - public void setIndexStorageType(Long indexId, TStorageType newStorageType) { Preconditions.checkState(newStorageType == TStorageType.COLUMN); indexIdToStorageType.put(indexId, newStorageType); @@ -519,6 +518,17 @@ public class OlapTable extends Table { this.bfColumns = bfColumns; this.bfFpp = bfFpp; } + + // when the table is creating new rollup and enter finishing state, should tell be not auto load to new rollup + // it is used for stream load + // the caller should get db lock when call this method + public boolean shouldLoadToNewRollup() { + RollupJob rollupJob = (RollupJob) Catalog.getInstance().getRollupHandler().getAlterJob(id); + if (rollupJob != null && rollupJob.getState() == JobState.FINISHING) { + return false; + } + return true; + } public TTableDescriptor toThrift() { TOlapTable tOlapTable = new TOlapTable(getName()); @@ -923,4 +933,3 @@ public class OlapTable extends Table { return copied; } } - diff --git a/fe/src/main/java/com/baidu/palo/catalog/Partition.java b/fe/src/main/java/com/baidu/palo/catalog/Partition.java index a0c6c84c46..87228f8d23 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/Partition.java +++ b/fe/src/main/java/com/baidu/palo/catalog/Partition.java @@ -1,6 +1,7 @@ // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // @@ -16,8 +17,10 @@ package com.baidu.palo.catalog; import com.baidu.palo.catalog.DistributionInfo.DistributionInfoType; +import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.io.Writable; +import com.baidu.palo.common.util.Util; import java.io.DataInput; import java.io.DataOutput; @@ -50,23 +53,33 @@ public class Partition extends MetaObject implements Writable { private long committedVersion; private long committedVersionHash; + private long nextVersion; + private long nextVersionHash; + // not have currentVersion because currentVersion = nextVersion - 1 + private long currentVersionHash; + private DistributionInfo distributionInfo; public Partition() { this.idToRollupIndex = new HashMap(); } - public Partition(long id, String name, MaterializedIndex baseIndex, DistributionInfo distributionInfo) { + public Partition(long id, String name, + MaterializedIndex baseIndex, DistributionInfo distributionInfo) { this.id = id; this.name = name; this.state = PartitionState.NORMAL; - + this.baseIndex = baseIndex; this.idToRollupIndex = new HashMap(); this.committedVersion = PARTITION_INIT_VERSION; this.committedVersionHash = PARTITION_INIT_VERSION_HASH; this.distributionInfo = distributionInfo; + // PARTITION_INIT_VERSION == 1, so the first load version is 2 !!! + this.nextVersion = PARTITION_INIT_VERSION + 1; + this.nextVersionHash = Util.generateVersionHash(); + this.currentVersionHash = PARTITION_INIT_VERSION_HASH; } public void setIdForRestore(long id) { @@ -89,22 +102,32 @@ public class Partition extends MetaObject implements Writable { this.state = state; } + public void updateCommitVersionAndVersionHash(long committedVersion, long committedVersionHash) { + this.committedVersion = committedVersion; + this.committedVersionHash = committedVersionHash; + // if it is upgrade from old palo cluster, then should update next version info + if (Catalog.getCurrentCatalogJournalVersion() < FeMetaVersion.VERSION_45) { + // the partition is created and not import any data + if (committedVersion == PARTITION_INIT_VERSION + 1 && committedVersionHash == PARTITION_INIT_VERSION_HASH) { + this.nextVersion = PARTITION_INIT_VERSION + 1; + this.nextVersionHash = Util.generateVersionHash(); + this.currentVersionHash = PARTITION_INIT_VERSION_HASH; + } else { + this.nextVersion = committedVersion + 1; + this.nextVersionHash = Util.generateVersionHash(); + this.currentVersionHash = committedVersionHash; + } + } + } + public long getCommittedVersion() { return committedVersion; } - public void setCommittedVersion(long committedVersion) { - this.committedVersion = committedVersion; - } - public long getCommittedVersionHash() { return committedVersionHash; } - public void setCommittedVersionHash(long committedVersionHash) { - this.committedVersionHash = committedVersionHash; - } - public PartitionState getState() { return this.state; } @@ -125,6 +148,31 @@ public class Partition extends MetaObject implements Writable { return baseIndex; } + public long getNextVersion() { + return nextVersion; + } + + public void setNextVersion(long nextVersion) { + this.nextVersion = nextVersion; + } + + public long getNextVersionHash() { + return nextVersionHash; + } + + public void setNextVersionHash(long nextVersionHash, long currentVersionHash) { + this.currentVersionHash = currentVersionHash; + this.nextVersionHash = nextVersionHash; + } + + public long getCurrentVersion() { + return Math.max(this.nextVersion - 1, 2); + } + + public 
long getCurrentVersionHash() { + return currentVersionHash; + } + public List getRollupIndices() { List rollupIndices = new ArrayList(idToRollupIndex.size()); for (Map.Entry entry : idToRollupIndex.entrySet()) { @@ -164,6 +212,7 @@ public class Partition extends MetaObject implements Writable { out.writeLong(id); Text.writeString(out, name); Text.writeString(out, state.name()); + baseIndex.write(out); int rollupCount = (idToRollupIndex != null) ? idToRollupIndex.size() : 0; @@ -177,6 +226,10 @@ public class Partition extends MetaObject implements Writable { out.writeLong(committedVersion); out.writeLong(committedVersionHash); + out.writeLong(nextVersion); + out.writeLong(nextVersionHash); + out.writeLong(currentVersionHash); + Text.writeString(out, distributionInfo.getType().name()); distributionInfo.write(out); } @@ -187,6 +240,7 @@ public class Partition extends MetaObject implements Writable { id = in.readLong(); name = Text.readString(in); state = PartitionState.valueOf(Text.readString(in)); + baseIndex = MaterializedIndex.read(in); int rollupCount = in.readInt(); @@ -197,7 +251,22 @@ public class Partition extends MetaObject implements Writable { committedVersion = in.readLong(); committedVersionHash = in.readLong(); - + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_45) { + nextVersion = in.readLong(); + nextVersionHash = in.readLong(); + currentVersionHash = in.readLong(); + } else { + // the partition is created and not import any data + if (committedVersion == PARTITION_INIT_VERSION + 1 && committedVersionHash == PARTITION_INIT_VERSION_HASH) { + this.nextVersion = PARTITION_INIT_VERSION + 1; + this.nextVersionHash = Util.generateVersionHash(); + this.currentVersionHash = PARTITION_INIT_VERSION_HASH; + } else { + this.nextVersion = committedVersion + 1; + this.nextVersionHash = Util.generateVersionHash(); + this.currentVersionHash = committedVersionHash; + } + } DistributionInfoType distriType = DistributionInfoType.valueOf(Text.readString(in)); if (distriType == DistributionInfoType.HASH) { distributionInfo = HashDistributionInfo.read(in); diff --git a/fe/src/main/java/com/baidu/palo/catalog/RangePartitionInfo.java b/fe/src/main/java/com/baidu/palo/catalog/RangePartitionInfo.java index c350ebaf45..fd57fa6db0 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/RangePartitionInfo.java +++ b/fe/src/main/java/com/baidu/palo/catalog/RangePartitionInfo.java @@ -170,10 +170,12 @@ public class RangePartitionInfo extends PartitionInfo { } } - public void handleNewSinglePartitionDesc(SingleRangePartitionDesc desc, long partitionId) throws DdlException { + public Range handleNewSinglePartitionDesc(SingleRangePartitionDesc desc, + long partitionId) throws DdlException { Preconditions.checkArgument(desc.isAnalyzed()); + Range range = null; try { - Range range = checkAndCreateRange(desc); + range = checkAndCreateRange(desc); idToRange.put(partitionId, range); } catch (IllegalArgumentException e) { // Range.closedOpen may throw this if (lower > upper) @@ -181,6 +183,7 @@ public class RangePartitionInfo extends PartitionInfo { } idToDataProperty.put(partitionId, desc.getPartitionDataProperty()); idToReplicationNum.put(partitionId, desc.getReplicationNum()); + return range; } // for catalog restore diff --git a/fe/src/main/java/com/baidu/palo/catalog/Replica.java b/fe/src/main/java/com/baidu/palo/catalog/Replica.java index 4b7eee861f..d4f7b49106 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/Replica.java +++ b/fe/src/main/java/com/baidu/palo/catalog/Replica.java @@ 
-1,5 +1,4 @@ // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -12,9 +11,9 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - package com.baidu.palo.catalog; +import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.io.Writable; @@ -41,6 +40,13 @@ public class Replica implements Writable { CLONE } + public enum ReplicaStatus { + OK, // health + DEAD, // backend is not available + VERSION_ERROR, // missing version + MISSING // replica does not exist + } + private long id; private long backendId; private long version; @@ -48,21 +54,32 @@ public class Replica implements Writable { private long dataSize; private long rowCount; private ReplicaState state; - private AtomicLong versionCount = new AtomicLong(-1); + + private long lastFailedVersion = -1L; + private long lastFailedVersionHash = 0L; + // not serialized, not very important + private long lastFailedTimestamp = 0; + private long lastSuccessVersion = -1L; + private long lastSuccessVersionHash = 0L; + + private AtomicLong versionCount = new AtomicLong(-1); public Replica() { } + // the new replica's version is -1 and last failed version is -1 public Replica(long replicaId, long backendId, ReplicaState state) { - this(replicaId, backendId, -1, 0, -1, -1, state); + this(replicaId, backendId, -1, 0, -1, -1, state, -1, 0, -1, 0); } public Replica(long replicaId, long backendId, ReplicaState state, long version, long versionHash) { - this(replicaId, backendId, version, versionHash, -1, -1, state); + this(replicaId, backendId, version, versionHash, -1, -1, state, -1L, 0L, version, versionHash); } public Replica(long replicaId, long backendId, long version, long versionHash, - long dataSize, long rowCount, ReplicaState state) { + long dataSize, long rowCount, ReplicaState state, + long lastFailedVersion, long lastFailedVersionHash, + long lastSuccessVersion, long lastSuccessVersionHash) { this.id = replicaId; this.backendId = backendId; this.version = version; @@ -73,6 +90,18 @@ public class Replica implements Writable { if (this.state == null) { this.state = ReplicaState.NORMAL; } + this.lastFailedVersion = lastFailedVersion; + this.lastFailedVersionHash = lastFailedVersionHash; + if (this.lastFailedVersion > 0) { + this.lastFailedTimestamp = System.currentTimeMillis(); + } + if (lastSuccessVersion < this.version) { + this.lastSuccessVersion = this.version; + this.lastSuccessVersionHash = this.versionHash; + } else { + this.lastSuccessVersion = lastSuccessVersion; + this.lastSuccessVersionHash = lastSuccessVersionHash; + } } public long getVersion() { @@ -98,17 +127,114 @@ public class Replica implements Writable { public long getRowCount() { return rowCount; } + public long getLastFailedVersion() { + return lastFailedVersion; + } + + public long getLastFailedVersionHash() { + return lastFailedVersionHash; + } + + public long getLastFailedTimestamp() { + return lastFailedTimestamp; + } + + public long getLastSuccessVersion() { + return lastSuccessVersion; + } + + public long getLastSuccessVersionHash() { + return lastSuccessVersionHash; + } + // only update data size and row num + public synchronized void updateStat(long dataSize, long rowNum) { + this.dataSize = dataSize; + this.rowCount = rowNum; + } - 
public void updateInfo(long newVersion, long newVersionHash, long newDataSize, long newRowCount) { + public synchronized void updateInfo(long newVersion, long newVersionHash, long newDataSize, long newRowCount) { + updateReplicaInfo(newVersion, newVersionHash, this.lastFailedVersion, this.lastFailedVersionHash, + this.lastSuccessVersion, this.lastSuccessVersionHash, newDataSize, newRowCount); + } + + public synchronized void updateVersionInfo(long newVersion, long newVersionHash, + long lastFailedVersion, long lastFailedVersionHash, + long lastSuccessVersion, long lastSuccessVersionHash) { + updateReplicaInfo(newVersion, newVersionHash, lastFailedVersion, lastFailedVersionHash, + lastSuccessVersion, lastSuccessVersionHash, dataSize, rowCount); + } + + private void updateReplicaInfo(long newVersion, long newVersionHash, + long lastFailedVersion, long lastFailedVersionHash, + long lastSuccessVersion, long lastSuccessVersionHash, + long newDataSize, long newRowCount) { if (newVersion < this.version) { LOG.warn("replica[" + id + "] new version is lower than meta version. " + newVersion + " vs " + version); + // yiguolei: could not find any reason why new version less than this.version should run??? + return; } this.version = newVersion; this.versionHash = newVersionHash; this.dataSize = newDataSize; this.rowCount = newRowCount; - - LOG.debug("update {}", this.toString()); + // just check it + if (lastSuccessVersion <= this.version) { + lastSuccessVersion = this.version; + lastSuccessVersionHash = this.versionHash; + } + if (this.lastSuccessVersion <= this.lastFailedVersion) { + this.lastSuccessVersion = this.version; + this.lastSuccessVersionHash = this.versionHash; + } + + if (this.version > lastFailedVersion && lastFailedVersion > 0) { + LOG.info("current version {} is larger than last failed version {} , " + + "last failed version hash {}, maybe a fatal error or be report version, print a stack here ", + this.version, lastFailedVersion, lastFailedVersionHash, new Exception()); + } + + if (lastFailedVersion != this.lastFailedVersion + || this.lastFailedVersionHash != lastFailedVersionHash) { + // if last failed version changed, then set last success version to invalid version + if (lastFailedVersion > this.lastFailedVersion) { + this.lastFailedVersion = lastFailedVersion; + this.lastFailedVersionHash = lastFailedVersionHash; + this.lastFailedTimestamp = System.currentTimeMillis(); + } + this.lastSuccessVersion = this.version; + this.lastSuccessVersionHash = this.versionHash; + } else { + if (lastSuccessVersion >= this.lastSuccessVersion) { + this.lastSuccessVersion = lastSuccessVersion; + this.lastSuccessVersionHash = lastSuccessVersionHash; + } + if (lastFailedVersion >= this.lastSuccessVersion) { + this.lastSuccessVersion = this.version; + this.lastSuccessVersionHash = this.versionHash; + } + } + + // if last failed version <= version, then last failed version is invalid + // version xxxx | last failed version xxxx | last success version xxx + // if current version == last failed version and version hash != last failed version hash, it means the version report from be is not valid + if (this.version > this.lastFailedVersion + || this.version == this.lastFailedVersion && this.versionHash == this.lastFailedVersionHash + || this.version == this.lastFailedVersion && this.lastFailedVersionHash == 0 && this.versionHash != 0) { + this.lastFailedVersion = -1; + this.lastFailedVersionHash = 0; + this.lastFailedTimestamp = -1; + if (this.version < this.lastSuccessVersion) { + this.version = 
this.lastSuccessVersion; + this.versionHash = this.lastSuccessVersionHash; + } + } + // TODO yiguolei use info log here, there maybe a lot of logs, change it to debug when concurrent load is stable + LOG.debug("update {}", this.toString()); + } + + public synchronized void updateLastFailedVersion(long lastFailedVersion, long lastFailedVersionHash) { + updateReplicaInfo(this.version, this.versionHash, lastFailedVersion, lastFailedVersionHash, + this.lastSuccessVersion, this.lastSuccessVersionHash, dataSize, rowCount); } public boolean checkVersionCatchUp(long committedVersion, long committedVersionHash) { @@ -128,15 +254,15 @@ public class Replica implements Writable { public ReplicaState getState() { return this.state; } - + public long getVersionCount() { - return versionCount.get(); - } + return versionCount.get(); + } public void setVersionCount(long versionCount) { - this.versionCount.set(versionCount); - } - + this.versionCount.set(versionCount); + } + @Override public String toString() { StringBuffer strBuffer = new StringBuffer("replicaId="); @@ -151,6 +277,16 @@ public class Replica implements Writable { strBuffer.append(dataSize); strBuffer.append(", rowCount="); strBuffer.append(rowCount); + strBuffer.append(", lastFailedVersion="); + strBuffer.append(lastFailedVersion); + strBuffer.append(", lastFailedVersionHash="); + strBuffer.append(lastFailedVersionHash); + strBuffer.append(", lastSuccessVersion="); + strBuffer.append(lastSuccessVersion); + strBuffer.append(", lastSuccessVersionHash="); + strBuffer.append(lastSuccessVersionHash); + strBuffer.append(", lastFailedTimestamp="); + strBuffer.append(lastFailedTimestamp); return strBuffer.toString(); } @@ -162,6 +298,12 @@ public class Replica implements Writable { out.writeLong(dataSize); out.writeLong(rowCount); Text.writeString(out, state.name()); + + out.writeLong(lastFailedVersion); + out.writeLong(lastFailedVersionHash); + out.writeLong(lastSuccessVersion); + out.writeLong(lastSuccessVersionHash); + } public void readFields(DataInput in) throws IOException { @@ -172,6 +314,12 @@ public class Replica implements Writable { dataSize = in.readLong(); rowCount = in.readLong(); state = ReplicaState.valueOf(Text.readString(in)); + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_45) { + lastFailedVersion = in.readLong(); + lastFailedVersionHash = in.readLong(); + lastSuccessVersion = in.readLong(); + lastSuccessVersionHash = in.readLong(); + } } public static Replica read(DataInput in) throws IOException { @@ -195,7 +343,11 @@ public class Replica implements Writable { && (versionHash == replica.versionHash) && (dataSize == replica.dataSize) && (rowCount == replica.rowCount) - && (state.equals(replica.state)); + && (state.equals(replica.state)) + && (lastFailedVersion == replica.lastFailedVersion) + && (lastFailedVersionHash == replica.lastFailedVersionHash) + && (lastSuccessVersion == replica.lastSuccessVersion) + && (lastSuccessVersionHash == replica.lastSuccessVersionHash); } private static class VersionComparator implements Comparator { @@ -214,4 +366,3 @@ public class Replica implements Writable { } } } - diff --git a/fe/src/main/java/com/baidu/palo/catalog/ScalarType.java b/fe/src/main/java/com/baidu/palo/catalog/ScalarType.java index 1265bd0bc6..09ab5bc655 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/ScalarType.java +++ b/fe/src/main/java/com/baidu/palo/catalog/ScalarType.java @@ -150,8 +150,8 @@ public class ScalarType extends Type { } public static ScalarType createDecimalType(int precision, 
int scale) { - Preconditions.checkState(precision >= 0); // Enforced by parser - Preconditions.checkState(scale >= 0); // Enforced by parser. + // Preconditions.checkState(precision >= 0); // Enforced by parser + // Preconditions.checkState(scale >= 0); // Enforced by parser. ScalarType type = new ScalarType(PrimitiveType.DECIMAL); type.precision = precision; type.scale = scale; diff --git a/fe/src/main/java/com/baidu/palo/catalog/Table.java b/fe/src/main/java/com/baidu/palo/catalog/Table.java index 06a2b83e06..825929355f 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/Table.java +++ b/fe/src/main/java/com/baidu/palo/catalog/Table.java @@ -21,7 +21,7 @@ package com.baidu.palo.catalog; import com.baidu.palo.analysis.CreateTableStmt; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.io.Writable; import com.baidu.palo.thrift.TTableDescriptor; @@ -49,7 +49,8 @@ public class Table extends MetaObject implements Writable { INLINE_VIEW, VIEW, KUDU, - BROKER + BROKER, + ELASTICSEARCH } protected long id; @@ -148,6 +149,8 @@ public class Table extends MetaObject implements Writable { table = new KuduTable(); } else if (type == TableType.BROKER) { table = new BrokerTable(); + } else if (type == TableType.ELASTICSEARCH) { + table = new EsTable(); } else { throw new IOException("Unknown table type: " + type.name()); } @@ -158,7 +161,7 @@ public class Table extends MetaObject implements Writable { View view = (View) table; try { view.init(); - } catch (InternalException e) { + } catch (UserException e) { throw new IOException(e.getMessage()); } } @@ -254,4 +257,9 @@ public class Table extends MetaObject implements Writable { public int getSignature(int signatureVersion) { throw new NotImplementedException(); } + + @Override + public String toString() { + return "Table [id=" + id + ", name=" + name + ", type=" + type + "]"; + } } diff --git a/fe/src/main/java/com/baidu/palo/catalog/Tablet.java b/fe/src/main/java/com/baidu/palo/catalog/Tablet.java index 37080eacfb..49e16757db 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/Tablet.java +++ b/fe/src/main/java/com/baidu/palo/catalog/Tablet.java @@ -29,7 +29,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import java.util.Set; diff --git a/fe/src/main/java/com/baidu/palo/catalog/TabletInvertedIndex.java b/fe/src/main/java/com/baidu/palo/catalog/TabletInvertedIndex.java index dcf0f1f18b..67c3c44558 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/TabletInvertedIndex.java +++ b/fe/src/main/java/com/baidu/palo/catalog/TabletInvertedIndex.java @@ -20,9 +20,16 @@ package com.baidu.palo.catalog; +import com.baidu.palo.task.RecoverTabletTask; +import com.baidu.palo.thrift.TPartitionVersionInfo; import com.baidu.palo.thrift.TStorageMedium; import com.baidu.palo.thrift.TTablet; import com.baidu.palo.thrift.TTabletInfo; +import com.baidu.palo.transaction.GlobalTransactionMgr; +import com.baidu.palo.transaction.PartitionCommitInfo; +import com.baidu.palo.transaction.TableCommitInfo; +import com.baidu.palo.transaction.TransactionState; +import com.baidu.palo.transaction.TransactionStatus; import com.google.common.base.Preconditions; import com.google.common.collect.HashBasedTable; @@ -54,6 +61,9 @@ public class TabletInvertedIndex { // tablet id -> tablet meta private Map tabletMetaMap = Maps.newHashMap(); + // replica id -> 
tablet id + private Map replicaToTabletMap = Maps.newHashMap(); + /* * we use this to save memory. * we do not need create TabletMeta instance for each tablet, @@ -72,7 +82,6 @@ public class TabletInvertedIndex { private Table backingReplicaMetaTable = HashBasedTable.create(); public TabletInvertedIndex() { - } private final void readLock() { @@ -97,7 +106,10 @@ public class TabletInvertedIndex { ListMultimap tabletDeleteFromMeta, Set foundTabletsWithValidSchema, Map foundTabletsWithInvalidSchema, - ListMultimap tabletMigrationMap) { + ListMultimap tabletMigrationMap, + ListMultimap transactionsToPublish, + ListMultimap transactionsToClear, + List tabletNeedRecover) { long start = 0L; readLock(); @@ -124,6 +136,14 @@ public class TabletInvertedIndex { // need sync tabletSyncMap.put(tabletMeta.getDbId(), tabletId); } + + if (checkNeedRecover(replica, backendTabletInfo.getVersion(), + backendTabletInfo.getVersion_hash())) { + RecoverTabletTask recoverTabletTask = new RecoverTabletTask(backendId, + tabletId, replica.getVersion(), replica.getVersionHash(), + backendTabletInfo.getSchema_hash()); + tabletNeedRecover.add(recoverTabletTask); + } // check if need migration long partitionId = tabletMeta.getPartitionId(); @@ -133,7 +153,26 @@ public class TabletInvertedIndex { tabletMigrationMap.put(storageMedium, tabletId); } } - + // check if should clear transactions + if (backendTabletInfo.isSetTransaction_ids()) { + List transactionIds = backendTabletInfo.getTransaction_ids(); + GlobalTransactionMgr transactionMgr = Catalog.getCurrentGlobalTransactionMgr(); + for (Long transactionId : transactionIds) { + TransactionState transactionState = transactionMgr.getTransactionState(transactionId); + if (transactionState == null || transactionState.getTransactionStatus() == TransactionStatus.ABORTED) { + transactionsToClear.put(transactionId, tabletMeta.getPartitionId()); + LOG.debug("transaction id [{}] is not valid any more, " + + "clear it from backend [{}]", transactionId, backendId); + } else if (transactionState.getTransactionStatus() == TransactionStatus.VISIBLE) { + TableCommitInfo tableCommitInfo = transactionState.getTableCommitInfo(tabletMeta.getTableId()); + PartitionCommitInfo partitionCommitInfo = tableCommitInfo.getPartitionCommitInfo(partitionId); + TPartitionVersionInfo versionInfo = new TPartitionVersionInfo(tabletMeta.getPartitionId(), + partitionCommitInfo.getVersion(), + partitionCommitInfo.getVersionHash()); + transactionsToPublish.put(transactionId, versionInfo); + } + } + } // end for txn id // update replicas's version count // no need to write log, and no need to get db lock. if (backendTabletInfo.isSetVersion_count()) { @@ -142,9 +181,9 @@ public class TabletInvertedIndex { } else { // tablet with invalid schemahash foundTabletsWithInvalidSchema.put(tabletId, backendTabletInfo); - } - } // end for be tablet info - } else { + } // end for be tablet info + } + } else { // 2. (meta - be) // may need delete from meta LOG.debug("backend[{}] does not report tablet[{}-{}]", backendId, tabletId, tabletMeta); @@ -158,9 +197,10 @@ public class TabletInvertedIndex { long end = System.currentTimeMillis(); LOG.info("finished to do tablet diff with backend[{}]. sync: {}. metaDel: {}. foundValid: {}. foundInvalid: {}." - + " migration: {}." + " cost: {} ms", backendId, tabletSyncMap.size(), + + " migration: {}. found invalid transactions {}. 
found republish transactions {} " + + " cost: {} ms", backendId, tabletSyncMap.size(), tabletDeleteFromMeta.size(), foundTabletsWithValidSchema.size(), foundTabletsWithInvalidSchema.size(), - tabletMigrationMap.size(), (end - start)); + tabletMigrationMap.size(), transactionsToClear.size(), transactionsToPublish.size(), (end - start)); } public long getDbId(long tabletId) { @@ -186,6 +226,30 @@ public class TabletInvertedIndex { readUnlock(); } } + + public TabletMeta getTabletMetaByReplica(long replicaId) { + readLock(); + try { + Long tabletId = replicaToTabletMap.get(replicaId); + if (tabletId == null) { + return null; + } + TabletMeta tabletMeta = tabletMetaMap.get(tabletId); + return tabletMeta; + } finally { + readUnlock(); + } + } + + public Long getTabletIdByReplica(long replicaId) { + readLock(); + try { + Long tabletId = replicaToTabletMap.get(replicaId); + return tabletId; + } finally { + readUnlock(); + } + } public long getPartitionId(long tabletId) { readLock(); @@ -232,6 +296,12 @@ public class TabletInvertedIndex { readUnlock(); } } + + public Set getTabletBackends(long tabletId) { + + Map backendIdToReplica = replicaMetaTable.row(tabletId); + return backendIdToReplica.keySet(); + } private boolean checkSync(Replica replicaMeta, long backendVersion, long backendVersionHash) { long metaVersion = replicaMeta.getVersion(); @@ -241,6 +311,22 @@ public class TabletInvertedIndex { } return false; } + + /** + * if be's report version < fe's meta version, it means there exists one or more holes in be + * the be needs recovery + * @param replicaMeta + * @param backendVersion + * @param backendVersionHash + * @return + */ + private boolean checkNeedRecover(Replica replicaMeta, long backendVersion, long backendVersionHash) { + long metaVersion = replicaMeta.getVersion(); + if (metaVersion > backendVersion) { + return true; + } + return false; + } public void addTablet(long tabletId, TabletMeta tabletMeta) { // always add tablet before adding replicas @@ -269,6 +355,10 @@ public class TabletInvertedIndex { try { Map replicas = replicaMetaTable.rowMap().remove(tabletId); if (replicas != null) { + for (Replica replica : replicas.values()) { + replicaToTabletMap.remove(replica.getId()); + } + for (long backendId : replicas.keySet()) { backingReplicaMetaTable.remove(backendId, tabletId); } @@ -290,6 +380,7 @@ public class TabletInvertedIndex { try { Preconditions.checkState(tabletMetaMap.containsKey(tabletId)); replicaMetaTable.put(tabletId, replica.getBackendId(), replica); + replicaToTabletMap.put(replica.getId(), tabletId); backingReplicaMetaTable.put(replica.getBackendId(), tabletId, replica); } finally { writeUnlock(); @@ -305,6 +396,8 @@ public class TabletInvertedIndex { Preconditions.checkState(tabletMetaMap.containsKey(tabletId)); // Preconditions.checkState(replicaMetaTable.containsRow(tabletId)); if (replicaMetaTable.containsRow(tabletId)) { + Replica replica = replicaMetaTable.remove(tabletId, backendId); + replicaToTabletMap.remove(replica.getId()); replicaMetaTable.remove(tabletId, backendId); backingReplicaMetaTable.remove(backendId, tabletId); LOG.debug("delete tablet[{}] in backend[{}]", tabletId, backendId); @@ -317,6 +410,16 @@ public class TabletInvertedIndex { writeUnlock(); } } + + public Replica getReplica(long tabletId, long backendId) { + readLock(); + try { + Preconditions.checkState(tabletMetaMap.containsKey(tabletId), tabletId); + return replicaMetaTable.get(tabletId, backendId); + } finally { + readUnlock(); + } + } public List getReplicasByTabletId(long 
tabletId) { readLock(); @@ -403,6 +506,7 @@ public class TabletInvertedIndex { writeLock(); try { tabletMetaMap.clear(); + replicaToTabletMap.clear(); tabletMetaTable.clear(); replicaMetaTable.clear(); backingReplicaMetaTable.clear(); @@ -410,5 +514,9 @@ public class TabletInvertedIndex { writeUnlock(); } } + + public Map getReplicaToTabletMap() { + return replicaToTabletMap; + } } diff --git a/fe/src/main/java/com/baidu/palo/catalog/TabletMeta.java b/fe/src/main/java/com/baidu/palo/catalog/TabletMeta.java index 8b1d07a6fc..04f42ee041 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/TabletMeta.java +++ b/fe/src/main/java/com/baidu/palo/catalog/TabletMeta.java @@ -115,17 +115,6 @@ public class TabletMeta { } } - // XXX - public void forceSetSchema(int schemaHash) { - lock.writeLock().lock(); - try { - this.oldSchemaHash = schemaHash; - this.newSchemaHash = -1; - } finally { - lock.writeLock().unlock(); - } - } - @Override public String toString() { lock.readLock().lock(); diff --git a/fe/src/main/java/com/baidu/palo/catalog/View.java b/fe/src/main/java/com/baidu/palo/catalog/View.java index 880b19de1b..4b9a22c13e 100644 --- a/fe/src/main/java/com/baidu/palo/catalog/View.java +++ b/fe/src/main/java/com/baidu/palo/catalog/View.java @@ -24,7 +24,7 @@ import com.baidu.palo.analysis.ParseNode; import com.baidu.palo.analysis.QueryStmt; import com.baidu.palo.analysis.SqlParser; import com.baidu.palo.analysis.SqlScanner; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.io.Text; import com.google.common.collect.Lists; @@ -123,7 +123,7 @@ public class View extends Table { * Throws a TableLoadingException if there was any error parsing the * the SQL or if the view definition did not parse into a QueryStmt. */ - public void init() throws InternalException { + public void init() throws UserException { // Parse the expanded view definition SQL-string into a QueryStmt and // populate a view definition. SqlScanner input = new SqlScanner(new StringReader(inlineViewDef)); @@ -137,12 +137,12 @@ public class View extends Table { LOG.info("msg is {}", inlineViewDef); // Do not pass e as the exception cause because it might reveal the existence // of tables that the user triggering this load may not have privileges on. - throw new InternalException( + throw new UserException( String.format("Failed to parse view-definition statement of view: %s", name)); } // Make sure the view definition parses to a query statement. 
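        // A definition that parses successfully but is not a QueryStmt (for example a DDL
        // statement stored as the view text) is rejected below with a UserException.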
if (!(node instanceof QueryStmt)) { - throw new InternalException(String.format("View definition of %s " + + throw new UserException(String.format("View definition of %s " + "is not a query statement", name)); } queryStmt = (QueryStmt) node; diff --git a/fe/src/main/java/com/baidu/palo/clone/Clone.java b/fe/src/main/java/com/baidu/palo/clone/Clone.java index ae50cf8506..bf9741ceff 100644 --- a/fe/src/main/java/com/baidu/palo/clone/Clone.java +++ b/fe/src/main/java/com/baidu/palo/clone/Clone.java @@ -252,6 +252,16 @@ public class Clone { jobInfo.add(job.getState().name()); jobInfo.add(job.getType().name()); jobInfo.add(job.getPriority().name()); + CloneTask cloneTask = job.getCloneTask(); + if (cloneTask != null) { + jobInfo.add(cloneTask.getCommittedVersion()); + jobInfo.add(cloneTask.getCommittedVersionHash()); + jobInfo.add(cloneTask.getFailedTimes()); + } else { + jobInfo.add(-1L); + jobInfo.add(-1L); + jobInfo.add(0); + } jobInfo.add(TimeUtils.longToTimeString(job.getCreateTimeMs())); jobInfo.add(TimeUtils.longToTimeString(job.getCloneStartTimeMs())); jobInfo.add(TimeUtils.longToTimeString(job.getCloneFinishTimeMs())); @@ -283,6 +293,7 @@ public class Clone { if (AgentTaskQueue.addTask(task)) { job.setState(JobState.RUNNING); job.setCloneStartTimeMs(System.currentTimeMillis()); + job.setCloneTask(task); return true; } else { return false; @@ -446,12 +457,12 @@ public class Clone { + ", backend id: " + backendId); } - // Here we do not check is clone version is equal to the commited version. - // Because in case of high frequency loading, clone version always lags behind the commited version, + // Here we do not check is clone version is equal to the committed version. + // Because in case of high frequency loading, clone version always lags behind the committed version, // so the clone job will never succeed, which cause accumulation of quorum finished load jobs. // But we will check if the cloned replica's version is larger than or equal to the task's version. - // We should dicard the cloned replica with stale version. + // We should discard the cloned replica with stale version. if (tabletInfo.getVersion() < taskVersion || (tabletInfo.getVersion() == taskVersion && tabletInfo.getVersion_hash() != taskVersionHash)) { throw new MetaNotFoundException(String.format("cloned replica's version info is stale. %ld-%ld," @@ -471,18 +482,34 @@ public class Clone { LOG.warn("clone job state is not running. 
job: {}", job); return; } + + // if clone finished and report version == last failed version then last failed version hash should equal + if (replica.getLastFailedVersion() == version && replica.getLastFailedVersionHash() != versionHash) { + throw new MetaNotFoundException(String.format("clone finshed and report version %ld, " + + "version hash %ld, but the replica's current failed version " + + "is %ld versionhash is %ld", + version, versionHash, replica.getLastFailedVersion(), + replica.getLastFailedVersionHash())); + } replica.setState(ReplicaState.NORMAL); replica.updateInfo(version, versionHash, dataSize, rowCount); job.setCloneFinishTimeMs(System.currentTimeMillis()); job.setState(JobState.FINISHED); - LOG.info("finish clone job: {}", job); - - // Write edit log + // yiguolei: + // there are two types of clone job: catch up clone or new replica add to tablet + // for new replica add to tablet, set its last failed version to max commit version for the tablet + // and the new replica will try to clone, if clone finished and the version < last failed version + // the clone type is converted to catchup clone ReplicaPersistInfo info = ReplicaPersistInfo.createForClone(dbId, tableId, partitionId, indexId, tabletId, backendId, replica.getId(), - version, versionHash, dataSize, rowCount); + version, versionHash, dataSize, rowCount, + replica.getLastFailedVersion(), + replica.getLastFailedVersionHash(), + replica.getLastSuccessVersion(), + replica.getLastSuccessVersionHash()); + LOG.info("finish clone job: {}, add a replica {}", job, info); Catalog.getInstance().getEditLog().logAddReplica(info); } finally { writeUnlock(); @@ -560,12 +587,23 @@ public class Clone { Replica replica = tablet.getReplicaByBackendId(backendId); if (replica == null) { - throw new MetaNotFoundException("replica does not exist in be: " + backendId - + " . tablet id: " + tabletId); + LOG.info("could not find replica on backend {} for tablet id {}, " + + "maybe clone not find src backends, ignore it", + backendId, tabletId); + return; } + + // 1. if this is a normal clone job, then should remove it from meta, not write log, because the clone replica + // not exist on follower and observer + // 2. if this is a catch up clone job, should not delete it from meta because the catch up replica is a normal replica + // before clone and we will lost data if delete the catch up clone replica if (replica.getState() == ReplicaState.CLONE) { - if (tablet.deleteReplicaByBackendId(backendId)) { - LOG.info("remove clone replica. tablet id: {}, backend id: {}", tabletId, backendId); + if (job.getType() == JobType.CATCHUP) { + replica.setState(ReplicaState.NORMAL); + } else { + if (tablet.deleteReplicaByBackendId(backendId)) { + LOG.info("remove clone replica. tablet id: {}, backend id: {}", tabletId, backendId); + } } } } catch (MetaNotFoundException e) { diff --git a/fe/src/main/java/com/baidu/palo/clone/CloneChecker.java b/fe/src/main/java/com/baidu/palo/clone/CloneChecker.java index 6fe14a3371..c581b2f77e 100644 --- a/fe/src/main/java/com/baidu/palo/clone/CloneChecker.java +++ b/fe/src/main/java/com/baidu/palo/clone/CloneChecker.java @@ -209,9 +209,12 @@ public class CloneChecker extends Daemon { Clone clone = Catalog.getInstance().getCloneInstance(); LOG.info("start to check clone. job num: {}", clone.getJobNum()); + // yiguolei: check whether the replica's version is less than last failed version + checkFailedReplicas(); + // 1. check tablet for supplement, migration and deletion checkTablets(); - + // 2. 
check timeout clone.checkTimeout(); @@ -225,6 +228,139 @@ public class CloneChecker extends Daemon { // 4. remove cancelled and finished jobs clone.removeCloneJobs(); } + + // check if a replica is failed during loading, add it as a clone job to catch up + private void checkFailedReplicas() { + Catalog catalog = Catalog.getInstance(); + SystemInfoService clusterInfoService = Catalog.getCurrentSystemInfo(); + + // 1. get all tablets which are in Clone process. + // NOTICE: this is only a copy of tablet under Clone process. + // It will change any time during this method. + // So DO NOT severely depend on it to make any decision! + Set cloneTabletIds = catalog.getCloneInstance().getCloneTabletIds(); + + // check tablet database by database. + List dbNames = catalog.getDbNames(); + for (String dbName : dbNames) { + Database db = catalog.getDb(dbName); + if (db == null) { + LOG.debug("db does not exist. name: {}", dbName); + continue; + } + + final String clusterName = db.getClusterName(); + + if (Strings.isNullOrEmpty(clusterName)) { + LOG.debug("database {} has no cluster name", dbName); + continue; + } + + long dbId = db.getId(); + Set tableNames = db.getTableNamesWithLock(); + // check table by table + for (String tableName : tableNames) { + long tableId = -1L; + db.readLock(); + try { + Table table = db.getTable(tableName); + if (table == null || table.getType() != TableType.OLAP) { + LOG.debug("table {} is null or is not olap table, skip repair process", table); + continue; + } + + OlapTable olapTable = (OlapTable) table; + tableId = table.getId(); + for (Partition partition : olapTable.getPartitions()) { + long partitionId = partition.getId(); + for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { + // only check NORMAL index + if (materializedIndex.getState() != IndexState.NORMAL) { + LOG.debug("index {} is not normal state, so that skip repair" + + " all tablets belongs this index", materializedIndex); + continue; + } + for (Tablet tablet : materializedIndex.getTablets()) { + long tabletId = tablet.getId(); + if (cloneTabletIds.contains(tabletId)) { + LOG.debug("tablet {} is under clone, so that skip repair it", tablet); + continue; + } + Replica replicaToCatchup = null; + for (Replica replica : tablet.getReplicas()) { + long backendId = replica.getBackendId(); + Backend backend = clusterInfoService.getBackend(backendId); + if (backend == null) { + continue; + } + if (backend.isAlive() + && replica.getState() != ReplicaState.CLONE + && replica.getLastFailedVersion() > 0) { + + long elapsedAfterFailed = System.currentTimeMillis() - replica.getLastFailedTimestamp(); + // if not check it, the replica may be failed at version 1,3,4,6,8, then we will run 5 clone jobs + // wait some seconds then the replica maybe stable, and we could run single clone job to repair the + // replica + if (elapsedAfterFailed < Config.replica_delay_recovery_second * 1000L) { + LOG.info("{} is down at {}, less than minimal delay second {}, not clone", + replica, replica.getLastFailedTimestamp(), Config.replica_delay_recovery_second); + continue; + } + + // check if there exists a replica in this tablet which have larger version + // if not any replica in this tablet has larger version then not clone, ignore it + boolean hasCloneSrcReplica = false; + for (Replica srcReplica : tablet.getReplicas()) { + // the src clone replica has to be normal + if (srcReplica.getLastFailedVersion() > 0) { + continue; + } + // the src clone replica's version >= current replica's version + if 
(srcReplica.getVersion() > replica.getVersion() + || srcReplica.getVersion() == replica.getVersion() + && srcReplica.getVersionHash() != replica.getVersionHash()) { + hasCloneSrcReplica = true; + break; + } + } + if (!hasCloneSrcReplica) { + LOG.info("{} could not find clone src replica meets the " + + "condition, ignore this replica", replica); + continue; + } + if (replicaToCatchup == null) { + replicaToCatchup = replica; + } else if (replica.getLastSuccessVersion() > replica.getLastFailedVersion()) { + // because there is only one catchup clone task for one tablet, so that we should + // select one replica to catch up according to this priority + replicaToCatchup = replica; + // its perfect to select this replica, no need to check others + break; + } else if (replicaToCatchup.getLastFailedVersion() > replica.getLastFailedVersion()) { + // its better to select a low last failed version replica + replicaToCatchup = replica; + } + } + } + if (replicaToCatchup != null) { + LOG.info("select replica [{}] to send clone task", replicaToCatchup); + Clone clone = Catalog.getInstance().getCloneInstance(); + clone.addCloneJob(dbId, tableId, partitionId, materializedIndex.getId(), + tabletId, replicaToCatchup.getBackendId(), + JobType.CATCHUP, JobPriority.HIGH, + Config.clone_job_timeout_second * 1000L); + } + } + + } + } + } finally { + db.readUnlock(); + } + } // end for tables + } // end for dbs + + } private void checkTablets() { Catalog catalog = Catalog.getInstance(); @@ -241,14 +377,14 @@ public class CloneChecker extends Daemon { for (String dbName : dbNames) { Database db = catalog.getDb(dbName); if (db == null) { - LOG.warn("db does not exist. name: {}", dbName); + LOG.debug("db does not exist. name: {}", dbName); continue; } final String clusterName = db.getClusterName(); if (Strings.isNullOrEmpty(clusterName)) { - LOG.error("database {} has no cluster name", dbName); + LOG.debug("database {} has no cluster name", dbName); continue; } @@ -293,6 +429,8 @@ public class CloneChecker extends Daemon { for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { // only check NORMAL index if (materializedIndex.getState() != IndexState.NORMAL) { + LOG.debug("partition [{}] index [{}] state is {}, not normal, skip check tablets", + partitionId, materializedIndex.getId(), materializedIndex.getState()); continue; } @@ -329,6 +467,8 @@ public class CloneChecker extends Daemon { List replicas = tablet.getReplicas(); short onlineReplicaNum = 0; short onlineReplicaNumInCluster = 0; + short healthyReplicaNum = 0; + short healthyReplicaNumInCluster = 0; // choose the largest replica's size as this tablet's size long tabletSizeB = 0L; @@ -348,12 +488,20 @@ public class CloneChecker extends Daemon { } if (backend.isAlive() && replica.getState() != ReplicaState.CLONE) { + // has to check replica's last failed version, because a tablet may contains + // A,B,C,D 4 replica, A,B is normal, C,D is abnormal + // but replica num = 3, then it may drop B, the cluster will comes into fatal error state ++onlineReplicaNum; - // only if if (backendInfosInCluster.containsKey(backendId)) { ++onlineReplicaNumInCluster; } - } + if (replica.getLastFailedVersion() < 0) { + ++ healthyReplicaNum; + if (backendInfosInCluster.containsKey(backendId)) { + ++ healthyReplicaNumInCluster; + } + } + } } TabletInfo tabletInfo = new TabletInfo(dbId, tableId, partitionId, indexId, tabletId, @@ -377,13 +525,19 @@ public class CloneChecker extends Daemon { } if (replicas.size() > replicationNum && onlineReplicaNum >= 
replicationNum) { + LOG.debug("partition {} index {} tablet {} online replica num is {} > replica num {}, " + + "should delete on replica", + partitionId, index.getId(), tableId, onlineReplicaNum, replicationNum); // in Multi-Tenancy, we will have priority to // guarantee replica in cluster if (onlineReplicaNumInCluster < replicationNum && !cloneTabletIds.contains(tabletId)) { cloneTabletMap.put(tabletId, tabletInfo); } else { - // need delete tablet - deleteTabletSet.add(tabletInfo); + if (healthyReplicaNum >= replicationNum && healthyReplicaNumInCluster >= replicationNum) { + // need delete tablet + LOG.debug("add tablet {} to delete list", tabletInfo); + deleteTabletSet.add(tabletInfo); + } } } else if (onlineReplicaNumInCluster < replicationNum && !cloneTabletIds.contains(tabletId)) { @@ -506,8 +660,8 @@ public class CloneChecker extends Daemon { double avgUsedRatio = (double) (totalCapacityB - availableCapacityB) / totalCapacityB; double lowRatioThreshold = avgUsedRatio * (1 - Config.clone_capacity_balance_threshold); double highRatioThreshold = avgUsedRatio * (1 + Config.clone_capacity_balance_threshold); - LOG.debug("capacity ratio. average used ratio: {}, low ratio threshold: {}, high ratio threshold: {}", - avgUsedRatio, lowRatioThreshold, highRatioThreshold); + // LOG.debug("capacity ratio. average used ratio: {}, low ratio threshold: {}, high ratio threshold: {}", + // avgUsedRatio, lowRatioThreshold, highRatioThreshold); // CapacityLevel -> ids of BE in same host Map>> capacityLevelToHostBackendIds = Maps.newHashMap(); @@ -550,7 +704,7 @@ public class CloneChecker extends Daemon { } } - LOG.info("capacity level map: {}", capacityLevelToHostBackendIds); + // LOG.info("capacity level map: {}", capacityLevelToHostBackendIds); return capacityLevelToHostBackendIds; } @@ -619,7 +773,7 @@ public class CloneChecker extends Daemon { } } - LOG.debug("backend distribution infos. level map: {}", distributionLevelToBackendIds); + // LOG.debug("backend distribution infos. level map: {}", distributionLevelToBackendIds); return distributionLevelToBackendIds; } @@ -681,9 +835,9 @@ public class CloneChecker extends Daemon { // candidate backendIds: // low distribution and low capacity backends Set> candidateBackendIdsByDistribution = distributionLevelToBackendIds.get(CapacityLevel.LOW); - LOG.debug("candidate backends by distribution: {}", candidateBackendIdsByDistribution); + // LOG.debug("candidate backends by distribution: {}", candidateBackendIdsByDistribution); Set> candidateBackendIdsByCapacity = capacityLevelToBackendIds.get(CapacityLevel.LOW); - LOG.debug("candidate backends by capacity: {}", candidateBackendIdsByCapacity); + // LOG.debug("candidate backends by capacity: {}", candidateBackendIdsByCapacity); // select dest backendId from candidates // 2. check canCloneByCapacity && canCloneByDistribution from @@ -758,8 +912,8 @@ public class CloneChecker extends Daemon { } } - LOG.debug("select backend for tablet: {}. type: {}, priority: {}, dest backend id: {}, step: {}", tabletInfo, - jobType.name(), priority.name(), candidateBackendId, step); + // LOG.debug("select backend for tablet: {}. 
type: {}, priority: {}, dest backend id: {}, step: {}", tabletInfo, + // jobType.name(), priority.name(), candidateBackendId, step); // decrease clone info if (candidateBackendId != -1) { @@ -812,17 +966,27 @@ public class CloneChecker extends Daemon { short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partition.getId()); int realReplicaNum = 0; for (Replica replica : replicas) { - if (replica.getState() != ReplicaState.CLONE) { + // also check if the replica is a health replica or we will drop a health replica + // and the remaining replica is not quorum + if (replica.getState() != ReplicaState.CLONE + && replica.getLastFailedVersion() < 0 + && (replica.getVersion() == partition.getCommittedVersion() + && replica.getVersionHash() == partition.getCommittedVersionHash() + || replica.getVersion() > partition.getCommittedVersionHash())) { ++realReplicaNum; } } - if (realReplicaNum <= replicationNum) { + // if health replica num less than required num, then skip + // if health replica num == required num and == total num, then skip + if (realReplicaNum <= replicationNum + || replicas.size() <= replicationNum) { LOG.info("no redundant replicas in tablet[{}]", tabletId); return; } final Map backendInfos = initBackendInfos(clusterName); // out of cluster and in cluster + // out cluster replica rise to the top Collections.sort(replicas, new Comparator() { public int compare(Replica arg0, Replica arg1) { if (backendInfos.containsKey(arg0.getBackendId())) { @@ -834,7 +998,7 @@ public class CloneChecker extends Daemon { long committedVersion = partition.getCommittedVersion(); long committedVersionHash = partition.getCommittedVersionHash(); - int deleteNum = realReplicaNum - replicationNum; + int deleteNum = replicas.size() - replicationNum; Replica deletedReplica = null; while (deleteNum > 0) { Iterator replicaIterator = replicas.iterator(); @@ -876,10 +1040,33 @@ public class CloneChecker extends Daemon { --deleteNum; deletedReplica = replica; + // actually should write edit log when it is a catchup clone, but we could not distinguish them + // write edit for both + ReplicaPersistInfo info = ReplicaPersistInfo.createForDelete(db.getId(), tableId, partitionId, + indexId, tabletId, backendId); + Catalog.getInstance().getEditLog().logDeleteReplica(info); + LOG.info("delete replica [clone], backendId: {}, tablet info: {}, replica: {}", backendId, tabletInfo, replica); break; } + + // delete unhealthy replica + if (replica.getLastFailedVersion() > 0) { + replicaIterator.remove(); + --deleteNum; + deletedReplica = replica; + + // actually should write edit log when it is a catchup clone, but we could not distinguish them + // write edit for both + ReplicaPersistInfo info = ReplicaPersistInfo.createForDelete(db.getId(), tableId, partitionId, + indexId, tabletId, backendId); + Catalog.getInstance().getEditLog().logDeleteReplica(info); + + LOG.info("delete replica with last failed version > 0, backendId: {}, " + + "tablet info: {}, replica: {}", backendId, tabletInfo, replica); + break; + } // delete low version long replicaVersion = replica.getVersion(); @@ -911,7 +1098,6 @@ public class CloneChecker extends Daemon { deletedReplica.getId(), deletedReplica.getBackendId()); // delete from inverted index Catalog.getCurrentInvertedIndex().deleteReplica(tabletId, deletedReplica.getBackendId()); - continue; } @@ -1022,16 +1208,16 @@ public class CloneChecker extends Daemon { } else { priority = Clone.calculatePriority(onlineReplicaNum, replicationNum); } - LOG.debug("clone priority: {}, tablet: 
{}", priority.name(), tabletInfo); + // LOG.debug("clone priority: {}, tablet: {}", priority.name(), tabletInfo); // select dest backend long cloneReplicaBackendId = selectCloneReplicaBackendId(distributionLevelToBackendIds, capacityLevelToBackendIds, backendInfos, tabletInfo, jobType, priority); if (cloneReplicaBackendId == -1) { - LOG.debug("fail to select clone replica backend. tablet: {}", tabletInfo); + // LOG.debug("fail to select clone replica backend. tablet: {}", tabletInfo); return; } - LOG.debug("select clone replica dest backend id[{}] for tablet[{}]", cloneReplicaBackendId, tabletInfo); + // LOG.debug("select clone replica dest backend id[{}] for tablet[{}]", cloneReplicaBackendId, tabletInfo); // add new clone job Clone clone = Catalog.getInstance().getCloneInstance(); @@ -1123,15 +1309,20 @@ public class CloneChecker extends Daemon { } ReplicaState replicaState = replica.getState(); + // if rollup starts then base replcia has errors, the rollup replcia is dropped, then base replica could run clone job + // if schema change starts, replica state is schema change, will not run clone job Preconditions.checkState(replicaState != ReplicaState.ROLLUP); + // yiguolei: schema change, clone, rollup could not run concurrently on a replica + Preconditions.checkState(replicaState != ReplicaState.SCHEMA_CHANGE); // here we pass NORMAL / CLONE / SCHEMA_CHANGE // ATTN(cmy): if adding other state, update here - - if (replica.getBackendId() == job.getDestBackendId() && replicaState != ReplicaState.CLONE) { - String failMsg = "backend[" + replica.getBackendId() + "] already exists in tablet[" + tabletId - + "]. replica id: " + replica.getId() + ". state: " + replicaState; - clone.cancelCloneJob(job, failMsg); - return; + if (job.getType() != JobType.CATCHUP) { + if (replica.getBackendId() == job.getDestBackendId() && replicaState != ReplicaState.CLONE) { + String failMsg = "backend[" + replica.getBackendId() + "] already exists in tablet[" + tabletId + + "]. replica id: " + replica.getId() + ". 
state: " + replicaState; + clone.cancelCloneJob(job, failMsg); + return; + } } ++onlineReplicaNum; @@ -1139,7 +1330,7 @@ public class CloneChecker extends Daemon { if (clusterBackendInfos.containsKey(backend.getId())) { onlineReplicaNumInCluster++; } - if (replicaState == ReplicaState.CLONE) { + if (replica.getBackendId() == job.getDestBackendId()) { cloneReplica = replica; } } @@ -1161,11 +1352,43 @@ public class CloneChecker extends Daemon { if (backend == null || !backend.isAlive()) { continue; } - + + // this is an abnormal replica, skip it + if (replica.getLastFailedVersion() > 0) { + LOG.debug("replica's last failed version > 0, ignore this replica [{}]", replica); + continue; + } // DO NOT choose replica with stale version or invalid version hash - if (replica.getVersion() > committedVersion || (replica.getVersion() == committedVersion + if (job.getType() != JobType.CATCHUP) { + if (replica.getVersion() > committedVersion || (replica.getVersion() == committedVersion && replica.getVersionHash() == committedVersionHash)) { - srcBackends.add(new TBackend(backend.getHost(), backend.getBePort(), backend.getHttpPort())); + srcBackends.add(new TBackend(backend.getHost(), backend.getBePort(), backend.getHttpPort())); + } else { + LOG.debug("replica [{}] the version not equal to large than commit version {}" + + " or commit version hash {}, ignore this replica", + replica, committedVersion, committedVersionHash); + } + } else { + // deal with this case + // A, B, C 3 replica, A,B verison is 10, C is done, its version is 5 + // A, B is normal during load for version 11 + // but B failed to publish and B is crashed, A is successful + // then C comes up, the partition's committed version is 10, then C try to clone 10, then clone finished + // but last failed version is 11, it is abnormal + // the publish will still fail + if (replica.getVersion() > committedVersion + || replica.getVersion() == committedVersion + && replica.getVersionHash() != committedVersionHash) { + committedVersion = replica.getVersion(); + committedVersionHash = replica.getVersionHash(); + } + // if this is a catchup job, then should exclude the dest backend id from src backends + if (job.getDestBackendId() != backend.getId() + && (replica.getVersion() > cloneReplica.getVersion() + || replica.getVersion() == cloneReplica.getVersion() + && replica.getVersionHash() != cloneReplica.getVersionHash())) { + srcBackends.add(new TBackend(backend.getHost(), backend.getBePort(), backend.getHttpPort())); + } } } @@ -1174,15 +1397,32 @@ public class CloneChecker extends Daemon { clone.cancelCloneJob(job, "no source backends"); return; } - - if (cloneReplica != null) { - tablet.deleteReplica(cloneReplica); + + if (job.getType() != JobType.CATCHUP) { + // yiguolei: in catch up clone, the clone replica is not null + if (cloneReplica != null) { + tablet.deleteReplica(cloneReplica); + ReplicaPersistInfo info = ReplicaPersistInfo.createForDelete(dbId, tableId, partitionId, + indexId, tabletId, cloneReplica.getBackendId()); + Catalog.getInstance().getEditLog().logDeleteReplica(info); + LOG.info("remove clone replica. tablet id: {}, backend id: {}", + tabletId, cloneReplica.getBackendId()); + } + // add clone replica in meta + long replicaId = catalog.getNextId(); + // for a new replica to add to the tablet + // first set its state to clone and set last failed version to the largest version in the partition + // wait the catchup clone task to catch up. 
+ // but send the clone task to partition's commit version, although the clone task maybe success but the replica is abnormal + // and another clone task will send to the replica to clone again + // not find a more sufficient method + cloneReplica = new Replica(replicaId, job.getDestBackendId(), -1, 0, + -1, -1, ReplicaState.CLONE, partition.getCurrentVersion(), + partition.getCurrentVersionHash(), -1, 0); + tablet.addReplica(cloneReplica); } - - // add clone replica in meta - long replicaId = catalog.getNextId(); - cloneReplica = new Replica(replicaId, job.getDestBackendId(), ReplicaState.CLONE); - tablet.addReplica(cloneReplica); + // set the replica's state to clone + cloneReplica.setState(ReplicaState.CLONE); } catch (MetaNotFoundException e) { clone.cancelCloneJob(job, e.getMessage()); return; @@ -1192,6 +1432,7 @@ public class CloneChecker extends Daemon { // add clone task AgentBatchTask batchTask = new AgentBatchTask(); + // very important, it is partition's commit version here CloneTask task = new CloneTask(job.getDestBackendId(), dbId, tableId, partitionId, indexId, tabletId, schemaHash, srcBackends, storageMedium, committedVersion, committedVersionHash); batchTask.addTask(task); diff --git a/fe/src/main/java/com/baidu/palo/clone/CloneJob.java b/fe/src/main/java/com/baidu/palo/clone/CloneJob.java index 30b010108e..80eae36dee 100644 --- a/fe/src/main/java/com/baidu/palo/clone/CloneJob.java +++ b/fe/src/main/java/com/baidu/palo/clone/CloneJob.java @@ -15,6 +15,8 @@ package com.baidu.palo.clone; +import com.baidu.palo.task.CloneTask; + public class CloneJob { public enum JobPriority { HIGH, @@ -31,7 +33,8 @@ public class CloneJob { public enum JobType { SUPPLEMENT, - MIGRATION + MIGRATION, + CATCHUP } private long dbId; @@ -48,6 +51,7 @@ public class CloneJob { private long cloneFinishTimeMs; private long timeoutMs; private String failMsg; + private CloneTask cloneTask; public CloneJob(long dbId, long tableId, long partitionId, long indexId, long tabletId, long destBackendId, JobType type, JobPriority priority, long timeoutMs) { @@ -146,6 +150,14 @@ public class CloneJob { public void setFailMsg(String failMsg) { this.failMsg = failMsg; } + + public void setCloneTask(CloneTask task) { + this.cloneTask = task; + } + + public CloneTask getCloneTask() { + return this.cloneTask; + } @Override public String toString() { diff --git a/fe/src/main/java/com/baidu/palo/clone/ClusterLoadStatistic.java b/fe/src/main/java/com/baidu/palo/clone/ClusterLoadStatistic.java index ab878c388d..dc2a021954 100644 --- a/fe/src/main/java/com/baidu/palo/clone/ClusterLoadStatistic.java +++ b/fe/src/main/java/com/baidu/palo/clone/ClusterLoadStatistic.java @@ -41,6 +41,10 @@ import java.util.Collections; import java.util.List; import java.util.stream.Collectors; +/* + * save all load statistic of backends. + * Statistics will be re-calculated at a fix interval. + */ public class ClusterLoadStatistic { private static final Logger LOG = LogManager.getLogger(ClusterLoadStatistic.class); diff --git a/fe/src/main/java/com/baidu/palo/clone/LoadBalancer.java b/fe/src/main/java/com/baidu/palo/clone/LoadBalancer.java index 6cd75b94f3..b950c55e37 100644 --- a/fe/src/main/java/com/baidu/palo/clone/LoadBalancer.java +++ b/fe/src/main/java/com/baidu/palo/clone/LoadBalancer.java @@ -21,6 +21,10 @@ import com.baidu.palo.system.SystemInfoService; import java.util.List; +/* + * LoadBalancer run at a fix interval. 
+ * Each run will re-calculate the load score of all backends + */ public class LoadBalancer extends Daemon { private ClusterLoadStatistic clusterLoadStatistic; diff --git a/fe/src/main/java/com/baidu/palo/clone/RootPathLoadStatistic.java b/fe/src/main/java/com/baidu/palo/clone/RootPathLoadStatistic.java index c74c804173..d50529cc2a 100644 --- a/fe/src/main/java/com/baidu/palo/clone/RootPathLoadStatistic.java +++ b/fe/src/main/java/com/baidu/palo/clone/RootPathLoadStatistic.java @@ -72,7 +72,7 @@ public class RootPathLoadStatistic implements Comparable } } - if (usedCapacityB + tabletSize / (double) capacityB > Config.storage_high_watermark_usage_percent + if ((usedCapacityB + tabletSize) / (double) capacityB > Config.storage_high_watermark_usage_percent || capacityB - usedCapacityB - tabletSize < Config.storage_min_left_capacity_bytes) { return new BalanceStatus(ErrCode.COMMON_ERROR, toString() + " does not fit tablet with size: " + tabletSize); diff --git a/fe/src/main/java/com/baidu/palo/cluster/Cluster.java b/fe/src/main/java/com/baidu/palo/cluster/Cluster.java index 89d798338b..abe8a86907 100644 --- a/fe/src/main/java/com/baidu/palo/cluster/Cluster.java +++ b/fe/src/main/java/com/baidu/palo/cluster/Cluster.java @@ -15,20 +15,6 @@ package com.baidu.palo.cluster; -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.InfoSchemaDb; import com.baidu.palo.common.io.Text; @@ -37,8 +23,19 @@ import com.baidu.palo.persist.LinkDbInfo; import com.google.common.base.Strings; import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.locks.ReentrantLock; /** * cluster only save db and user's id and name @@ -52,45 +49,44 @@ public class Cluster implements Writable { private Long id; private String name; // backend which cluster own - private Set backendIdSet; + private Set backendIdSet = ConcurrentHashMap.newKeySet(); - private Set userIdSet; - private Set userNameSet; + private ConcurrentHashMap linkDbNames = new ConcurrentHashMap<>(); + private ConcurrentHashMap linkDbIds = new ConcurrentHashMap<>(); - private Map linkDbNames; - private Map linkDbIds; + private Set dbIds = ConcurrentHashMap.newKeySet(); + private Set dbNames = ConcurrentHashMap.newKeySet(); - private Set dbIds; - private Set dbNames; + // lock to perform atomic operations + private ReentrantLock lock = new ReentrantLock(true); - private ReentrantReadWriteLock rwLock; - - public Cluster() { - this.rwLock = new ReentrantReadWriteLock(true); - this.backendIdSet = Sets.newHashSet(); - this.userIdSet = Sets.newHashSet(); - this.userNameSet = Sets.newHashSet(); - this.linkDbNames = Maps.newHashMap(); - this.linkDbIds = Maps.newHashMap(); - this.dbIds = Sets.newHashSet(); - this.dbNames = Sets.newHashSet(); + private 
Cluster() { + // for persist } public Cluster(String name, long id) { this.name = name; this.id = id; - this.rwLock = new ReentrantReadWriteLock(true); - this.backendIdSet = Sets.newHashSet(); - this.userIdSet = Sets.newHashSet(); - this.userNameSet = Sets.newHashSet(); - this.linkDbNames = Maps.newHashMap(); - this.linkDbIds = Maps.newHashMap(); - this.dbIds = Sets.newHashSet(); - this.dbNames = Sets.newHashSet(); + } + + private void lock() { + this.lock.lock(); + } + + private void unlock() { + this.lock.unlock(); + } + + public Long getId() { + return id; + } + + public String getName() { + return name; } public void addLinkDb(BaseParam param) { - writeLock(); + lock(); try { if (Strings.isNullOrEmpty(param.getStringParam(1)) || param.getLongParam(1) <= 0) { return; @@ -99,141 +95,68 @@ public class Cluster implements Writable { linkDbNames.put(param.getStringParam(), info); linkDbIds.put(param.getLongParam(), info); } finally { - writeUnlock(); + unlock(); } - } public void removeLinkDb(BaseParam param) { - writeLock(); + lock(); try { linkDbNames.remove(param.getStringParam()); linkDbIds.remove(param.getLongParam()); } finally { - writeUnlock(); + unlock(); } } - public boolean containLink(String des, String src) { - readLock(); - try { - final LinkDbInfo info = linkDbNames.get(des); - if (info != null && info.getName().equals(src)) { - return true; - } - } finally { - readUnlock(); + public boolean containLink(String dest, String src) { + final LinkDbInfo info = linkDbNames.get(dest); + if (info != null && info.getName().equals(src)) { + return true; } return false; } - public void addUser(String name, long id) { - if (Strings.isNullOrEmpty(name)) { - return; - } - writeLock(); - try { - userNameSet.add(name); - userIdSet.add(id); - } finally { - writeUnlock(); - } - } - public void addDb(String name, long id) { if (Strings.isNullOrEmpty(name)) { return; } - writeLock(); + lock(); try { dbNames.add(name); dbIds.add(id); } finally { - writeUnlock(); + unlock(); } } public List getDbNames() { final ArrayList ret = new ArrayList(); - readLock(); + lock(); try { ret.addAll(dbNames); ret.addAll(linkDbNames.keySet()); } finally { - readUnlock(); + unlock(); } return ret; } public void removeDb(String name, long id) { - writeLock(); + lock(); try { dbNames.remove(name); dbIds.remove(id); } finally { - writeUnlock(); + unlock(); } } - public boolean containUser(String name) { - return userNameSet.contains(name); - } - - public boolean containUser(long id) { - return userIdSet.contains(id); - } - public boolean containDb(String name) { return dbNames.contains(name); } - public boolean containDb(long id) { - return dbIds.contains(id); - } - - public void readLock() { - this.rwLock.readLock().lock(); - } - - public boolean tryReadLock(long timeout, TimeUnit unit) { - try { - return this.rwLock.readLock().tryLock(timeout, unit); - } catch (InterruptedException e) { - LOG.warn("failed to try read lock at cluster[" + id + "]", e); - return false; - } - } - - public void readUnlock() { - this.rwLock.readLock().unlock(); - } - - public void writeLock() { - this.rwLock.writeLock().lock(); - } - - public boolean tryWriteLock(long timeout, TimeUnit unit) { - try { - return this.rwLock.writeLock().tryLock(timeout, unit); - } catch (InterruptedException e) { - LOG.warn("failed to try write lock at cluster[" + id + "]", e); - return false; - } - } - - public void writeUnlock() { - this.rwLock.writeLock().unlock(); - } - - public boolean isWriteLockHeldByCurrentThread() { - return 
this.rwLock.writeLock().isHeldByCurrentThread(); - } - - public int getClusterCapacity() { - return backendIdSet.size(); - } - public List getBackendIdList() { return Lists.newArrayList(backendIdSet); } @@ -242,46 +165,26 @@ public class Cluster implements Writable { if (backendIdList == null) { return; } - writeLock(); - try { - this.backendIdSet = Sets.newHashSet(backendIdList); - } finally { - writeUnlock(); - } + backendIdSet = ConcurrentHashMap.newKeySet(); + backendIdSet.addAll(backendIdList); } public void addBackend(long backendId) { - writeLock(); - try { - this.backendIdSet.add(backendId); - } finally { - writeUnlock(); - } + backendIdSet.add(backendId); } public void addBackends(List backendIds) { - writeLock(); - try { - this.backendIdSet.addAll(backendIds); - } finally { - writeUnlock(); - } + backendIdSet.addAll(backendIds); } - public Long getId() { - return id; + public void removeBackend(long removedBackendId) { + backendIdSet.remove((Long) removedBackendId); } - public void setId(Long clusterId) { - this.id = clusterId; - } - - public String getName() { - return name; - } - - public void setName(String clusterName) { - this.name = clusterName; + public static Cluster read(DataInput in) throws IOException { + Cluster cluster = new Cluster(); + cluster.readFields(in); + return cluster; } @Override @@ -361,23 +264,4 @@ public class Cluster implements Writable { linkDbIds.put(key, value); } } - - public void removeBackend(long removedBackendId) { - writeLock(); - try { - backendIdSet.remove((Long)removedBackendId); - } finally { - writeUnlock(); - } - } - - public void removeBackends(List removedBackendIds) { - writeLock(); - try { - backendIdSet.remove(removedBackendIds); - } finally { - writeUnlock(); - } - } - } diff --git a/fe/src/main/java/com/baidu/palo/common/AnalysisException.java b/fe/src/main/java/com/baidu/palo/common/AnalysisException.java index b8578f967e..9e9d2aa92f 100644 --- a/fe/src/main/java/com/baidu/palo/common/AnalysisException.java +++ b/fe/src/main/java/com/baidu/palo/common/AnalysisException.java @@ -23,7 +23,7 @@ package com.baidu.palo.common; /** * Thrown for errors encountered during analysis of a SQL statement. */ -public class AnalysisException extends Exception { +public class AnalysisException extends UserException { public AnalysisException(String msg, Throwable cause) { super(msg, cause); } diff --git a/fe/src/main/java/com/baidu/palo/common/AuthenticationException.java b/fe/src/main/java/com/baidu/palo/common/AuthenticationException.java old mode 100644 new mode 100755 diff --git a/fe/src/main/java/com/baidu/palo/common/AuthorizationException.java b/fe/src/main/java/com/baidu/palo/common/AuthorizationException.java deleted file mode 100755 index 89e551f698..0000000000 --- a/fe/src/main/java/com/baidu/palo/common/AuthorizationException.java +++ /dev/null @@ -1,34 +0,0 @@ -// Modifications copyright (C) 2017, Baidu.com, Inc. -// Copyright 2017 The Apache Software Foundation - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package com.baidu.palo.common; - -/** - * Thrown for authorization errors encountered when accessing Catalog objects. - */ -public class AuthorizationException extends Exception { - public AuthorizationException(String msg, Throwable cause) { - super(msg, cause); - } - - public AuthorizationException(String msg) { - super(msg); - } -} diff --git a/fe/src/main/java/com/baidu/palo/common/ClientPool.java b/fe/src/main/java/com/baidu/palo/common/ClientPool.java index 29a9dd402f..3ad82589fd 100644 --- a/fe/src/main/java/com/baidu/palo/common/ClientPool.java +++ b/fe/src/main/java/com/baidu/palo/common/ClientPool.java @@ -33,7 +33,7 @@ public class ClientPool { static int heartbeatTimeoutMs = FeConstants.heartbeat_interval_second * 1000; static GenericKeyedObjectPoolConfig backendConfig = new GenericKeyedObjectPoolConfig(); - static int backendTimeoutMs = 300; // 5min + static int backendTimeoutMs = 5000; // 5sec static { heartbeatConfig.setLifo(true); // set Last In First Out strategy diff --git a/fe/src/main/java/com/baidu/palo/common/Config.java b/fe/src/main/java/com/baidu/palo/common/Config.java index e5eedbd399..9d6b90bfe3 100644 --- a/fe/src/main/java/com/baidu/palo/common/Config.java +++ b/fe/src/main/java/com/baidu/palo/common/Config.java @@ -68,11 +68,15 @@ public class Config extends ConfigBase { * Load label cleaner will run every *label_clean_interval_second* to clean the outdated jobs. */ @ConfField public static int label_clean_interval_second = 4 * 3600; // 4 hours + /* + * the transaction will be cleaned after transaction_clean_interval_second seconds if the transaction is visible or aborted + */ + @ConfField public static int transaction_clean_interval_second = 1800; // 0.5 hours /* * If a load job stay in QUORUM_FINISHED state longer than *quorum_load_job_max_second*, * a clone job will be triggered to help finishing this load job. */ - @ConfField public static int quorum_load_job_max_second = 4 * 3600; // 4 hours + @ConfField public static int quorum_load_job_max_second = 24 * 3600; // 1 days // Configurations for meta data durability /* @@ -179,6 +183,7 @@ public class Config extends ConfigBase { @ConfField public static int http_port = 8030; /* * FE thrift server port + * */ @ConfField public static int rpc_port = 9020; /* @@ -210,6 +215,34 @@ public class Config extends ConfigBase { */ @ConfField public static int tablet_create_timeout_second = 1; + /* + * Maximal waiting time for publish version message to backend + */ + @ConfField public static int publish_version_timeout_second = 3; + + /* + * minimal intervals between two publish version action + */ + @ConfField public static int publish_version_interval_millis = 100; + + /* + * maximun concurrent running txn num including prepare, commit txns under a single db + * txn manager will reject coming txns + */ + @ConfField public static int max_running_txn_num_per_db = 100; + + /* + * Maximal wait seconds for straggler node in load + * eg. 
+ * there are 3 replicas A, B, C + * load is already quorum finished(A,B) at t1 and C is not finished + * if (current_time - t1) > 300s, then palo will treat C as a failure node + * will call transaction manager to commit the transaction and tell transaction manager + * that C is failed + * TODO this parameter is the default value for all job and the DBA could specify it for separate job + */ + @ConfField public static int load_straggler_wait_second = 300; + /* * Maximal memory layout length of a row. default is 100 KB. * In BE, the maximal size of a RowBlock is 100MB(Configure as max_unpacked_row_block_size in be.conf). @@ -276,6 +309,11 @@ public class Config extends ConfigBase { * Default mini load timeout */ @ConfField public static int mini_load_default_timeout_second = 3600; // 1 hour + + /* + * Default stream load timeout + */ + @ConfField public static int stream_load_default_timeout_second = 300; // 300s /* * Default hadoop load timeout @@ -319,6 +357,10 @@ public class Config extends ConfigBase { * HIGH priority clone job's delay trigger time. */ @ConfField public static int clone_high_priority_delay_second = 0; + /* + * the minimal delay seconds between a replica is failed and fe try to recovery it using clone. + */ + @ConfField public static int replica_delay_recovery_second = 0; /* * Balance threshold of data size in BE. * The balance algorithm is: @@ -332,7 +374,7 @@ public class Config extends ConfigBase { * Balance threshold of num of replicas in Backends. */ @ConfField public static double clone_distribution_balance_threshold = 0.2; - /* + /* * The high water of disk capacity used percent. * This is used for calculating load score of a backend. */ @@ -528,6 +570,10 @@ public class Config extends ConfigBase { @ConfField public static double storage_high_watermark_usage_percent = 0.85; @ConfField public static double storage_min_left_capacity_bytes = 1000 * 1024 * 1024; // 1G + // update interval of tablet stat + // All frontends will get tablet stat from all backends at each interval + @ConfField public static int tablet_stat_update_interval_second = 300; // 5 min + // May be necessary to modify the following BRPC configurations in high concurrency scenarios. // The number of concurrent requests BRPC can processed @ConfField public static int brpc_number_of_concurrent_requests_processed = 4096; @@ -595,4 +641,10 @@ public class Config extends ConfigBase { * Set to true to disable this kind of load. */ @ConfField public static boolean disable_hadoop_load = false; + + /* + * fe will call es api to get es index shard info every es_state_sync_interval_secs + */ + @ConfField public static long es_state_sync_interval_secs = 10; } + diff --git a/fe/src/main/java/com/baidu/palo/common/DdlException.java b/fe/src/main/java/com/baidu/palo/common/DdlException.java index 072f90485f..f0861b433d 100644 --- a/fe/src/main/java/com/baidu/palo/common/DdlException.java +++ b/fe/src/main/java/com/baidu/palo/common/DdlException.java @@ -20,10 +20,7 @@ package com.baidu.palo.common; -/** - * Created by zhaochun on 14/11/12. 
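// Hypothetical sketch only, not part of this patch: the load_straggler_wait_second
// description above says a straggling replica (C) is treated as failed once the job has
// been quorum finished (A, B) for longer than the configured wait. The class and method
// names below are invented for illustration; only Config.load_straggler_wait_second
// comes from this patch.
import com.baidu.palo.common.Config;

public class StragglerCheckExample {
    /**
     * @param quorumFinishTimeMs time t1 at which the load job became quorum finished
     * @param nowMs              current time
     * @return true if the remaining replica should now be reported as failed when
     *         committing the transaction
     */
    public static boolean stragglerTimedOut(long quorumFinishTimeMs, long nowMs) {
        return nowMs - quorumFinishTimeMs > Config.load_straggler_wait_second * 1000L;
    }
}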
- */ -public class DdlException extends Exception { +public class DdlException extends UserException { public DdlException(String msg) { super(msg); } diff --git a/fe/src/main/java/com/baidu/palo/common/FeConstants.java b/fe/src/main/java/com/baidu/palo/common/FeConstants.java index c11423e34c..f46518c3e3 100644 --- a/fe/src/main/java/com/baidu/palo/common/FeConstants.java +++ b/fe/src/main/java/com/baidu/palo/common/FeConstants.java @@ -38,5 +38,5 @@ public class FeConstants { // general model // Current meta data version. Use this version to write journals and image - public static int meta_version = FeMetaVersion.VERSION_43; + public static int meta_version = FeMetaVersion.VERSION_45; } diff --git a/fe/src/main/java/com/baidu/palo/common/FeMetaVersion.java b/fe/src/main/java/com/baidu/palo/common/FeMetaVersion.java index 43d097909e..b2bedfce61 100644 --- a/fe/src/main/java/com/baidu/palo/common/FeMetaVersion.java +++ b/fe/src/main/java/com/baidu/palo/common/FeMetaVersion.java @@ -82,10 +82,10 @@ public final class FeMetaVersion { // paralle exec param and batch size public static final int VERSION_38 = 38; - + // schema change support row to column public static final int VERSION_39 = 39; - + // persistent brpc port in Backend public static final int VERSION_40 = 40; @@ -97,4 +97,7 @@ public final class FeMetaVersion { // new privilege management public static final int VERSION_43 = 43; + + // streaming load + public static final int VERSION_45 = 45; } diff --git a/fe/src/main/java/com/baidu/palo/common/InternalException.java b/fe/src/main/java/com/baidu/palo/common/InternalException.java deleted file mode 100644 index e2e65646ff..0000000000 --- a/fe/src/main/java/com/baidu/palo/common/InternalException.java +++ /dev/null @@ -1,34 +0,0 @@ -// Modifications copyright (C) 2017, Baidu.com, Inc. -// Copyright 2017 The Apache Software Foundation - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package com.baidu.palo.common; - -/** - * Thrown for internal server errors. 
- */ -public class InternalException extends Exception { - public InternalException(String msg, Throwable cause) { - super(msg, cause); - } - - public InternalException(String msg) { - super(msg); - } -} diff --git a/fe/src/main/java/com/baidu/palo/common/MarkDownParser.java b/fe/src/main/java/com/baidu/palo/common/MarkDownParser.java index eedef804e7..bbd0ff7f35 100644 --- a/fe/src/main/java/com/baidu/palo/common/MarkDownParser.java +++ b/fe/src/main/java/com/baidu/palo/common/MarkDownParser.java @@ -62,7 +62,7 @@ public class MarkDownParser { state = ParseState.START; } - public Map> parse() throws InternalException { + public Map> parse() throws UserException { while (nextToRead < lines.size()) { Map.Entry keyValue = parseOneItem(); if (keyValue == null) { @@ -77,7 +77,7 @@ public class MarkDownParser { state = ParseState.PARSED_H1; } else { // State error - throw new InternalException("Head first read is not h1."); + throw new UserException("Head first read is not h1."); } break; case PARSED_H1: @@ -90,7 +90,7 @@ public class MarkDownParser { keyValues.put(keyValue.getKey(), keyValue.getValue()); state = ParseState.PARSED_H2; } else { - throw new InternalException("Unknown head level."); + throw new UserException("Unknown head level."); } break; case PARSED_H2: @@ -103,12 +103,12 @@ public class MarkDownParser { keyValues.put(keyValue.getKey(), keyValue.getValue()); } else { // State error - throw new InternalException("Unknown head level when parsing head level(2)"); + throw new UserException("Unknown head level when parsing head level(2)"); } break; default: // State error - throw new InternalException("Unknown parse state."); + throw new UserException("Unknown parse state."); } } diff --git a/fe/src/main/java/com/baidu/palo/common/MetaNotFoundException.java b/fe/src/main/java/com/baidu/palo/common/MetaNotFoundException.java index 90437b302d..b2cb177b01 100644 --- a/fe/src/main/java/com/baidu/palo/common/MetaNotFoundException.java +++ b/fe/src/main/java/com/baidu/palo/common/MetaNotFoundException.java @@ -23,7 +23,7 @@ package com.baidu.palo.common; /** * Exception for meta info is null, like db table partition tablet replica job */ -public class MetaNotFoundException extends Exception { +public class MetaNotFoundException extends UserException { public MetaNotFoundException(String msg) { super(msg); } diff --git a/fe/src/main/java/com/baidu/palo/common/NotImplementedException.java b/fe/src/main/java/com/baidu/palo/common/NotImplementedException.java index e87768acbd..d103e91746 100644 --- a/fe/src/main/java/com/baidu/palo/common/NotImplementedException.java +++ b/fe/src/main/java/com/baidu/palo/common/NotImplementedException.java @@ -23,7 +23,7 @@ package com.baidu.palo.common; /** * Thrown for SQL statements that require as yet unimplemented functionality. 
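// Assumed shape only: UserException is referenced throughout this patch (AnalysisException,
// DdlException, MetaNotFoundException and NotImplementedException now extend it, and
// InternalException call sites are migrated to it), but its source is not included in these
// hunks. Based solely on the constructors used above (new UserException(msg) and subclasses
// delegating super(msg, cause)), a minimal base class could look like this sketch; it is an
// illustration, not the actual class from the repository.
package com.baidu.palo.common;

public class UserException extends Exception {
    public UserException(String msg) {
        super(msg);
    }

    public UserException(String msg, Throwable cause) {
        super(msg, cause);
    }
}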
*/ -public class NotImplementedException extends Exception { +public class NotImplementedException extends UserException { public NotImplementedException(String msg) { super(msg); } diff --git a/fe/src/main/java/com/baidu/palo/common/proc/CloneProcNode.java b/fe/src/main/java/com/baidu/palo/common/proc/CloneProcNode.java index 765b7880ef..a45cd903de 100644 --- a/fe/src/main/java/com/baidu/palo/common/proc/CloneProcNode.java +++ b/fe/src/main/java/com/baidu/palo/common/proc/CloneProcNode.java @@ -34,7 +34,9 @@ public class CloneProcNode implements ProcNodeInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() .add("DbId").add("TableId").add("PartitionId").add("IndexId") .add("TabletId").add("BackendId").add("State").add("Type") - .add("Priority").add("CreateTime").add("StartTime").add("FinishTime") + .add("Priority").add("CloneVersion").add("CloneVersionHash") + .add("TaskFailTimes") + .add("CreateTime").add("StartTime").add("FinishTime") .add("Timeout(s)").add("FailMsg") .build(); diff --git a/fe/src/main/java/com/baidu/palo/common/proc/JobsProcDir.java b/fe/src/main/java/com/baidu/palo/common/proc/JobsProcDir.java index 73bf34ca1b..2721f8ce21 100644 --- a/fe/src/main/java/com/baidu/palo/common/proc/JobsProcDir.java +++ b/fe/src/main/java/com/baidu/palo/common/proc/JobsProcDir.java @@ -120,9 +120,9 @@ public class JobsProcDir implements ProcDirInterface { // delete pendingNum = 0; - runningNum = 0; - finishedNum = load.getDeleteInfoNum(dbId); - cancelledNum = 0; + runningNum = load.getDeleteJobNumByState(dbId, com.baidu.palo.load.LoadJob.JobState.LOADING); + finishedNum = load.getDeleteJobNumByState(dbId, com.baidu.palo.load.LoadJob.JobState.FINISHED); + cancelledNum = load.getDeleteJobNumByState(dbId, com.baidu.palo.load.LoadJob.JobState.CANCELLED); totalNum = pendingNum + runningNum + finishedNum + cancelledNum; result.addRow(Lists.newArrayList(DELETE, pendingNum.toString(), runningNum.toString(), finishedNum.toString(), cancelledNum.toString(), totalNum.toString())); @@ -130,7 +130,8 @@ public class JobsProcDir implements ProcDirInterface { // rollup RollupHandler rollupHandler = Catalog.getInstance().getRollupHandler(); pendingNum = rollupHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.PENDING, dbId); - runningNum = rollupHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.RUNNING, dbId); + runningNum = rollupHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.RUNNING, dbId) + + rollupHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.FINISHING, dbId); finishedNum = rollupHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.FINISHED, dbId); cancelledNum = rollupHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.CANCELLED, dbId); totalNum = pendingNum + runningNum + finishedNum + cancelledNum; @@ -140,7 +141,8 @@ public class JobsProcDir implements ProcDirInterface { // schema change SchemaChangeHandler schemaChangeHandler = Catalog.getInstance().getSchemaChangeHandler(); pendingNum = schemaChangeHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.PENDING, dbId); - runningNum = schemaChangeHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.RUNNING, dbId); + runningNum = schemaChangeHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.RUNNING, dbId) + + schemaChangeHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.FINISHING, dbId); finishedNum = schemaChangeHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.FINISHED, dbId); cancelledNum = 
schemaChangeHandler.getAlterJobNum(com.baidu.palo.alter.AlterJob.JobState.CANCELLED, dbId); totalNum = pendingNum + runningNum + finishedNum + cancelledNum; diff --git a/fe/src/main/java/com/baidu/palo/common/proc/ProcService.java b/fe/src/main/java/com/baidu/palo/common/proc/ProcService.java index ee7aa2042e..bc7979eb06 100644 --- a/fe/src/main/java/com/baidu/palo/common/proc/ProcService.java +++ b/fe/src/main/java/com/baidu/palo/common/proc/ProcService.java @@ -46,6 +46,7 @@ public final class ProcService { root.register("frontends", new FrontendsProcNode(Catalog.getInstance())); root.register("brokers", Catalog.getInstance().getBrokerMgr().getProcNode()); root.register("load_error_hub_url", new LoadErrorProcNode(Catalog.getInstance())); + root.register("transactions", new TransDbProcDir(Catalog.getInstance())); root.register("monitor", new MonitorProcDir()); root.register("cluster_load_statistic", new ClusterLoadStatisticProcDir()); root.register("current_queries", new CurrentQueryStatisticsProcDir()); diff --git a/fe/src/main/java/com/baidu/palo/common/proc/RollupProcDir.java b/fe/src/main/java/com/baidu/palo/common/proc/RollupProcDir.java index 1b2a7c9b67..02188fb6f2 100644 --- a/fe/src/main/java/com/baidu/palo/common/proc/RollupProcDir.java +++ b/fe/src/main/java/com/baidu/palo/common/proc/RollupProcDir.java @@ -31,12 +31,11 @@ import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; public class RollupProcDir implements ProcDirInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() - .add("JobId").add("TableName").add("CreateTime").add("FinishedTime") + .add("JobId").add("TableName").add("TransactionId").add("CreateTime").add("FinishedTime") .add("BaseIndexName").add("RollupIndexName").add("State").add("Msg") .add("Progress") .build(); diff --git a/fe/src/main/java/com/baidu/palo/common/proc/SchemaChangeProcNode.java b/fe/src/main/java/com/baidu/palo/common/proc/SchemaChangeProcNode.java index e9a3ecd60b..baa66d5f04 100644 --- a/fe/src/main/java/com/baidu/palo/common/proc/SchemaChangeProcNode.java +++ b/fe/src/main/java/com/baidu/palo/common/proc/SchemaChangeProcNode.java @@ -20,6 +20,14 @@ package com.baidu.palo.common.proc; +import com.baidu.palo.catalog.Database; +import com.baidu.palo.common.AnalysisException; +import com.baidu.palo.alter.SchemaChangeHandler; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.ArrayList; import com.baidu.palo.alter.SchemaChangeHandler; import com.baidu.palo.catalog.Database; import com.baidu.palo.common.AnalysisException; @@ -32,7 +40,7 @@ import java.util.List; public class SchemaChangeProcNode implements ProcNodeInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() - .add("JobId").add("TableName").add("CreateTime").add("FinishTime") + .add("JobId").add("TableName").add("TransactionId").add("CreateTime").add("FinishTime") .add("IndexName").add("IndexState").add("State").add("Msg") .add("Progress") .build(); diff --git a/fe/src/main/java/com/baidu/palo/common/proc/TableProcDir.java b/fe/src/main/java/com/baidu/palo/common/proc/TableProcDir.java index 20d0b11e1b..b88b425b60 100644 --- a/fe/src/main/java/com/baidu/palo/common/proc/TableProcDir.java +++ b/fe/src/main/java/com/baidu/palo/common/proc/TableProcDir.java @@ -21,6 +21,7 @@ package com.baidu.palo.common.proc; import com.baidu.palo.catalog.Database; 
+import com.baidu.palo.catalog.EsTable; import com.baidu.palo.catalog.OlapTable; import com.baidu.palo.catalog.Table; import com.baidu.palo.catalog.Table.TableType; @@ -82,10 +83,13 @@ public class TableProcDir implements ProcDirInterface { } if (entryName.equals(PARTITIONS)) { - if (table.getType() != TableType.OLAP) { - throw new AnalysisException("Table[" + table.getName() + "] is not a OLAP table"); + if (table.getType() == TableType.OLAP) { + return new PartitionsProcDir(db, (OlapTable) table); + } else if (table.getType() == TableType.ELASTICSEARCH) { + return new EsPartitionsProcDir(db, (EsTable) table); + } else { + throw new AnalysisException("Table[" + table.getName() + "] is not a OLAP or ELASTICSEARCH table"); } - return new PartitionsProcDir(db, (OlapTable) table); } else if (entryName.equals(INDEX_SCHEMA)) { return new IndexInfoProcDir(db, table); } else { diff --git a/fe/src/main/java/com/baidu/palo/common/proc/TabletsProcDir.java b/fe/src/main/java/com/baidu/palo/common/proc/TabletsProcDir.java index cf8eea3a3f..c7b114ad9b 100644 --- a/fe/src/main/java/com/baidu/palo/common/proc/TabletsProcDir.java +++ b/fe/src/main/java/com/baidu/palo/common/proc/TabletsProcDir.java @@ -1,6 +1,5 @@ // Modifications copyright (C) 2017, Baidu.com, Inc. // Copyright 2017 The Apache Software Foundation - // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -17,13 +16,12 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - package com.baidu.palo.common.proc; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Database; -import com.baidu.palo.catalog.MaterializedIndex; import com.baidu.palo.catalog.Replica; +import com.baidu.palo.catalog.MaterializedIndex; import com.baidu.palo.catalog.Tablet; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.util.ListComparator; @@ -46,9 +44,10 @@ import java.util.List; public class TabletsProcDir implements ProcDirInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() .add("TabletId").add("ReplicaId").add("BackendId").add("HostName").add("Version") - .add("VersionHash").add("DataSize").add("RowCount").add("State") + .add("VersionHash").add("LastSuccessVersion").add("LastSuccessVersionHash") + .add("LastFailedVersion").add("LastFailedTime").add("DataSize").add("RowCount").add("State") .add("LastConsistencyCheckTime").add("CheckVersion").add("CheckVersionHash") - .add("VersionCount") + .add("VersionCount") .build(); private Database db; @@ -79,6 +78,10 @@ public class TabletsProcDir implements ProcDirInterface { tabletInfo.add(-1); tabletInfo.add(-1); tabletInfo.add(-1); + tabletInfo.add(-1); + tabletInfo.add("N/A"); + tabletInfo.add(-1); + tabletInfo.add(-1); tabletInfo.add("N/A"); tabletInfo.add("N/A"); tabletInfo.add(-1); @@ -95,10 +98,10 @@ public class TabletsProcDir implements ProcDirInterface { long backendId = replica.getBackendId(); tabletInfo.add(replica.getBackendId()); Backend backend = Catalog.getCurrentSystemInfo().getBackend(backendId); - // backend may be dropped concurrently, ignore it. - if (backend == null) { - continue; - } + // backend may be dropped concurrently, ignore it. 
+ if (backend == null) { + continue; + } String hostName = null; try { InetAddress address = InetAddress.getByName(backend.getHost()); @@ -109,6 +112,10 @@ public class TabletsProcDir implements ProcDirInterface { tabletInfo.add(hostName); tabletInfo.add(replica.getVersion()); tabletInfo.add(replica.getVersionHash()); + tabletInfo.add(replica.getLastSuccessVersion()); + tabletInfo.add(replica.getLastSuccessVersionHash()); + tabletInfo.add(replica.getLastFailedVersion()); + tabletInfo.add(TimeUtils.longToTimeString(replica.getLastFailedTimestamp())); tabletInfo.add(replica.getDataSize()); tabletInfo.add(replica.getRowCount()); tabletInfo.add(replica.getState()); @@ -116,7 +123,7 @@ public class TabletsProcDir implements ProcDirInterface { tabletInfo.add(TimeUtils.longToTimeString(tablet.getLastCheckTime())); tabletInfo.add(tablet.getCheckedVersion()); tabletInfo.add(tablet.getCheckedVersionHash()); - tabletInfo.add(replica.getVersionCount()); + tabletInfo.add(replica.getVersionCount()); tabletInfos.add(tabletInfo); } diff --git a/fe/src/main/java/com/baidu/palo/common/util/BrokerUtil.java b/fe/src/main/java/com/baidu/palo/common/util/BrokerUtil.java index dd6c50d69d..1a97262ad5 100644 --- a/fe/src/main/java/com/baidu/palo/common/util/BrokerUtil.java +++ b/fe/src/main/java/com/baidu/palo/common/util/BrokerUtil.java @@ -5,7 +5,7 @@ import com.baidu.palo.catalog.BrokerMgr; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ClientPool; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.service.FrontendOptions; import com.baidu.palo.thrift.TBrokerFileStatus; import com.baidu.palo.thrift.TBrokerListPathRequest; @@ -25,13 +25,13 @@ public class BrokerUtil { private static final Logger LOG = LogManager.getLogger(BrokerUtil.class); public static void parseBrokerFile(String path, BrokerDesc brokerDesc, List fileStatuses) - throws InternalException { + throws UserException { BrokerMgr.BrokerAddress brokerAddress = null; try { String localIP = FrontendOptions.getLocalHostAddress(); brokerAddress = Catalog.getInstance().getBrokerMgr().getBroker(brokerDesc.getName(), localIP); } catch (AnalysisException e) { - throw new InternalException(e.getMessage()); + throw new UserException(e.getMessage()); } TNetworkAddress address = new TNetworkAddress(brokerAddress.ip, brokerAddress.port); TPaloBrokerService.Client client = null; @@ -41,7 +41,7 @@ public class BrokerUtil { try { client = ClientPool.brokerPool.borrowObject(address); } catch (Exception e1) { - throw new InternalException("Create connection to broker(" + address + ") failed."); + throw new UserException("Create connection to broker(" + address + ") failed."); } } boolean failed = true; @@ -56,7 +56,7 @@ public class BrokerUtil { tBrokerListResponse = client.listPath(request); } if (tBrokerListResponse.getOpStatus().getStatusCode() != TBrokerOperationStatusCode.OK) { - throw new InternalException("Broker list path failed.path=" + path + throw new UserException("Broker list path failed.path=" + path + ",broker=" + address + ",msg=" + tBrokerListResponse.getOpStatus().getMessage()); } failed = false; @@ -68,7 +68,7 @@ public class BrokerUtil { } } catch (TException e) { LOG.warn("Broker list path exception, path={}, address={}, exception={}", path, address, e); - throw new InternalException("Broker list path exception.path=" + path + ",broker=" + address); + throw new UserException("Broker list path exception.path=" + path 
+ ",broker=" + address); } finally { if (failed) { ClientPool.brokerPool.invalidateObject(address, client); diff --git a/fe/src/main/java/com/baidu/palo/common/util/Util.java b/fe/src/main/java/com/baidu/palo/common/util/Util.java index 0b9f3f9332..0bdb8e38cb 100644 --- a/fe/src/main/java/com/baidu/palo/common/util/Util.java +++ b/fe/src/main/java/com/baidu/palo/common/util/Util.java @@ -39,6 +39,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.Set; import java.util.zip.Adler32; @@ -268,6 +269,14 @@ public class Util { return Math.abs((int) adler32.getValue()); } + public static long generateVersionHash() { + return Math.abs(new Random().nextLong()); + } + + public static int generateSchemaHash() { + return Math.abs(new Random().nextInt()); + } + /** * Chooses k unique random elements from a population sequence */ diff --git a/fe/src/main/java/com/baidu/palo/http/HttpServer.java b/fe/src/main/java/com/baidu/palo/http/HttpServer.java index 8b723a4cd0..770faaf5e1 100755 --- a/fe/src/main/java/com/baidu/palo/http/HttpServer.java +++ b/fe/src/main/java/com/baidu/palo/http/HttpServer.java @@ -35,10 +35,12 @@ import com.baidu.palo.http.meta.MetaService.PutAction; import com.baidu.palo.http.meta.MetaService.RoleAction; import com.baidu.palo.http.meta.MetaService.VersionAction; import com.baidu.palo.http.rest.BootstrapFinishAction; +import com.baidu.palo.http.rest.CancelStreamLoad; import com.baidu.palo.http.rest.CheckDecommissionAction; import com.baidu.palo.http.rest.GetDdlStmtAction; import com.baidu.palo.http.rest.GetLoadInfoAction; import com.baidu.palo.http.rest.GetLogFileAction; +import com.baidu.palo.http.rest.GetStreamLoadState; import com.baidu.palo.http.rest.HealthAction; import com.baidu.palo.http.rest.LoadAction; import com.baidu.palo.http.rest.MetaReplayerCheckAction; @@ -102,6 +104,8 @@ public class HttpServer { GetDdlStmtAction.registerAction(controller); MigrationAction.registerAction(controller); StorageTypeCheckAction.registerAction(controller); + CancelStreamLoad.registerAction(controller); + GetStreamLoadState.registerAction(controller); // add web action IndexAction.registerAction(controller); diff --git a/fe/src/main/java/com/baidu/palo/http/meta/MetaService.java b/fe/src/main/java/com/baidu/palo/http/meta/MetaService.java index 40c0b8b319..808b3ce37c 100644 --- a/fe/src/main/java/com/baidu/palo/http/meta/MetaService.java +++ b/fe/src/main/java/com/baidu/palo/http/meta/MetaService.java @@ -16,7 +16,6 @@ package com.baidu.palo.http.meta; import com.baidu.palo.catalog.Catalog; -import com.baidu.palo.catalog.Database; import com.baidu.palo.common.Config; import com.baidu.palo.ha.FrontendNodeType; import com.baidu.palo.http.ActionController; @@ -29,9 +28,7 @@ import com.baidu.palo.persist.Storage; import com.baidu.palo.persist.StorageInfo; import com.baidu.palo.system.Frontend; -import com.google.common.base.Preconditions; import com.google.common.base.Strings; -import com.google.common.collect.Maps; import com.google.gson.Gson; import org.apache.logging.log4j.LogManager; @@ -41,8 +38,6 @@ import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.OutputStream; -import java.util.List; -import java.util.Map; import io.netty.handler.codec.http.HttpMethod; import io.netty.handler.codec.http.HttpResponseStatus; @@ -326,62 +321,11 @@ public class MetaService { */ // TODO: Still need to lock ClusterInfoService to prevent add or drop Backends - 
LOG.info("begin to dump meta data."); - Catalog catalog = Catalog.getInstance(); - Map lockedDbMap = Maps.newTreeMap(); - - String dumpFilePath; - catalog.readLock(); - try { - List dbNames = catalog.getDbNames(); - if (dbNames == null || dbNames.isEmpty()) { - return; - } - - // sort all dbs - for (String dbName : dbNames) { - Database db = catalog.getDb(dbName); - Preconditions.checkNotNull(db); - lockedDbMap.put(dbName, db); - } - - // lock all dbs - for (Database db : lockedDbMap.values()) { - db.readLock(); - } - LOG.info("acquired all the dbs' read lock."); - - catalog.getAlterInstance().getRollupHandler().readLock(); - catalog.getAlterInstance().getSchemaChangeHandler().readLock(); - catalog.getLoadInstance().readLock(); - - LOG.info("acquired all jobs' read lock."); - long journalId = catalog.getMaxJournalId(); - File dumpFile = new File(Config.meta_dir, "image." + journalId); - dumpFilePath = dumpFile.getAbsolutePath(); - try { - LOG.info("begin to dump {}", dumpFilePath); - catalog.saveImage(dumpFile, journalId); - } catch (IOException e) { - LOG.error(e); - } - - } finally { - // unlock all - - catalog.getLoadInstance().readUnlock(); - catalog.getAlterInstance().getSchemaChangeHandler().readUnLock(); - catalog.getAlterInstance().getRollupHandler().readUnLock(); - - for (Database db : lockedDbMap.values()) { - db.readUnlock(); - } - - catalog.readUnlock(); + String dumpFilePath = Catalog.getInstance().dumpImage(); + if (dumpFilePath == null) { + response.appendContent("dump failed. " + dumpFilePath); } - LOG.info("dump finished."); - response.appendContent("dump finished. " + dumpFilePath); writeResponse(request, response); return; diff --git a/fe/src/main/java/com/baidu/palo/http/rest/LoadAction.java b/fe/src/main/java/com/baidu/palo/http/rest/LoadAction.java index 6baa2dfc03..8b95740a3b 100644 --- a/fe/src/main/java/com/baidu/palo/http/rest/LoadAction.java +++ b/fe/src/main/java/com/baidu/palo/http/rest/LoadAction.java @@ -47,10 +47,16 @@ public class LoadAction extends RestBaseAction { public static final String SUB_LABEL_NAME_PARAM = "sub_label"; private ExecuteEnv execEnv; + private boolean isStreamLoad = false; public LoadAction(ActionController controller, ExecuteEnv execEnv) { + this(controller, execEnv, false); + } + + public LoadAction(ActionController controller, ExecuteEnv execEnv, boolean isStreamLoad) { super(controller); this.execEnv = execEnv; + this.isStreamLoad = isStreamLoad; } public static void registerAction(ActionController controller) throws IllegalArgException { @@ -58,11 +64,16 @@ public class LoadAction extends RestBaseAction { LoadAction action = new LoadAction(controller, execEnv); controller.registerHandler(HttpMethod.PUT, "/api/{" + DB_NAME_PARAM + "}/{" + TABLE_NAME_PARAM + "}/_load", action); + + controller.registerHandler(HttpMethod.PUT, + "/api/{" + DB_NAME_PARAM + "}/{" + TABLE_NAME_PARAM + "}/_stream_load", + new LoadAction(controller, execEnv, true)); } @Override - public void executeWithoutPassword(AuthorizationInfo authInfo, BaseRequest request, BaseResponse response) - throws DdlException { + public void executeWithoutPassword(AuthorizationInfo authInfo, + BaseRequest request, BaseResponse response) throws DdlException { + // A 'Load' request must have 100-continue header if (!request.getRequest().headers().contains(HttpHeaders.Names.EXPECT)) { throw new DdlException("There is no 100-continue header"); @@ -84,18 +95,23 @@ public class LoadAction extends RestBaseAction { } String fullDbName = ClusterNamespace.getFullName(authInfo.cluster, 
dbName); + String label = request.getSingleParameter(LABEL_NAME_PARAM); - String subLabel = request.getSingleParameter(SUB_LABEL_NAME_PARAM); - if (Strings.isNullOrEmpty(label)) { - throw new DdlException("No label selected."); + if (!isStreamLoad) { + if (Strings.isNullOrEmpty(label)) { + throw new DdlException("No label selected."); + } } // check auth checkTblAuth(authInfo, fullDbName, tableName, PrivPredicate.LOAD); - // Try to redirect to master - if (redirectToMaster(request, response)) { - return; + if (!isStreamLoad && !Strings.isNullOrEmpty(request.getSingleParameter(SUB_LABEL_NAME_PARAM))) { + // only multi mini load need to redirect to Master, because only Master has the info of table to + // the Backend which the file exists. + if (redirectToMaster(request, response)) { + return; + } } // Choose a backend sequentially. @@ -110,11 +126,15 @@ public class LoadAction extends RestBaseAction { } TNetworkAddress redirectAddr = new TNetworkAddress(backend.getHost(), backend.getHttpPort()); - if (!Strings.isNullOrEmpty(subLabel)) { - redirectAddr = execEnv.getMultiLoadMgr().redirectAddr(fullDbName, label, tableName, redirectAddr); - } - LOG.info("mini load redirect to backend: {}, label: {}", redirectAddr.toString(), label); + if (!isStreamLoad) { + String subLabel = request.getSingleParameter(SUB_LABEL_NAME_PARAM); + if (!Strings.isNullOrEmpty(subLabel)) { + redirectAddr = execEnv.getMultiLoadMgr().redirectAddr(fullDbName, label, tableName, redirectAddr); + } + } + + LOG.info("redirect load action to destination={}", redirectAddr.toString()); redirectTo(request, response, redirectAddr); } } diff --git a/fe/src/main/java/com/baidu/palo/http/rest/MultiAbort.java b/fe/src/main/java/com/baidu/palo/http/rest/MultiAbort.java index ff5e407ab7..a237c0ab76 100644 --- a/fe/src/main/java/com/baidu/palo/http/rest/MultiAbort.java +++ b/fe/src/main/java/com/baidu/palo/http/rest/MultiAbort.java @@ -39,14 +39,15 @@ public class MultiAbort extends RestBaseAction { this.execEnv = execEnv; } - public static void registerAction (ActionController controller) throws IllegalArgException { + public static void registerAction(ActionController controller) throws IllegalArgException { ExecuteEnv executeEnv = ExecuteEnv.getInstance(); MultiAbort action = new MultiAbort(controller, executeEnv); controller.registerHandler(HttpMethod.POST, "/api/{db}/_multi_abort", action); } @Override - public void execute(BaseRequest request, BaseResponse response) throws DdlException { + public void executeWithoutPassword(AuthorizationInfo authInfo, BaseRequest request, BaseResponse response) + throws DdlException { String db = request.getSingleParameter(DB_KEY); if (Strings.isNullOrEmpty(db)) { throw new DdlException("No database selected"); @@ -56,10 +57,10 @@ public class MultiAbort extends RestBaseAction { throw new DdlException("No label selected"); } - AuthorizationInfo authInfo = getAuthorizationInfo(request); String fullDbName = ClusterNamespace.getFullName(authInfo.cluster, db); checkDbAuth(authInfo, fullDbName, PrivPredicate.LOAD); + // only Master has these load info if (redirectToMaster(request, response)) { return; } @@ -68,3 +69,4 @@ public class MultiAbort extends RestBaseAction { sendResult(request, response, RestBaseResult.getOk()); } } + diff --git a/fe/src/main/java/com/baidu/palo/http/rest/MultiCommit.java b/fe/src/main/java/com/baidu/palo/http/rest/MultiCommit.java index 7ef45a944e..8b872043da 100644 --- a/fe/src/main/java/com/baidu/palo/http/rest/MultiCommit.java +++ 
b/fe/src/main/java/com/baidu/palo/http/rest/MultiCommit.java @@ -39,14 +39,15 @@ public class MultiCommit extends RestBaseAction { this.execEnv = execEnv; } - public static void registerAction (ActionController controller) throws IllegalArgException { + public static void registerAction(ActionController controller) throws IllegalArgException { ExecuteEnv executeEnv = ExecuteEnv.getInstance(); MultiCommit action = new MultiCommit(controller, executeEnv); controller.registerHandler(HttpMethod.POST, "/api/{db}/_multi_commit", action); } @Override - public void execute(BaseRequest request, BaseResponse response) throws DdlException { + public void executeWithoutPassword(AuthorizationInfo authInfo, BaseRequest request, BaseResponse response) + throws DdlException { String db = request.getSingleParameter(DB_KEY); if (Strings.isNullOrEmpty(db)) { throw new DdlException("No database selected"); @@ -56,10 +57,10 @@ public class MultiCommit extends RestBaseAction { throw new DdlException("No label selected"); } - AuthorizationInfo authInfo = getAuthorizationInfo(request); String fullDbName = ClusterNamespace.getFullName(authInfo.cluster, db); checkDbAuth(authInfo, fullDbName, PrivPredicate.LOAD); + // only Master has these load info if (redirectToMaster(request, response)) { return; } @@ -67,3 +68,4 @@ public class MultiCommit extends RestBaseAction { sendResult(request, response, RestBaseResult.getOk()); } } + diff --git a/fe/src/main/java/com/baidu/palo/http/rest/MultiDesc.java b/fe/src/main/java/com/baidu/palo/http/rest/MultiDesc.java index ba2dbf4d3b..1942ffb7d3 100644 --- a/fe/src/main/java/com/baidu/palo/http/rest/MultiDesc.java +++ b/fe/src/main/java/com/baidu/palo/http/rest/MultiDesc.java @@ -43,14 +43,15 @@ public class MultiDesc extends RestBaseAction { this.execEnv = execEnv; } - public static void registerAction (ActionController controller) throws IllegalArgException { + public static void registerAction(ActionController controller) throws IllegalArgException { ExecuteEnv executeEnv = ExecuteEnv.getInstance(); MultiDesc action = new MultiDesc(controller, executeEnv); controller.registerHandler(HttpMethod.POST, "/api/{db}/_multi_desc", action); } @Override - public void execute(BaseRequest request, BaseResponse response) throws DdlException { + public void executeWithoutPassword(AuthorizationInfo authInfo, BaseRequest request, BaseResponse response) + throws DdlException { String db = request.getSingleParameter(DB_KEY); if (Strings.isNullOrEmpty(db)) { throw new DdlException("No database selected"); @@ -60,10 +61,10 @@ public class MultiDesc extends RestBaseAction { throw new DdlException("No label selected"); } - AuthorizationInfo authInfo = getAuthorizationInfo(request); String fullDbName = ClusterNamespace.getFullName(authInfo.cluster, db); checkDbAuth(authInfo, fullDbName, PrivPredicate.LOAD); + // only Master has these load info if (redirectToMaster(request, response)) { return; } @@ -81,3 +82,4 @@ public class MultiDesc extends RestBaseAction { } } } + diff --git a/fe/src/main/java/com/baidu/palo/http/rest/MultiList.java b/fe/src/main/java/com/baidu/palo/http/rest/MultiList.java index 329a40fcae..c7014261cc 100644 --- a/fe/src/main/java/com/baidu/palo/http/rest/MultiList.java +++ b/fe/src/main/java/com/baidu/palo/http/rest/MultiList.java @@ -42,26 +42,28 @@ public class MultiList extends RestBaseAction { this.execEnv = execEnv; } - public static void registerAction (ActionController controller) throws IllegalArgException { + public static void registerAction(ActionController 
controller) throws IllegalArgException { ExecuteEnv executeEnv = ExecuteEnv.getInstance(); MultiList action = new MultiList(controller, executeEnv); controller.registerHandler(HttpMethod.POST, "/api/{db}/_multi_list", action); } @Override - public void execute(BaseRequest request, BaseResponse response) throws DdlException { + public void executeWithoutPassword(AuthorizationInfo authInfo, BaseRequest request, BaseResponse response) + throws DdlException { String db = request.getSingleParameter(DB_KEY); if (Strings.isNullOrEmpty(db)) { throw new DdlException("No database selected"); } - AuthorizationInfo authInfo = getAuthorizationInfo(request); String fullDbName = ClusterNamespace.getFullName(authInfo.cluster, db); checkDbAuth(authInfo, fullDbName, PrivPredicate.LOAD); + // only Master has these load info if (redirectToMaster(request, response)) { return; } + final List labels = Lists.newArrayList(); execEnv.getMultiLoadMgr().list(fullDbName, labels); sendResult(request, response, new Result(labels)); @@ -75,3 +77,4 @@ public class MultiList extends RestBaseAction { } } } + diff --git a/fe/src/main/java/com/baidu/palo/http/rest/MultiStart.java b/fe/src/main/java/com/baidu/palo/http/rest/MultiStart.java index f557993e47..7e41c1c521 100644 --- a/fe/src/main/java/com/baidu/palo/http/rest/MultiStart.java +++ b/fe/src/main/java/com/baidu/palo/http/rest/MultiStart.java @@ -44,14 +44,15 @@ public class MultiStart extends RestBaseAction { this.execEnv = execEnv; } - public static void registerAction (ActionController controller) throws IllegalArgException { + public static void registerAction(ActionController controller) throws IllegalArgException { ExecuteEnv executeEnv = ExecuteEnv.getInstance(); MultiStart action = new MultiStart(controller, executeEnv); controller.registerHandler(HttpMethod.POST, "/api/{db}/_multi_start", action); } @Override - public void execute(BaseRequest request, BaseResponse response) throws DdlException { + public void executeWithoutPassword(AuthorizationInfo authInfo, BaseRequest request, BaseResponse response) + throws DdlException { String db = request.getSingleParameter(DB_KEY); if (Strings.isNullOrEmpty(db)) { throw new DdlException("No database selected"); @@ -61,10 +62,11 @@ public class MultiStart extends RestBaseAction { throw new DdlException("No label selected"); } - AuthorizationInfo authInfo = getAuthorizationInfo(request); String fullDbName = ClusterNamespace.getFullName(authInfo.cluster, db); checkDbAuth(authInfo, fullDbName, PrivPredicate.LOAD); + // Mutli start request must redirect to master, because all following sub requests will be handled + // on Master if (redirectToMaster(request, response)) { return; } @@ -81,3 +83,4 @@ public class MultiStart extends RestBaseAction { sendResult(request, response, RestBaseResult.getOk()); } } + diff --git a/fe/src/main/java/com/baidu/palo/http/rest/MultiUnload.java b/fe/src/main/java/com/baidu/palo/http/rest/MultiUnload.java index 218d440c90..a58e4391ac 100644 --- a/fe/src/main/java/com/baidu/palo/http/rest/MultiUnload.java +++ b/fe/src/main/java/com/baidu/palo/http/rest/MultiUnload.java @@ -47,7 +47,8 @@ public class MultiUnload extends RestBaseAction { } @Override - public void execute(BaseRequest request, BaseResponse response) throws DdlException { + public void executeWithoutPassword(AuthorizationInfo authInfo, BaseRequest request, BaseResponse response) + throws DdlException { String db = request.getSingleParameter(DB_KEY); if (Strings.isNullOrEmpty(db)) { throw new DdlException("No database selected"); @@ 
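Every Multi* handler in this patch (abort, commit, desc, list, start, unload) switches from execute() to executeWithoutPassword() and drops its own getAuthorizationInfo() call. The base class is not shown here, so the following is only a sketch of the template pattern the handlers now rely on, with names taken from the new signatures:

    public abstract class RestBaseAction {  // sketch, not the real base class
        public void execute(BaseRequest request, BaseResponse response) throws DdlException {
            AuthorizationInfo authInfo = getAuthorizationInfo(request); // authenticate once, centrally
            executeWithoutPassword(authInfo, request, response);        // handler-specific logic
        }

        protected abstract void executeWithoutPassword(AuthorizationInfo authInfo,
                BaseRequest request, BaseResponse response) throws DdlException;

        protected abstract AuthorizationInfo getAuthorizationInfo(BaseRequest request)
                throws DdlException;
    }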
-61,7 +62,6 @@ public class MultiUnload extends RestBaseAction { throw new DdlException("No sub_label selected"); } - AuthorizationInfo authInfo = getAuthorizationInfo(request); String fullDbName = ClusterNamespace.getFullName(authInfo.cluster, db); checkDbAuth(authInfo, fullDbName, PrivPredicate.LOAD); diff --git a/fe/src/main/java/com/baidu/palo/journal/JournalEntity.java b/fe/src/main/java/com/baidu/palo/journal/JournalEntity.java index 327e2cd663..91de45648f 100644 --- a/fe/src/main/java/com/baidu/palo/journal/JournalEntity.java +++ b/fe/src/main/java/com/baidu/palo/journal/JournalEntity.java @@ -57,6 +57,7 @@ import com.baidu.palo.persist.TableInfo; import com.baidu.palo.qe.SessionVariable; import com.baidu.palo.system.Backend; import com.baidu.palo.system.Frontend; +import com.baidu.palo.transaction.TransactionState; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -108,6 +109,10 @@ public class JournalEntity implements Writable { data = new Text(); break; } + case OperationType.OP_SAVE_TRANSACTION_ID: { + data = new Text(); + break; + } case OperationType.OP_CREATE_DB: { data = new Database(); break; @@ -154,6 +159,8 @@ public class JournalEntity implements Writable { break; } case OperationType.OP_START_ROLLUP: + case OperationType.OP_FINISHING_ROLLUP: + case OperationType.OP_FINISHING_SCHEMA_CHANGE: case OperationType.OP_FINISH_ROLLUP: case OperationType.OP_CANCEL_ROLLUP: case OperationType.OP_START_SCHEMA_CHANGE: @@ -298,7 +305,8 @@ public class JournalEntity implements Writable { break; } case OperationType.OP_CREATE_CLUSTER: { - data = new Cluster(); + data = Cluster.read(in); + needRead = false; break; } case OperationType.OP_DROP_CLUSTER: { @@ -342,6 +350,11 @@ public class JournalEntity implements Writable { data = new BackendIdsUpdateInfo(); break; } + case OperationType.OP_UPSERT_TRANSACTION_STATE: + case OperationType.OP_DELETE_TRANSACTION_STATE: { + data = new TransactionState(); + break; + } case OperationType.OP_CREATE_REPOSITORY: { data = Repository.read(in); needRead = false; diff --git a/fe/src/main/java/com/baidu/palo/journal/local/LocalJournalCursor.java b/fe/src/main/java/com/baidu/palo/journal/local/LocalJournalCursor.java index 0fe1ee093c..048837ba43 100644 --- a/fe/src/main/java/com/baidu/palo/journal/local/LocalJournalCursor.java +++ b/fe/src/main/java/com/baidu/palo/journal/local/LocalJournalCursor.java @@ -196,6 +196,12 @@ public final class LocalJournalCursor implements JournalCursor { ret.setData(text); break; } + case OperationType.OP_SAVE_TRANSACTION_ID: { + Text text = new Text(); + text.readFields(in); + ret.setData(text); + break; + } case OperationType.OP_CREATE_DB: { Database db = new Database(); db.readFields(in); diff --git a/fe/src/main/java/com/baidu/palo/load/AsyncDeleteJob.java b/fe/src/main/java/com/baidu/palo/load/AsyncDeleteJob.java index 9c6168f2d8..c12dc5312f 100644 --- a/fe/src/main/java/com/baidu/palo/load/AsyncDeleteJob.java +++ b/fe/src/main/java/com/baidu/palo/load/AsyncDeleteJob.java @@ -28,7 +28,7 @@ import com.baidu.palo.common.io.Writable; import com.baidu.palo.persist.ReplicaPersistInfo; import com.baidu.palo.task.AgentTaskQueue; import com.baidu.palo.task.PushTask; - +import com.baidu.palo.thrift.TTaskType; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; @@ -53,6 +53,8 @@ public class AsyncDeleteJob implements Writable { private long dbId; private long tableId; private long partitionId; + + private long transactionId; 
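In JournalEntity the new transaction opcodes only allocate the Writable; the shared needRead path performs the actual readFields(). LocalJournalCursor is extended here only for OP_SAVE_TRANSACTION_ID, but if it handled the transaction-state opcodes in the same explicit style, the case would presumably read:

    case OperationType.OP_UPSERT_TRANSACTION_STATE:
    case OperationType.OP_DELETE_TRANSACTION_STATE: {
        TransactionState state = new TransactionState();
        state.readFields(in);   // Writable round-trip, symmetric with the edit-log write side
        ret.setData(state);
        break;
    }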
private long partitionVersion; private long partitionVersionHash; @@ -69,6 +71,7 @@ public class AsyncDeleteJob implements Writable { tabletIds = Sets.newHashSet(); sendReplicaIdToPushTask = Maps.newHashMap(); replicaPersistInfos = Maps.newHashMap(); + transactionId = -1; } public AsyncDeleteJob(long dbId, long tableId, long partitionId, @@ -144,7 +147,8 @@ public class AsyncDeleteJob implements Writable { public void clearTasks() { for (PushTask task : sendReplicaIdToPushTask.values()) { AgentTaskQueue.removePushTask(task.getBackendId(), task.getSignature(), - task.getVersion(), task.getVersionHash(), task.getPushType()); + task.getVersion(), task.getVersionHash(), + task.getPushType(), TTaskType.PUSH); } } diff --git a/fe/src/main/java/com/baidu/palo/load/DeleteInfo.java b/fe/src/main/java/com/baidu/palo/load/DeleteInfo.java index d639a6b9a6..5e19e3da66 100644 --- a/fe/src/main/java/com/baidu/palo/load/DeleteInfo.java +++ b/fe/src/main/java/com/baidu/palo/load/DeleteInfo.java @@ -132,6 +132,11 @@ public class DeleteInfo implements Writable { public DeleteState getState() { return asyncDeleteJob == null ? DeleteState.FINISHED : asyncDeleteJob.getState(); } + + public void updatePartitionVersionInfo(long newVersion, long newVersionHash) { + this.partitionVersion = newVersion; + this.partitionVersionHash = newVersionHash; + } @Override public void write(DataOutput out) throws IOException { diff --git a/fe/src/main/java/com/baidu/palo/load/ExportJob.java b/fe/src/main/java/com/baidu/palo/load/ExportJob.java index 3cf2fcd426..1bee9b9384 100644 --- a/fe/src/main/java/com/baidu/palo/load/ExportJob.java +++ b/fe/src/main/java/com/baidu/palo/load/ExportJob.java @@ -35,9 +35,9 @@ import com.baidu.palo.catalog.Table; import com.baidu.palo.catalog.Type; import com.baidu.palo.common.Config; import com.baidu.palo.common.FeMetaVersion; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.Pair; import com.baidu.palo.common.Status; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.io.Writable; import com.baidu.palo.common.util.TimeUtils; @@ -207,7 +207,7 @@ public class ExportJob implements Writable { } } - private ScanNode genScanNode() throws InternalException { + private ScanNode genScanNode() throws UserException { ScanNode scanNode = null; switch (exportTable.getType()) { case OLAP: @@ -240,7 +240,7 @@ public class ExportJob implements Writable { return olapScanNode; } - private PlanFragment genPlanFragment(Table.TableType type, ScanNode scanNode) throws InternalException { + private PlanFragment genPlanFragment(Table.TableType type, ScanNode scanNode) throws UserException { PlanFragment fragment = null; switch (exportTable.getType()) { case OLAP: @@ -262,7 +262,7 @@ public class ExportJob implements Writable { fragment.finalize(analyzer, false); } catch (Exception e) { LOG.info("Fragment finalize failed. 
e= {}", e); - throw new InternalException("Fragment finalize failed"); + throw new UserException("Fragment finalize failed"); } return fragment; diff --git a/fe/src/main/java/com/baidu/palo/load/Load.java b/fe/src/main/java/com/baidu/palo/load/Load.java index b41db19f21..7285dc1fff 100644 --- a/fe/src/main/java/com/baidu/palo/load/Load.java +++ b/fe/src/main/java/com/baidu/palo/load/Load.java @@ -57,6 +57,7 @@ import com.baidu.palo.common.Pair; import com.baidu.palo.common.util.ListComparator; import com.baidu.palo.common.util.OrderByPair; import com.baidu.palo.common.util.TimeUtils; +import com.baidu.palo.common.util.Util; import com.baidu.palo.load.AsyncDeleteJob.DeleteState; import com.baidu.palo.load.FailMsg.CancelType; import com.baidu.palo.load.LoadJob.EtlJobType; @@ -78,6 +79,11 @@ import com.baidu.palo.thrift.TMiniLoadRequest; import com.baidu.palo.thrift.TNetworkAddress; import com.baidu.palo.thrift.TPriority; import com.baidu.palo.thrift.TPushType; +import com.baidu.palo.transaction.PartitionCommitInfo; +import com.baidu.palo.transaction.TableCommitInfo; +import com.baidu.palo.transaction.TransactionState; +import com.baidu.palo.transaction.TransactionState.LoadJobSourceType; +import com.baidu.palo.transaction.TransactionStatus; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; @@ -87,6 +93,8 @@ import com.google.common.collect.Maps; import com.google.common.collect.Sets; import com.google.gson.Gson; +import io.fabric8.kubernetes.api.model.extensions.Job; + import org.apache.commons.lang.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -101,8 +109,8 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.Random; import java.util.Set; +import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -125,9 +133,12 @@ public class Load { private Map idToLoadingLoadJob; // loadJobId to loading loadJob private Map idToQuorumFinishedLoadJob; // loadJobId to quorum finished loadJob private Set loadingPartitionIds; // loading partition id set - private Map> dbToMiniLabels; // db to mini uncommit label + // dbId -> set of (label, timestamp) + private Map> dbToMiniLabels; // db to mini uncommitted label + private Map> dbToDeleteInfos; // db to delete job list + private Map> dbToDeleteJobs; // db to delete loadJob list private Set partitionUnderDelete; // save partitions which are running delete jobs private Map idToQuorumFinishedDeleteJob; @@ -150,6 +161,7 @@ public class Load { STATE_CHANGE_MAP.put(JobState.ETL, etlDestStates); Set loadingDestStates = Sets.newHashSet(); + loadingDestStates.add(JobState.FINISHED); loadingDestStates.add(JobState.QUORUM_FINISHED); loadingDestStates.add(JobState.CANCELLED); STATE_CHANGE_MAP.put(JobState.LOADING, loadingDestStates); @@ -196,6 +208,7 @@ public class Load { loadingPartitionIds = Sets.newHashSet(); dbToMiniLabels = Maps.newHashMap(); dbToDeleteInfos = Maps.newHashMap(); + dbToDeleteJobs = Maps.newHashMap(); partitionUnderDelete = Sets.newHashSet(); idToQuorumFinishedDeleteJob = Maps.newLinkedHashMap(); lock = new ReentrantReadWriteLock(true); @@ -217,7 +230,7 @@ public class Load { lock.writeLock().unlock(); } - public boolean addLoadJob(TMiniLoadRequest request) throws DdlException { + public void addLoadJob(TMiniLoadRequest request) throws DdlException { // get params String fullDbName = request.getDb(); String tableName = 
request.getTbl(); @@ -274,7 +287,6 @@ public class Load { lineDelimiter = params.get(LoadStmt.KEY_IN_PARAM_LINE_DELIMITER); } - DataDescription dataDescription = new DataDescription(tableName, partitionNames, filePaths, columnNames, columnSeparator, false, null); dataDescription.setLineDelimiter(lineDelimiter); @@ -315,7 +327,7 @@ public class Load { // try to register mini label if (!registerMiniLabel(fullDbName, label, timestamp)) { - return false; + throw new DdlException("Same data label[" + label + "] already used"); } try { @@ -323,8 +335,6 @@ public class Load { } finally { deregisterMiniLabel(fullDbName, label); } - - return true; } public void addLoadJob(LoadStmt stmt, EtlJobType etlJobType, long timestamp) throws DdlException { @@ -342,7 +352,9 @@ public class Load { // for insert select from or create as stmt public void addLoadJob(String label, String dbName, - long tableId, List fileList, long timestamp) throws DdlException { + long tableId, Map indexIdToSchemaHash, + long transactionId, + List fileList, long timestamp) throws DdlException { // get db and table Database db = Catalog.getInstance().getDb(dbName); if (db == null) { @@ -367,13 +379,8 @@ public class Load { LoadJob job = createLoadJob(stmt, EtlJobType.INSERT, db, timestamp); // add schema hash - db.readLock(); - try { - for (Map.Entry entry : table.getIndexIdToSchemaHash().entrySet()) { - job.getTableLoadInfo(tableId).addIndexSchemaHash(entry.getKey(), entry.getValue()); - } - } finally { - db.readUnlock(); + for (Map.Entry entry : indexIdToSchemaHash.entrySet()) { + job.getTableLoadInfo(tableId).addIndexSchemaHash(entry.getKey(), entry.getValue()); } // file size use -1 temporarily @@ -387,33 +394,37 @@ public class Load { status.setState(TEtlState.FINISHED); status.setFileMap(fileMap); job.setState(JobState.ETL); + job.setTransactionId(transactionId); // add load job addLoadJob(job, db); } - // This is a final step of all addLoadJob() methods + // This is the final step of all addLoadJob() methods private void addLoadJob(LoadJob job, Database db) throws DdlException { // check cluster capacity Catalog.getCurrentSystemInfo().checkClusterCapacity(db.getClusterName()); - // check db quota - db.checkQuota(); - - // check if table is in restore process - db.readLock(); - try { - for (Long tblId : job.getIdToTableLoadInfo().keySet()) { - Table tbl = db.getTable(tblId); - if (tbl != null && tbl.getType() == TableType.OLAP - && ((OlapTable) tbl).getState() == OlapTableState.RESTORE) { - throw new DdlException("Table " + tbl.getName() + " is in restore process. " - + "Can not load into it"); - } - } - } finally { - db.readUnlock(); + // for original job, check quota + // for delete job, not check + if (!job.isSyncDeleteJob()) { + db.checkQuota(); } + // check if table is in restore process + db.readLock(); + try { + for (Long tblId : job.getIdToTableLoadInfo().keySet()) { + Table tbl = db.getTable(tblId); + if (tbl != null && tbl.getType() == TableType.OLAP + && ((OlapTable) tbl).getState() == OlapTableState.RESTORE) { + throw new DdlException("Table " + tbl.getName() + " is in restore process. 
" + + "Can not load into it"); + } + } + } finally { + db.readUnlock(); + } + writeLock(); try { unprotectAddLoadJob(job, false /* not replay */); @@ -498,7 +509,7 @@ public class Load { for (DataDescription dataDescription : dataDescriptions) { // create source createSource(db, dataDescription, tableToPartitionSources, job.getDeleteFlag()); - job.addTableName(dataDescription.getTableName()); + job.addTableName(dataDescription.getTableName()); } for (Entry>> tableEntry : tableToPartitionSources.entrySet()) { long tableId = tableEntry.getKey(); @@ -570,7 +581,7 @@ public class Load { cluster = properties.get(LoadStmt.CLUSTER_PROPERTY); } - Pair clusterInfo = Catalog.getInstance().getAuth().getLoadClusterInfo( + Pair clusterInfo = Catalog.getCurrentCatalog().getAuth().getLoadClusterInfo( stmt.getUser(), cluster); cluster = clusterInfo.first; DppConfig clusterConfig = clusterInfo.second; @@ -807,46 +818,62 @@ public class Load { long dbId = job.getDbId(); String label = job.getLabel(); - long timestamp = job.getTimestamp(); if (!isReplay && getAllUnfinishedLoadJob() > Config.max_unfinished_load_job) { throw new DdlException( "Number of unfinished load jobs exceed the max number: " + Config.max_unfinished_load_job); } - - // check label exist - boolean checkMini = true; - if (job.getEtlJobType() == EtlJobType.MINI) { - // already registered, do not need check - checkMini = false; - } - checkLabelUsed(dbId, label, timestamp, checkMini); - // add job - Map> labelToLoadJobs = null; - if (dbLabelToLoadJobs.containsKey(dbId)) { - labelToLoadJobs = dbLabelToLoadJobs.get(dbId); + if (!job.isSyncDeleteJob()) { + // check label exist + boolean checkMini = true; + if (job.getEtlJobType() == EtlJobType.MINI) { + // already registered, do not need check + checkMini = false; + } + + if (isLabelUsed(dbId, label, -1, checkMini)) { + throw new DdlException("Same data label[" + label + "] already used"); + } + + // add job + Map> labelToLoadJobs = null; + if (dbLabelToLoadJobs.containsKey(dbId)) { + labelToLoadJobs = dbLabelToLoadJobs.get(dbId); + } else { + labelToLoadJobs = Maps.newHashMap(); + dbLabelToLoadJobs.put(dbId, labelToLoadJobs); + } + List labelLoadJobs = null; + if (labelToLoadJobs.containsKey(label)) { + labelLoadJobs = labelToLoadJobs.get(label); + } else { + labelLoadJobs = Lists.newArrayList(); + labelToLoadJobs.put(label, labelLoadJobs); + } + + List dbLoadJobs = null; + if (dbToLoadJobs.containsKey(dbId)) { + dbLoadJobs = dbToLoadJobs.get(dbId); + } else { + dbLoadJobs = Lists.newArrayList(); + dbToLoadJobs.put(dbId, dbLoadJobs); + } + idToLoadJob.put(jobId, job); + dbLoadJobs.add(job); + labelLoadJobs.add(job); } else { - labelToLoadJobs = Maps.newHashMap(); - dbLabelToLoadJobs.put(dbId, labelToLoadJobs); - } - List labelLoadJobs = null; - if (labelToLoadJobs.containsKey(label)) { - labelLoadJobs = labelToLoadJobs.get(label); - } else { - labelLoadJobs = Lists.newArrayList(); - labelToLoadJobs.put(label, labelLoadJobs); + List dbDeleteJobs = null; + if (dbToDeleteJobs.containsKey(dbId)) { + dbDeleteJobs = dbToDeleteJobs.get(dbId); + } else { + dbDeleteJobs = Lists.newArrayList(); + dbToDeleteJobs.put(dbId, dbDeleteJobs); + } + idToLoadJob.put(jobId, job); + dbDeleteJobs.add(job); } - List dbLoadJobs = null; - if (dbToLoadJobs.containsKey(dbId)) { - dbLoadJobs = dbToLoadJobs.get(dbId); - } else { - dbLoadJobs = Lists.newArrayList(); - dbToLoadJobs.put(dbId, dbLoadJobs); - } - idToLoadJob.put(jobId, job); - dbLoadJobs.add(job); - labelLoadJobs.add(job); + // beginTransaction Here switch 
(job.getState()) { case PENDING: @@ -876,7 +903,7 @@ public class Load { private long getAllUnfinishedLoadJob() { return idToPendingLoadJob.size() + idToEtlLoadJob.size() + idToLoadingLoadJob.size() - + idToQuorumFinishedLoadJob.size(); + + idToQuorumFinishedLoadJob.size(); } public void replayAddLoadJob(LoadJob job) throws DdlException { @@ -925,7 +952,6 @@ public class Load { } } - public boolean registerMiniLabel( String fullDbName, String label, long timestamp) throws DdlException { Database db = Catalog.getInstance().getDb(fullDbName); @@ -936,18 +962,18 @@ public class Load { long dbId = db.getId(); writeLock(); try { - if (!checkLabelUsed(dbId, label, timestamp, true)) { + if (isLabelUsed(dbId, label, -1, true)) { return false; } - Set miniLabels = null; + Map miniLabels = null; if (dbToMiniLabels.containsKey(dbId)) { miniLabels = dbToMiniLabels.get(dbId); } else { - miniLabels = Sets.newHashSet(); + miniLabels = Maps.newHashMap(); dbToMiniLabels.put(dbId, miniLabels); } - miniLabels.add(label); + miniLabels.put(label, timestamp); } finally { writeUnlock(); } @@ -968,7 +994,7 @@ public class Load { return; } - Set miniLabels = dbToMiniLabels.get(dbId); + Map miniLabels = dbToMiniLabels.get(dbId); miniLabels.remove(label); if (miniLabels.isEmpty()) { dbToMiniLabels.remove(dbId); @@ -978,7 +1004,7 @@ public class Load { } } - public void checkLabelUsed(String fullDbName, String label, long timestamp) throws DdlException { + public boolean isLabelUsed(String fullDbName, String label, long timestamp) throws DdlException { Database db = Catalog.getInstance().getDb(fullDbName); if (db == null) { throw new DdlException("Db does not exist. name: " + fullDbName); @@ -986,15 +1012,25 @@ public class Load { readLock(); try { - checkLabelUsed(db.getId(), label, timestamp, true); + return isLabelUsed(db.getId(), label, timestamp, true); } finally { readUnlock(); } } - private boolean checkLabelUsed(long dbId, String label, - long timestamp, boolean checkMini) throws DdlException { - final String labelUsedMsg = "Same data label[" + label + "] already used"; + /* + * if timestamp does not equals to -1, this is a retry request: + * 1. if label has been used, and job's timestamp equals to the given one, return true. + * 2. if label has been used, but timestamp is not equal, return false. The caller will finally call + * this method again will timestamp == -1, and return if label has been used or not. + * 3. if label does not exist, just return false, as usual. + * + * if timestamp equals to -1, return true if label has been used, otherwise, return false. + * + * throw DdlException if encounter other exception. + */ + private boolean isLabelUsed(long dbId, String label, long timestamp, boolean checkMini) + throws DdlException { // check dbLabelToLoadJobs if (dbLabelToLoadJobs.containsKey(dbId)) { @@ -1004,10 +1040,17 @@ public class Load { for (LoadJob oldJob : labelLoadJobs) { JobState oldJobState = oldJob.getState(); if (oldJobState != JobState.CANCELLED) { - if (timestamp == oldJob.getTimestamp()) { - return false; + if (timestamp == -1) { + return true; } else { - throw new DdlException(labelUsedMsg); + if (timestamp == oldJob.getTimestamp()) { + // this timestamp is used to verify if this label check is a retry request from backend. 
+ // if the timestamp in request is same as timestamp in existing load job, + // which means this load job is already submitted + return true; + } else { + return false; + } } } } @@ -1017,14 +1060,25 @@ public class Load { // check dbToMiniLabel if (checkMini) { if (dbToMiniLabels.containsKey(dbId)) { - Set uncommittedLabels = dbToMiniLabels.get(dbId); - if (uncommittedLabels.contains(label)) { - throw new DdlException(labelUsedMsg); + Map uncommittedLabels = dbToMiniLabels.get(dbId); + if (uncommittedLabels.containsKey(label)) { + if (timestamp == -1) { + return true; + } else { + if (timestamp == uncommittedLabels.get(label)) { + // this timestamp is used to verify if this label check is a retry request from backend. + // if the timestamp in request is same as timestamp in existing load job, + // which means this load job is already submitted + return true; + } else { + return false; + } + } } } } - return true; + return false; } public boolean cancelLoadJob(CancelLoadStmt stmt) throws DdlException { @@ -1061,24 +1115,24 @@ public class Load { readUnlock(); } - // check auth here, cause we need table info - Set tableNames = job.getTableNames(); - if (tableNames.isEmpty()) { - // forward compatibility - if (!Catalog.getCurrentCatalog().getAuth().checkDbPriv(ConnectContext.get(), dbName, - PrivPredicate.LOAD)) { - ErrorReport.reportDdlException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "CANCEL LOAD"); - } - } else { - for (String tblName : tableNames) { - if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, tblName, - PrivPredicate.LOAD)) { - ErrorReport.reportDdlException(ErrorCode.ERR_TABLEACCESS_DENIED_ERROR, "CANCEL LOAD", - ConnectContext.get().getQualifiedUser(), - ConnectContext.get().getRemoteIP(), tblName); - } - } - } + // check auth here, cause we need table info + Set tableNames = job.getTableNames(); + if (tableNames.isEmpty()) { + // forward compatibility + if (!Catalog.getCurrentCatalog().getAuth().checkDbPriv(ConnectContext.get(), dbName, + PrivPredicate.LOAD)) { + ErrorReport.reportDdlException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "CANCEL LOAD"); + } + } else { + for (String tblName : tableNames) { + if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, tblName, + PrivPredicate.LOAD)) { + ErrorReport.reportDdlException(ErrorCode.ERR_TABLEACCESS_DENIED_ERROR, "CANCEL LOAD", + ConnectContext.get().getQualifiedUser(), + ConnectContext.get().getRemoteIP(), tblName); + } + } + } // cancel job if (!cancelLoadJob(job, CancelType.USER_CANCEL, "user cancel")) { @@ -1092,7 +1146,7 @@ public class Load { // update job to cancelled JobState srcState = job.getState(); if (!updateLoadJobState(job, JobState.CANCELLED, cancelType, msg)) { - LOG.warn("cancel load job failed. job: {}", job); + LOG.warn("cancel load job failed. 
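Condensing the two lookups above, the retry contract of isLabelUsed() can be stated in one place. lookupExistingTimestamp is a hypothetical stand-in for the dbLabelToLoadJobs and dbToMiniLabels searches:

    private boolean isLabelUsedSketch(long dbId, String label, long timestamp) {
        Long existing = lookupExistingTimestamp(dbId, label);   // null if the label was never used
        if (existing == null) {
            return false;
        }
        if (timestamp == -1) {
            return true;                                        // plain check: label is taken
        }
        // a retry from a Backend carries the original timestamp: if it matches, the same
        // load was already submitted and the retry is treated as already-done; otherwise
        // report "not used" and let the caller re-check with timestamp == -1
        return timestamp == existing;
    }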
job: {}", job, new Exception()); return false; } @@ -1187,7 +1241,16 @@ public class Load { public int getLoadJobNumber() { readLock(); try { - return idToLoadJob.size(); + if (idToLoadJob == null) { + return 0; + } + int loadJobNum = 0; + for (LoadJob loadJob : idToLoadJob.values()) { + if (!loadJob.isSyncDeleteJob()) { + ++ loadJobNum; + } + } + return loadJobNum; } finally { readUnlock(); } @@ -1201,10 +1264,30 @@ public class Load { return dbToLoadJobs; } + public Map> getDbToDeleteJobs() { + return dbToDeleteJobs; + } + public Map> getDbToDeleteInfos() { return dbToDeleteInfos; } + public Set getTxnIdsByDb(Long dbId) { + Set txnIds = Sets.newHashSet(); + readLock(); + try { + List jobs = dbToLoadJobs.get(dbId); + if (jobs != null) { + for (LoadJob loadJob : jobs) { + txnIds.add(loadJob.getTransactionId()); + } + } + } finally { + readUnlock(); + } + return txnIds; + } + public List getDbLoadJobs(long dbId) { readLock(); try { @@ -1301,13 +1384,13 @@ public class Load { return loadJobInfos; } - long start = System.currentTimeMillis(); - LOG.debug("begin to get load job info, size: {}", loadJobs.size()); + long start = System.currentTimeMillis(); + LOG.debug("begin to get load job info, size: {}", loadJobs.size()); for (LoadJob loadJob : loadJobs) { // filter first String label = loadJob.getLabel(); JobState state = loadJob.getState(); - + if (labelValue != null) { if (accurateMatch) { if (!label.equals(labelValue)) { @@ -1319,35 +1402,35 @@ public class Load { } } } - + if (states != null) { if (!states.contains(state)) { continue; } } - // check auth - Set tableNames = loadJob.getTableNames(); - if (tableNames.isEmpty()) { - // forward compatibility - if (!Catalog.getCurrentCatalog().getAuth().checkDbPriv(ConnectContext.get(), dbName, - PrivPredicate.SHOW)) { - continue; - } - } else { - boolean auth = true; - for (String tblName : tableNames) { - if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, - tblName, PrivPredicate.SHOW)) { - auth = false; - break; - } - } - if (!auth) { - continue; - } - } - + // check auth + Set tableNames = loadJob.getTableNames(); + if (tableNames.isEmpty()) { + // forward compatibility + if (!Catalog.getCurrentCatalog().getAuth().checkDbPriv(ConnectContext.get(), dbName, + PrivPredicate.SHOW)) { + continue; + } + } else { + boolean auth = true; + for (String tblName : tableNames) { + if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, + tblName, PrivPredicate.SHOW)) { + auth = false; + break; + } + } + if (!auth) { + continue; + } + } + List jobInfo = new ArrayList(); // jobId @@ -1434,7 +1517,7 @@ public class Load { loadJobInfos.add(jobInfo); } // end for loadJobs - LOG.debug("finished to get load job info, cost: {}", (System.currentTimeMillis() - start)); + LOG.debug("finished to get load job info, cost: {}", (System.currentTimeMillis() - start)); } finally { readUnlock(); } @@ -1451,14 +1534,14 @@ public class Load { return loadJobInfos; } - public LoadJob getLatestJobIdByLabel(long dbId, String labelValue) { - LoadJob job = null; + public long getLatestJobIdByLabel(long dbId, String labelValue) { + LoadJob job = null; long jobId = 0; try { readLock(); List loadJobs = this.dbToLoadJobs.get(dbId); if (loadJobs == null) { - return null; + return 0; } for (LoadJob loadJob : loadJobs) { @@ -1474,14 +1557,14 @@ public class Load { if (currJobId > jobId) { jobId = currJobId; - job = loadJob; + job = loadJob; } } } finally { readUnlock(); } - return job; + return jobId; } public List> 
getLoadJobUnfinishedInfo(long jobId) { @@ -1584,8 +1667,8 @@ public class Load { // Note: althrough this.loadErrorHubInfo is volatile, no need to lock. // but editlog need be locked public void changeLoadErrorHubInfo(LoadErrorHub.Param info) { - writeLock(); - try { + writeLock(); + try { this.loadErrorHubInfo = info; Catalog.getInstance().getEditLog().logSetLoadErrorHub(info); } finally { @@ -1595,7 +1678,7 @@ public class Load { public static class JobInfo { public String dbName; - public Set tblNames = Sets.newHashSet(); + public Set tblNames = Sets.newHashSet(); public String label; public String clusterName; public JobState state; @@ -1613,7 +1696,7 @@ public class Load { // result saved in info public void getJobInfo(JobInfo info) throws DdlException { String fullDbName = ClusterNamespace.getFullName(info.clusterName, info.dbName); - info.dbName = fullDbName; + info.dbName = fullDbName; Database db = Catalog.getInstance().getDb(fullDbName); if (db == null) { throw new DdlException("Unknown database(" + info.dbName + ")"); @@ -1631,10 +1714,10 @@ public class Load { // only the last one should be running LoadJob job = loadJobs.get(loadJobs.size() - 1); - if (!job.getTableNames().isEmpty()) { - info.tblNames.addAll(job.getTableNames()); - } - + if (!job.getTableNames().isEmpty()) { + info.tblNames.addAll(job.getTableNames()); + } + info.state = job.getState(); if (info.state == JobState.QUORUM_FINISHED) { info.state = JobState.FINISHED; @@ -1648,83 +1731,85 @@ public class Load { } public void unprotectQuorumLoadJob(LoadJob job, Database db) { - // remove loading partitions - removeLoadingPartitions(job); - - // Update database information first - Map replicaInfos = job.getReplicaPersistInfos(); - if (replicaInfos != null) { - for (ReplicaPersistInfo info : replicaInfos.values()) { - OlapTable table = (OlapTable) db.getTable(info.getTableId()); - if (table == null) { - LOG.warn("the table[{}] is missing", info.getIndexId()); - continue; - } - Partition partition = table.getPartition(info.getPartitionId()); - if (partition == null) { - LOG.warn("the partition[{}] is missing", info.getIndexId()); - continue; - } - MaterializedIndex index = partition.getIndex(info.getIndexId()); - if (index == null) { - LOG.warn("the index[{}] is missing", info.getIndexId()); - continue; - } - Tablet tablet = index.getTablet(info.getTabletId()); - if (tablet == null) { - LOG.warn("the tablet[{}] is missing", info.getTabletId()); - continue; - } - - Replica replica = tablet.getReplicaById(info.getReplicaId()); - if (replica == null) { - LOG.warn("the replica[{}] is missing", info.getReplicaId()); - continue; - } - replica.updateInfo(info.getVersion(), info.getVersionHash(), - info.getDataSize(), info.getRowCount()); - } - } - - long jobId = job.getId(); - Map idToTableLoadInfo = job.getIdToTableLoadInfo(); - if (idToTableLoadInfo != null) { - for (Entry tableEntry : idToTableLoadInfo.entrySet()) { - long tableId = tableEntry.getKey(); - OlapTable table = (OlapTable) db.getTable(tableId); - TableLoadInfo tableLoadInfo = tableEntry.getValue(); - for (Entry entry : tableLoadInfo.getIdToPartitionLoadInfo().entrySet()) { - long partitionId = entry.getKey(); - Partition partition = table.getPartition(partitionId); - PartitionLoadInfo partitionLoadInfo = entry.getValue(); - if (!partitionLoadInfo.isNeedLoad()) { + // in real time load replica info and partition version is set by transaction manager not by job + if (job.getTransactionId() < 0) { + // remove loading partitions + removeLoadingPartitions(job); + + // 
Update database information first + Map replicaInfos = job.getReplicaPersistInfos(); + if (replicaInfos != null) { + for (ReplicaPersistInfo info : replicaInfos.values()) { + OlapTable table = (OlapTable) db.getTable(info.getTableId()); + if (table == null) { + LOG.warn("the table[{}] is missing", info.getIndexId()); continue; } - updatePartitionVersion(partition, partitionLoadInfo.getVersion(), - partitionLoadInfo.getVersionHash(), jobId); - - // update table row count - for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { - long tableRowCount = 0L; - for (Tablet tablet : materializedIndex.getTablets()) { - long tabletRowCount = 0L; - for (Replica replica : tablet.getReplicas()) { - long replicaRowCount = replica.getRowCount(); - if (replicaRowCount > tabletRowCount) { - tabletRowCount = replicaRowCount; - } - } - tableRowCount += tabletRowCount; + Partition partition = table.getPartition(info.getPartitionId()); + if (partition == null) { + LOG.warn("the partition[{}] is missing", info.getIndexId()); + continue; + } + MaterializedIndex index = partition.getIndex(info.getIndexId()); + if (index == null) { + LOG.warn("the index[{}] is missing", info.getIndexId()); + continue; + } + Tablet tablet = index.getTablet(info.getTabletId()); + if (tablet == null) { + LOG.warn("the tablet[{}] is missing", info.getTabletId()); + continue; + } + + Replica replica = tablet.getReplicaById(info.getReplicaId()); + if (replica == null) { + LOG.warn("the replica[{}] is missing", info.getReplicaId()); + continue; + } + replica.updateInfo(info.getVersion(), info.getVersionHash(), + info.getDataSize(), info.getRowCount()); + } + } + + long jobId = job.getId(); + Map idToTableLoadInfo = job.getIdToTableLoadInfo(); + if (idToTableLoadInfo != null) { + for (Entry tableEntry : idToTableLoadInfo.entrySet()) { + long tableId = tableEntry.getKey(); + OlapTable table = (OlapTable) db.getTable(tableId); + TableLoadInfo tableLoadInfo = tableEntry.getValue(); + for (Entry entry : tableLoadInfo.getIdToPartitionLoadInfo().entrySet()) { + long partitionId = entry.getKey(); + Partition partition = table.getPartition(partitionId); + PartitionLoadInfo partitionLoadInfo = entry.getValue(); + if (!partitionLoadInfo.isNeedLoad()) { + continue; } - materializedIndex.setRowCount(tableRowCount); - } // end for indices - } // end for partitions - } // end for tables + updatePartitionVersion(partition, partitionLoadInfo.getVersion(), + partitionLoadInfo.getVersionHash(), jobId); + + // update table row count + for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { + long tableRowCount = 0L; + for (Tablet tablet : materializedIndex.getTablets()) { + long tabletRowCount = 0L; + for (Replica replica : tablet.getReplicas()) { + long replicaRowCount = replica.getRowCount(); + if (replicaRowCount > tabletRowCount) { + tabletRowCount = replicaRowCount; + } + } + tableRowCount += tabletRowCount; + } + materializedIndex.setRowCount(tableRowCount); + } // end for indices + } // end for partitions + } // end for tables + } + + idToLoadingLoadJob.remove(jobId); + idToQuorumFinishedLoadJob.put(jobId, job); } - - idToLoadingLoadJob.remove(jobId); - idToQuorumFinishedLoadJob.put(jobId, job); - replaceLoadJob(job); } @@ -1745,44 +1830,54 @@ public class Load { } public void unprotectFinishLoadJob(LoadJob job, Database db) { + // in real time load, replica info is not set by job, it is set by transaction manager long jobId = job.getId(); - idToQuorumFinishedLoadJob.remove(jobId); - - // Update 
database information - Map replicaInfos = job.getReplicaPersistInfos(); - if (replicaInfos != null) { - for (ReplicaPersistInfo info : replicaInfos.values()) { - OlapTable table = (OlapTable) db.getTable(info.getTableId()); - if (table == null) { - LOG.warn("the table[{}] is missing", info.getIndexId()); - continue; + if (job.getTransactionId() < 0) { + idToQuorumFinishedLoadJob.remove(jobId); + + // Update database information + Map replicaInfos = job.getReplicaPersistInfos(); + if (replicaInfos != null) { + for (ReplicaPersistInfo info : replicaInfos.values()) { + OlapTable table = (OlapTable) db.getTable(info.getTableId()); + if (table == null) { + LOG.warn("the table[{}] is missing", info.getIndexId()); + continue; + } + Partition partition = table.getPartition(info.getPartitionId()); + if (partition == null) { + LOG.warn("the partition[{}] is missing", info.getIndexId()); + continue; + } + MaterializedIndex index = partition.getIndex(info.getIndexId()); + if (index == null) { + LOG.warn("the index[{}] is missing", info.getIndexId()); + continue; + } + Tablet tablet = index.getTablet(info.getTabletId()); + if (tablet == null) { + LOG.warn("the tablet[{}] is missing", info.getTabletId()); + continue; + } + + Replica replica = tablet.getReplicaById(info.getReplicaId()); + if (replica == null) { + LOG.warn("the replica[{}] is missing", info.getReplicaId()); + continue; + } + replica.updateInfo(info.getVersion(), info.getVersionHash(), + info.getDataSize(), info.getRowCount()); } - Partition partition = table.getPartition(info.getPartitionId()); - if (partition == null) { - LOG.warn("the partition[{}] is missing", info.getIndexId()); - continue; - } - MaterializedIndex index = partition.getIndex(info.getIndexId()); - if (index == null) { - LOG.warn("the index[{}] is missing", info.getIndexId()); - continue; - } - Tablet tablet = index.getTablet(info.getTabletId()); - if (tablet == null) { - LOG.warn("the tablet[{}] is missing", info.getTabletId()); - continue; - } - - Replica replica = tablet.getReplicaById(info.getReplicaId()); - if (replica == null) { - LOG.warn("the replica[{}] is missing", info.getReplicaId()); - continue; - } - replica.updateInfo(info.getVersion(), info.getVersionHash(), - info.getDataSize(), info.getRowCount()); } + } else { + // in realtime load, does not exist a quorum finish stage, so that should remove job from pending queue and + // loading queue at finish stage + idToPendingLoadJob.remove(jobId); + // for delete load job, it also in id to loading job + idToLoadingLoadJob.remove(jobId); + job.setProgress(100); + job.setLoadFinishTimeMs(System.currentTimeMillis()); } - replaceLoadJob(job); } @@ -1826,49 +1921,78 @@ public class Load { } idToLoadJob.put(jobId, job); - // Replace LoadJob in dbToLoadJobs - List jobs = dbToLoadJobs.get(job.getDbId()); - if (jobs == null) { - LOG.warn("Does not find db in dbToLoadJobs. DbId : {}", job.getDbId()); - return; - } - int pos = 0; - for (LoadJob oneJob : jobs) { - if (oneJob.getId() == jobId) { - break; + if (!job.isSyncDeleteJob()) { + // Replace LoadJob in dbToLoadJobs + List jobs = dbToLoadJobs.get(job.getDbId()); + if (jobs == null) { + LOG.warn("Does not find db in dbToLoadJobs. DbId : {}", + job.getDbId()); + return; } - pos++; - } - if (pos == jobs.size()) { - LOG.warn("Does not find load job for db. 
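unprotectQuorumLoadJob() and unprotectFinishLoadJob() now follow the same split, shown condensed below: a job without a transaction (getTransactionId() < 0) still applies replica and partition-version updates itself, while a transactional real-time job leaves that to the publish step of the GlobalTransactionMgr. applyLegacyVersionUpdates is an illustrative stand-in for the retained legacy code:

    if (job.getTransactionId() < 0) {
        applyLegacyVersionUpdates(job, db);   // update replicas, partition versions, row counts
    } else {
        // real-time load: versions were written when the transaction was published,
        // so only the job queues and progress/finish-time fields are touched here
    }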
DbId : {}, jobId : {}", job.getDbId(), jobId); - return; - } - jobs.remove(pos); - jobs.add(pos, job); - - // Replace LoadJob in dbLabelToLoadJobs - if (dbLabelToLoadJobs.get(job.getDbId()) == null) { - LOG.warn("Does not find db in dbLabelToLoadJobs. DbId : {}", job.getDbId()); - return; - } - jobs = dbLabelToLoadJobs.get(job.getDbId()).get(job.getLabel()); - if (jobs == null) { - LOG.warn("Does not find label for db. label : {}, DbId : {}", job.getLabel(), job.getDbId()); - return; - } - pos = 0; - for (LoadJob oneJob : jobs) { - if (oneJob.getId() == jobId) { - break; + int pos = 0; + for (LoadJob oneJob : jobs) { + if (oneJob.getId() == jobId) { + break; + } + pos++; } - pos++; + if (pos == jobs.size()) { + LOG.warn("Does not find load job for db. DbId : {}, jobId : {}", + job.getDbId(), jobId); + return; + } + jobs.remove(pos); + jobs.add(pos, job); + + // Replace LoadJob in dbLabelToLoadJobs + if (dbLabelToLoadJobs.get(job.getDbId()) == null) { + LOG.warn("Does not find db in dbLabelToLoadJobs. DbId : {}", + job.getDbId()); + return; + } + jobs = dbLabelToLoadJobs.get(job.getDbId()).get(job.getLabel()); + if (jobs == null) { + LOG.warn("Does not find label for db. label : {}, DbId : {}", + job.getLabel(), job.getDbId()); + return; + } + pos = 0; + for (LoadJob oneJob : jobs) { + if (oneJob.getId() == jobId) { + break; + } + pos++; + } + if (pos == jobs.size()) { + LOG.warn("Does not find load job for label. label : {}, DbId : {}", + job.getLabel(), job.getDbId()); + return; + } + jobs.remove(pos); + jobs.add(pos, job); + } else { + // Replace LoadJob in dbToLoadJobs + List jobs = dbToDeleteJobs.get(job.getDbId()); + if (jobs == null) { + LOG.warn("Does not find db in dbToDeleteJobs. DbId : {}", + job.getDbId()); + return; + } + int pos = 0; + for (LoadJob oneJob : jobs) { + if (oneJob.getId() == jobId) { + break; + } + pos++; + } + if (pos == jobs.size()) { + LOG.warn("Does not find delete load job for db. DbId : {}, jobId : {}", + job.getDbId(), jobId); + return; + } + jobs.remove(pos); + jobs.add(pos, job); } - if (pos == jobs.size()) { - LOG.warn("Does not find load job for label. label : {}, DbId : {}", job.getLabel(), job.getDbId()); - return; - } - jobs.remove(pos); - jobs.add(pos, job); } // remove all db jobs from dbToLoadJobs and dbLabelToLoadJobs @@ -1890,6 +2014,9 @@ public class Load { if (dbLabelToLoadJobs.containsKey(dbId)) { dbLabelToLoadJobs.remove(dbId); } + if (dbToDeleteJobs.containsKey(dbId)) { + dbToDeleteJobs.remove(dbId); + } } finally { writeUnlock(); } @@ -1910,7 +2037,6 @@ public class Load { && (job.getState() == JobState.FINISHED || job.getState() == JobState.CANCELLED)) { long dbId = job.getDbId(); String label = job.getLabel(); - // Remove job from idToLoadJob iter.remove(); @@ -1923,6 +2049,15 @@ public class Load { } } + // remove delete job from dbToDeleteJobs + List deleteJobs = dbToDeleteJobs.get(dbId); + if (deleteJobs != null) { + deleteJobs.remove(job); + if (deleteJobs.size() == 0) { + dbToDeleteJobs.remove(dbId); + } + } + // Remove job from dbLabelToLoadJobs Map> mapLabelToJobs = dbLabelToLoadJobs.get(dbId); if (mapLabelToJobs != null) { @@ -2000,6 +2135,8 @@ public class Load { break; case BROKER: break; + case DELETE: + break; default: LOG.warn("unknown etl job type. 
type: {}, job id: {}", etlJobType.name(), job.getId()); break; @@ -2024,6 +2161,8 @@ public class Load { LOG.warn(errMsg); writeLock(); try { + // sometimes db is dropped and then cancel the job, the job must have transactionid + // transaction state should only be dropped when job is dropped processCancelled(job, cancelType, errMsg); } finally { writeUnlock(); @@ -2073,6 +2212,22 @@ public class Load { } break; case FINISHED: + if (job.getTransactionId() > 0) { + idToPendingLoadJob.remove(jobId); + idToLoadingLoadJob.remove(jobId); + job.setProgress(100); + job.setLoadFinishTimeMs(System.currentTimeMillis()); + // if this is a sync delete job, then update affected version and version hash + if (job.isSyncDeleteJob()) { + TransactionState transactionState = Catalog.getCurrentGlobalTransactionMgr() + .getTransactionState(job.getTransactionId()); + DeleteInfo deleteInfo = job.getDeleteInfo(); + TableCommitInfo tableCommitInfo = transactionState.getTableCommitInfo(deleteInfo.getTableId()); + PartitionCommitInfo partitionCommitInfo = tableCommitInfo.getPartitionCommitInfo(deleteInfo.getPartitionId()); + deleteInfo.updatePartitionVersionInfo(partitionCommitInfo.getVersion(), + partitionCommitInfo.getVersionHash()); + } + } MetricRepo.COUNTER_LOAD_FINISHED.increase(1L); idToQuorumFinishedLoadJob.remove(jobId); job.setState(destState); @@ -2081,10 +2236,13 @@ public class Load { for (PushTask pushTask : job.getPushTasks()) { AgentTaskQueue.removePushTask(pushTask.getBackendId(), pushTask.getSignature(), pushTask.getVersion(), pushTask.getVersionHash(), - pushTask.getPushType()); + pushTask.getPushType(), pushTask.getTaskType()); } // Clear the Map and Set in this job, reduce the memory cost for finished load job. - job.clearRedundantInfoForHistoryJob(); + // for delete job, keep the map and set because some of them is used in show proc method + if (!job.isSyncDeleteJob()) { + job.clearRedundantInfoForHistoryJob(); + } // Write edit log Catalog.getInstance().getEditLog().logLoadDone(job); break; @@ -2188,8 +2346,7 @@ public class Load { private void updatePartitionVersion(Partition partition, long version, long versionHash, long jobId) { long partitionId = partition.getId(); - partition.setCommittedVersion(version); - partition.setCommittedVersionHash(versionHash); + partition.updateCommitVersionAndVersionHash(version, versionHash); LOG.info("update partition version success. 
version: {}, version hash: {}, job id: {}, partition id: {}", version, versionHash, jobId, partitionId); } @@ -2198,6 +2355,16 @@ public class Load { long jobId = job.getId(); JobState srcState = job.getState(); CancelType tmpCancelType = CancelType.UNKNOWN; + // should abort in transaction manager first because it maybe abort job successfully and abort in transaction manager failed + // then there will be rubbish transactions in transaction manager + try { + Catalog.getCurrentGlobalTransactionMgr().abortTransaction( + job.getTransactionId(), + job.getFailMsg().toString()); + } catch (Exception e) { + LOG.info("errors while abort transaction", e); + return false; + } switch (srcState) { case PENDING: idToPendingLoadJob.remove(jobId); @@ -2210,7 +2377,6 @@ public class Load { case LOADING: // remove partition from loading set removeLoadingPartitions(job); - idToLoadingLoadJob.remove(jobId); tmpCancelType = CancelType.LOAD_RUN_FAIL; break; @@ -2222,7 +2388,7 @@ public class Load { Preconditions.checkState(false, "wrong job state: " + srcState.name()); break; } - + // set failMsg and state CancelType newCancelType = cancelType; if (newCancelType == CancelType.UNKNOWN) { @@ -2238,13 +2404,12 @@ public class Load { for (PushTask pushTask : job.getPushTasks()) { AgentTaskQueue.removePushTask(pushTask.getBackendId(), pushTask.getSignature(), pushTask.getVersion(), pushTask.getVersionHash(), - pushTask.getPushType()); + pushTask.getPushType(), pushTask.getTaskType()); } } // Clear the Map and Set in this job, reduce the memory cost of canceled load job. job.clearRedundantInfoForHistoryJob(); - // Write edit log Catalog.getInstance().getEditLog().logLoadCancel(job); return true; @@ -2267,6 +2432,11 @@ public class Load { } private void recoverLoadingPartitions(LoadJob job) { + // loading partition ids is used to avoid concurrent loading to a single partition + // but in realtime load, concurrent loading is allowed, so it is useless + if (job.getTransactionId() > 0) { + return; + } for (TableLoadInfo tableLoadInfo : job.getIdToTableLoadInfo().values()) { Map idToPartitionLoadInfo = tableLoadInfo.getIdToPartitionLoadInfo(); for (Entry entry : idToPartitionLoadInfo.entrySet()) { @@ -2366,7 +2536,13 @@ public class Load { // add to deleteInfos if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_11) { - unprotectAddDeleteInfo(deleteInfo); + long dbId = deleteInfo.getDbId(); + List deleteInfos = dbToDeleteInfos.get(dbId); + if (deleteInfos == null) { + deleteInfos = Lists.newArrayList(); + dbToDeleteInfos.put(dbId, deleteInfos); + } + deleteInfos.add(deleteInfo); } if (deleteInfo.getAsyncDeleteJob() != null) { @@ -2441,7 +2617,120 @@ public class Load { db.writeUnlock(); } } + private void checkDeleteV2(OlapTable table, Partition partition, List conditions, List deleteConditions, boolean preCheck) + throws DdlException { + // check partition state + PartitionState state = partition.getState(); + if (state != PartitionState.NORMAL) { + // ErrorReport.reportDdlException(ErrorCode.ERR_BAD_PARTITION_STATE, partition.getName(), state.name()); + throw new DdlException("Partition[" + partition.getName() + "]' state is not NORNAL: " + state.name()); + } + // do not need check whether partition has loading job + + // async delete job does not exist any more + + // check condition column is key column and condition value + Map nameToColumn = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); + for (Column column : table.getBaseSchema()) { + nameToColumn.put(column.getName(), column); + } + for 
(Predicate condition : conditions) { + SlotRef slotRef = null; + if (condition instanceof BinaryPredicate) { + BinaryPredicate binaryPredicate = (BinaryPredicate) condition; + slotRef = (SlotRef) binaryPredicate.getChild(0); + } else if (condition instanceof IsNullPredicate) { + IsNullPredicate isNullPredicate = (IsNullPredicate) condition; + slotRef = (SlotRef) isNullPredicate.getChild(0); + } + String columnName = slotRef.getColumnName(); + if (!nameToColumn.containsKey(columnName)) { + ErrorReport.reportDdlException(ErrorCode.ERR_BAD_FIELD_ERROR, columnName, table.getName()); + } + + Column column = nameToColumn.get(columnName); + if (!column.isKey()) { + // ErrorReport.reportDdlException(ErrorCode.ERR_NOT_KEY_COLUMN, columnName); + throw new DdlException("Column[" + columnName + "] is not key column"); + } + + if (condition instanceof BinaryPredicate) { + String value = null; + try { + BinaryPredicate binaryPredicate = (BinaryPredicate) condition; + value = ((LiteralExpr) binaryPredicate.getChild(1)).getStringValue(); + LiteralExpr.create(value, Type.fromPrimitiveType(column.getDataType())); + } catch (AnalysisException e) { + // ErrorReport.reportDdlException(ErrorCode.ERR_INVALID_VALUE, value); + throw new DdlException("Invalid column value[" + value + "]"); + } + } + + // set schema column name + slotRef.setCol(column.getName()); + } + Map> indexIdToSchema = table.getIndexIdToSchema(); + for (MaterializedIndex index : partition.getMaterializedIndices()) { + // check table has condition column + Map indexNameToColumn = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); + for (Column column : indexIdToSchema.get(index.getId())) { + indexNameToColumn.put(column.getName(), column); + } + String indexName = table.getIndexNameById(index.getId()); + for (Predicate condition : conditions) { + String columnName = null; + if (condition instanceof BinaryPredicate) { + BinaryPredicate binaryPredicate = (BinaryPredicate) condition; + columnName = ((SlotRef) binaryPredicate.getChild(0)).getColumnName(); + } else if (condition instanceof IsNullPredicate) { + IsNullPredicate isNullPredicate = (IsNullPredicate) condition; + columnName = ((SlotRef) isNullPredicate.getChild(0)).getColumnName(); + } + Column column = indexNameToColumn.get(columnName); + if (column == null) { + ErrorReport.reportDdlException(ErrorCode.ERR_BAD_FIELD_ERROR, columnName, indexName); + } + + if (table.getKeysType() == KeysType.DUP_KEYS && !column.isKey()) { + throw new DdlException("Column[" + columnName + "] is not key column in index[" + indexName + "]"); + } + } + + // do not need to check replica version and backend alive + + } // end for indices + + if (deleteConditions == null) { + return; + } + + // save delete conditions + for (Predicate condition : conditions) { + if (condition instanceof BinaryPredicate) { + BinaryPredicate binaryPredicate = (BinaryPredicate) condition; + SlotRef slotRef = (SlotRef) binaryPredicate.getChild(0); + String columnName = slotRef.getColumnName(); + StringBuilder sb = new StringBuilder(); + sb.append(columnName).append(" ").append(binaryPredicate.getOp().name()).append(" \"") + .append(((LiteralExpr) binaryPredicate.getChild(1)).getStringValue()).append("\""); + deleteConditions.add(sb.toString()); + } else if (condition instanceof IsNullPredicate) { + IsNullPredicate isNullPredicate = (IsNullPredicate) condition; + SlotRef slotRef = (SlotRef) isNullPredicate.getChild(0); + String columnName = slotRef.getColumnName(); + StringBuilder sb = new StringBuilder(); + sb.append(columnName); + if 
(isNullPredicate.isNotNull()) { + sb.append(" IS NOT NULL"); + } else { + sb.append(" IS NULL"); + } + deleteConditions.add(sb.toString()); + } + } + } + private void checkDelete(OlapTable table, Partition partition, List conditions, long checkVersion, long checkVersionHash, List deleteConditions, Map> asyncTabletIdToBackends, boolean preCheck) @@ -2685,10 +2974,20 @@ public class Load { try { for (AsyncDeleteJob job : idToQuorumFinishedDeleteJob.values()) { if (job.getPartitionId() == partitionId) { - throw new DdlException("Partition[" + partitionName + "] has running delete job. " + throw new DdlException("Partition[" + partitionName + "] has running async delete job. " + "See 'SHOW DELETE'"); } } + for (long dbId : dbToDeleteJobs.keySet()) { + List loadJobs = dbToDeleteJobs.get(dbId); + for (LoadJob loadJob : loadJobs) { + if (loadJob.getDeleteInfo().getPartitionId() == partitionId + && loadJob.getState() == JobState.LOADING) { + throw new DdlException("Partition[" + partitionName + "] has running async delete job. " + + "See 'SHOW DELETE'"); + } + } + } } finally { readUnlock(); } @@ -2708,6 +3007,117 @@ public class Load { if (db == null) { throw new DdlException("Db does not exist. name: " + dbName); } + + long tableId = -1; + long partitionId = -1; + LoadJob loadDeleteJob = null; + db.readLock(); + try { + Table table = db.getTable(tableName); + if (table == null) { + throw new DdlException("Table does not exist. name: " + tableName); + } + + if (table.getType() != TableType.OLAP) { + throw new DdlException("Not olap type table. type: " + table.getType().name()); + } + OlapTable olapTable = (OlapTable) table; + + if (olapTable.getState() != OlapTableState.NORMAL) { + throw new DdlException("Table's state is not normal: " + tableName); + } + + tableId = olapTable.getId(); + Partition partition = olapTable.getPartition(partitionName); + if (partition == null) { + throw new DdlException("Partition does not exist. 
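checkDeleteV2() only verifies that every condition column is a key column of the target table (and of each materialized index for DUP_KEYS tables) and that literal values parse as the column's type, then serializes the accepted predicates into plain strings. For a hypothetical statement DELETE FROM tbl PARTITION p1 WHERE k1 = 3 AND k2 IS NOT NULL, deleteConditions would end up holding entries of the form below (the operator token is the BinaryPredicate.Operator enum name, e.g. EQ for =):

    List<String> deleteConditions = Lists.newArrayList();
    checkDeleteV2(olapTable, partition, conditions, deleteConditions, true);
    // deleteConditions now contains:
    //   k1 EQ "3"
    //   k2 IS NOT NULL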
name: " + partitionName); + } + partitionId = partition.getId(); + + List deleteConditions = Lists.newArrayList(); + // pre check + checkDeleteV2(olapTable, partition, conditions, + deleteConditions, true); + checkAndAddRunningSyncDeleteJob(partitionId, partitionName); + // do not use transaction id generator, or the id maybe duplicated + long jobId = Catalog.getInstance().getNextId(); + String jobLabel = "delete_" + UUID.randomUUID(); + // the version info in delete info will be updated after job finished + DeleteInfo deleteInfo = new DeleteInfo(db.getId(), tableId, tableName, + partition.getId(), partitionName, + -1, 0, deleteConditions); + loadDeleteJob = new LoadJob(jobId, db.getId(), tableId, + partitionId, jobLabel, olapTable.getIndexIdToSchemaHash(), conditions, deleteInfo); + Map idToTabletLoadInfo = Maps.newHashMap(); + for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { + for (Tablet tablet : materializedIndex.getTablets()) { + long tabletId = tablet.getId(); + // tabletLoadInfo is empty, because delete load does not need filepath filesize info + TabletLoadInfo tabletLoadInfo = new TabletLoadInfo("", -1); + idToTabletLoadInfo.put(tabletId, tabletLoadInfo); + } + } + loadDeleteJob.setIdToTabletLoadInfo(idToTabletLoadInfo); + loadDeleteJob.setState(JobState.LOADING); + long transactionId = Catalog.getCurrentGlobalTransactionMgr().beginTransaction(db.getId(), jobLabel, + "fe", LoadJobSourceType.FRONTEND); + loadDeleteJob.setTransactionId(transactionId); + // the delete job will be persist in editLog + addLoadJob(loadDeleteJob, db); + } catch (Throwable t) { + LOG.debug("error occurred during prepare delete", t); + throw new DdlException(t.getMessage(), t); + } finally { + db.readUnlock(); + } + try { + // TODO wait loadDeleteJob to finished, using while true? or condition wait + long startDeleteTime = System.currentTimeMillis(); + long timeout = loadDeleteJob.getDeleteJobTimeout(); + while (true) { + db.writeLock(); + try { + if (loadDeleteJob.getState() == JobState.FINISHED + || loadDeleteJob.getState() == JobState.CANCELLED) { + break; + } + if (System.currentTimeMillis() - startDeleteTime > timeout) { + TransactionState transactionState = Catalog.getCurrentGlobalTransactionMgr().getTransactionState(loadDeleteJob.getTransactionId()); + if (transactionState.getTransactionStatus() == TransactionStatus.PREPARE) { + boolean isSuccess = cancelLoadJob(loadDeleteJob, CancelType.TIMEOUT, "load delete job timeout"); + if (isSuccess) { + throw new DdlException("timeout when waiting delete"); + } + } + } + } finally { + db.writeUnlock(); + } + Thread.sleep(1000); + } + } catch (Exception e) { + String failMsg = "delete unknown, " + e.getMessage(); + LOG.warn(failMsg, e); + throw new DdlException(failMsg); + } finally { + writeLock(); + try { + partitionUnderDelete.remove(partitionId); + } finally { + writeUnlock(); + } + } + } + + public void deleteOld(DeleteStmt stmt) throws DdlException { + String dbName = stmt.getDbName(); + String tableName = stmt.getTableName(); + String partitionName = stmt.getPartitionName(); + List conditions = stmt.getDeleteConditions(); + Database db = Catalog.getInstance().getDb(dbName); + if (db == null) { + throw new DdlException("Db does not exist. 
name: " + dbName); + } DeleteInfo deleteInfo = null; @@ -2750,7 +3160,7 @@ public class Load { null, asyncTabletIdToBackends, true); newVersion = committedVersion + 1; - newVersionHash = Math.abs(new Random().nextLong()); + newVersionHash = Util.generateVersionHash(); deleteInfo = new DeleteInfo(db.getId(), tableId, tableName, partition.getId(), partitionName, newVersion, newVersionHash, null); @@ -2856,7 +3266,11 @@ public class Load { replica.getVersion(), replica.getVersionHash(), replica.getDataSize(), - replica.getRowCount()); + replica.getRowCount(), + replica.getLastFailedVersion(), + replica.getLastFailedVersionHash(), + replica.getLastSuccessVersion(), + replica.getLastSuccessVersionHash()); deleteInfo.addReplicaPersistInfo(info); } } @@ -2922,7 +3336,7 @@ public class Load { PushTask pushTask = (PushTask) task; AgentTaskQueue.removePushTask(pushTask.getBackendId(), pushTask.getSignature(), pushTask.getVersion(), pushTask.getVersionHash(), - pushTask.getPushType()); + pushTask.getPushType(), pushTask.getTaskType()); } writeLock(); @@ -2938,12 +3352,21 @@ public class Load { LinkedList> infos = new LinkedList>(); readLock(); try { - AsyncDeleteJob job = idToQuorumFinishedDeleteJob.get(jobId); + LoadJob job = null; + for (long dbId : dbToDeleteJobs.keySet()) { + List loadJobs = dbToDeleteJobs.get(dbId); + for (LoadJob loadJob : loadJobs) { + if (loadJob.getId() == jobId) { + job = loadJob; + break; + } + } + } if (job == null) { return infos; } - for (Long tabletId : job.getTabletIds()) { + for (Long tabletId : job.getIdToTabletLoadInfo().keySet()) { List info = Lists.newArrayList(); info.add(tabletId); infos.add(info); @@ -2954,14 +3377,36 @@ public class Load { return infos; } + + public int getDeleteJobNumByState(long dbId, JobState state) { + readLock(); + try { + List deleteJobs = dbToDeleteJobs.get(dbId); + if (deleteJobs == null) { + return 0; + } else { + int deleteJobNum = 0; + for (LoadJob job : deleteJobs) { + if (job.getState() == state) { + ++ deleteJobNum; + } + } + return deleteJobNum; + } + } finally { + readUnlock(); + } + } public int getDeleteInfoNum(long dbId) { readLock(); try { - if (dbToDeleteInfos.containsKey(dbId)) { - return dbToDeleteInfos.get(dbId).size(); + List deleteJobs = dbToDeleteJobs.get(dbId); + if (deleteJobs == null) { + return 0; + } else { + return deleteJobs.size(); } - return 0; } finally { readUnlock(); } @@ -2969,29 +3414,35 @@ public class Load { public List> getDeleteInfosByDb(long dbId, boolean forUser) { LinkedList> infos = new LinkedList>(); - Database db = Catalog.getInstance().getDb(dbId); - if (db == null) { - return infos; - } + Database db = Catalog.getInstance().getDb(dbId); + if (db == null) { + return infos; + } - String dbName = db.getFullName(); + String dbName = db.getFullName(); readLock(); try { - List deleteInfos = dbToDeleteInfos.get(dbId); - if (deleteInfos == null) { + List deleteJobs = dbToDeleteJobs.get(dbId); + if (deleteJobs == null) { return infos; } - for (DeleteInfo deleteInfo : deleteInfos) { + for (LoadJob loadJob : deleteJobs) { + + DeleteInfo deleteInfo = loadJob.getDeleteInfo(); + if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, deleteInfo.getTableName(), PrivPredicate.LOAD)) { continue; } + List info = Lists.newArrayList(); if (!forUser) { - info.add(deleteInfo.getJobId()); + // do not get job id from delete info, because async delete job == null + // just get it from load job + info.add(loadJob.getId()); info.add(deleteInfo.getTableId()); } 
                    info.add(deleteInfo.getTableName());
@@ -3008,8 +3459,12 @@ public class Load {
                     info.add(deleteInfo.getPartitionVersion());
                     info.add(deleteInfo.getPartitionVersionHash());
                 }
-
-                info.add(deleteInfo.getState().name());
+                // for loading state, should not display loading, show deleting instead
+                if (loadJob.getState() == JobState.LOADING) {
+                    info.add("DELETING");
+                } else {
+                    info.add(loadJob.getState().name());
+                }
 
                 infos.add(info);
             }
@@ -3092,12 +3547,13 @@ public class Load {
         readLock();
         try {
             long maxTime = Long.MIN_VALUE;
-            List<DeleteInfo> deleteInfos = dbToDeleteInfos.get(dbId);
-            if (deleteInfos != null) {
-                for (DeleteInfo info : deleteInfos) {
-                    if (info.getCreateTimeMs() > maxTime) {
-                        maxTime = info.getCreateTimeMs();
-                        deleteInfo = info;
+            List<LoadJob> deleteJobs = dbToDeleteJobs.get(dbId);
+            if (deleteJobs != null) {
+                for (LoadJob loadJob : deleteJobs) {
+                    if (loadJob.getDeleteInfo().getCreateTimeMs() > maxTime
+                            && loadJob.getState() == JobState.FINISHED) {
+                        maxTime = loadJob.getDeleteInfo().getCreateTimeMs();
+                        deleteInfo = loadJob.getDeleteInfo();
                     }
                 }
             }
@@ -3149,3 +3605,5 @@ public class Load {
         return num;
     }
 }
+
+
diff --git a/fe/src/main/java/com/baidu/palo/load/Load.java.rej b/fe/src/main/java/com/baidu/palo/load/Load.java.rej
deleted file mode 100644
index f2e001574f..0000000000
--- a/fe/src/main/java/com/baidu/palo/load/Load.java.rej
+++ /dev/null
@@ -1,3155 +0,0 @@
--- fe/src/com/baidu/palo/load/Load.java
-+++ fe/src/com/baidu/palo/load/Load.java
-@@ -1,3152 +0,0 @@
--// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
--
--// Licensed under the Apache License, Version 2.0 (the "License");
--// you may not use this file except in compliance with the License.
--// You may obtain a copy of the License at
--//
--// http://www.apache.org/licenses/LICENSE-2.0
--//
--// Unless required by applicable law or agreed to in writing,
--// software distributed under the License is distributed on an
--// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
--// KIND, either express or implied. See the License for the
--// specific language governing permissions and limitations
--// under the License.
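The SHOW DELETE hunks above introduce two presentation rules: a delete whose backing load job is still in LOADING state is displayed as DELETING, and the latest delete info for a database is the newest FINISHED delete job (by create time) found in dbToDeleteJobs. Below is a self-contained sketch of both rules, using simplified stand-in types rather than Palo's LoadJob and DeleteInfo classes.

import java.util.*;

// Simplified stand-ins; not Palo's classes.
public class DeleteJobDisplayDemo {
    enum JobState { PENDING, LOADING, FINISHED, CANCELLED }

    static class DeleteJob {
        final long createTimeMs;
        final JobState state;
        DeleteJob(long createTimeMs, JobState state) {
            this.createTimeMs = createTimeMs;
            this.state = state;
        }
    }

    // Rule 1: a delete job still in LOADING state is shown as DELETING.
    static String displayState(JobState state) {
        return state == JobState.LOADING ? "DELETING" : state.name();
    }

    // Rule 2: the "latest" delete info is the newest FINISHED job by create time.
    static Optional<DeleteJob> latestFinished(List<DeleteJob> jobs) {
        return jobs.stream()
                .filter(j -> j.state == JobState.FINISHED)
                .max(Comparator.comparingLong(j -> j.createTimeMs));
    }

    public static void main(String[] args) {
        List<DeleteJob> jobs = Arrays.asList(
                new DeleteJob(100, JobState.FINISHED),
                new DeleteJob(200, JobState.LOADING),
                new DeleteJob(150, JobState.FINISHED));
        System.out.println(displayState(jobs.get(1).state));                     // DELETING
        latestFinished(jobs).ifPresent(j -> System.out.println(j.createTimeMs)); // 150
    }
}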
-- --package com.baidu.palo.load; -- --import com.baidu.palo.analysis.BinaryPredicate; --import com.baidu.palo.analysis.CancelLoadStmt; --import com.baidu.palo.analysis.ColumnSeparator; --import com.baidu.palo.analysis.DataDescription; --import com.baidu.palo.analysis.DeleteStmt; --import com.baidu.palo.analysis.IsNullPredicate; --import com.baidu.palo.analysis.LabelName; --import com.baidu.palo.analysis.LiteralExpr; --import com.baidu.palo.analysis.LoadStmt; --import com.baidu.palo.analysis.Predicate; --import com.baidu.palo.analysis.SlotRef; --import com.baidu.palo.catalog.Catalog; --import com.baidu.palo.catalog.Column; --import com.baidu.palo.catalog.Database; --import com.baidu.palo.catalog.KeysType; --import com.baidu.palo.catalog.MaterializedIndex; --import com.baidu.palo.catalog.OlapTable; --import com.baidu.palo.catalog.OlapTable.OlapTableState; --import com.baidu.palo.catalog.Partition; --import com.baidu.palo.catalog.Partition.PartitionState; --import com.baidu.palo.catalog.PrimitiveType; --import com.baidu.palo.catalog.Replica; --import com.baidu.palo.catalog.Table; --import com.baidu.palo.catalog.Table.TableType; --import com.baidu.palo.catalog.Tablet; --import com.baidu.palo.catalog.TabletInvertedIndex; --import com.baidu.palo.catalog.TabletMeta; --import com.baidu.palo.catalog.Type; --import com.baidu.palo.cluster.ClusterNamespace; --import com.baidu.palo.common.AnalysisException; --import com.baidu.palo.common.Config; --import com.baidu.palo.common.DdlException; --import com.baidu.palo.common.ErrorCode; --import com.baidu.palo.common.ErrorReport; --import com.baidu.palo.common.FeMetaVersion; --import com.baidu.palo.common.FeNameFormat; --import com.baidu.palo.common.LoadException; --import com.baidu.palo.common.MarkedCountDownLatch; --import com.baidu.palo.common.Pair; --import com.baidu.palo.common.util.ListComparator; --import com.baidu.palo.common.util.OrderByPair; --import com.baidu.palo.common.util.TimeUtils; --import com.baidu.palo.load.AsyncDeleteJob.DeleteState; --import com.baidu.palo.load.FailMsg.CancelType; --import com.baidu.palo.load.LoadJob.EtlJobType; --import com.baidu.palo.load.LoadJob.JobState; --import com.baidu.palo.metric.MetricRepo; --import com.baidu.palo.mysql.privilege.PrivPredicate; --import com.baidu.palo.persist.ReplicaPersistInfo; --import com.baidu.palo.qe.ConnectContext; --import com.baidu.palo.system.Backend; --import com.baidu.palo.task.AgentBatchTask; --import com.baidu.palo.task.AgentClient; --import com.baidu.palo.task.AgentTask; --import com.baidu.palo.task.AgentTaskExecutor; --import com.baidu.palo.task.AgentTaskQueue; --import com.baidu.palo.task.CancelDeleteTask; --import com.baidu.palo.task.PushTask; --import com.baidu.palo.thrift.TEtlState; --import com.baidu.palo.thrift.TMiniLoadRequest; --import com.baidu.palo.thrift.TNetworkAddress; --import com.baidu.palo.thrift.TPriority; --import com.baidu.palo.thrift.TPushType; -- --import com.google.common.base.Joiner; --import com.google.common.base.Preconditions; --import com.google.common.base.Strings; --import com.google.common.collect.Lists; --import com.google.common.collect.Maps; --import com.google.common.collect.Sets; --import com.google.gson.Gson; -- --import org.apache.commons.lang.StringUtils; --import org.apache.logging.log4j.LogManager; --import org.apache.logging.log4j.Logger; -- --import java.util.ArrayList; --import java.util.Arrays; --import java.util.Collection; --import java.util.Collections; --import java.util.HashMap; --import java.util.Iterator; --import 
java.util.LinkedList; --import java.util.List; --import java.util.Map; --import java.util.Map.Entry; --import java.util.Random; --import java.util.Set; --import java.util.concurrent.TimeUnit; --import java.util.concurrent.locks.ReentrantReadWriteLock; -- --public class Load { -- private static final Logger LOG = LogManager.getLogger(Load.class); -- -- // valid state change map -- private static final Map> STATE_CHANGE_MAP = Maps.newHashMap(); -- -- // system dpp config -- public static DppConfig dppDefaultConfig = null; -- public static Map clusterToDppConfig = Maps.newHashMap(); -- -- // load job meta -- private Map idToLoadJob; // loadJobId to loadJob -- private Map> dbToLoadJobs; // db to loadJob list -- private Map>> dbLabelToLoadJobs; // db label to loadJob list -- private Map idToPendingLoadJob; // loadJobId to pending loadJob -- private Map idToEtlLoadJob; // loadJobId to etl loadJob -- private Map idToLoadingLoadJob; // loadJobId to loading loadJob -- private Map idToQuorumFinishedLoadJob; // loadJobId to quorum finished loadJob -- private Set loadingPartitionIds; // loading partition id set -- private Map> dbToMiniLabels; // db to mini uncommit label -- -- private Map> dbToDeleteInfos; // db to delete job list -- -- private Set partitionUnderDelete; // save partitions which are running delete jobs -- private Map idToQuorumFinishedDeleteJob; -- -- private volatile LoadErrorHub.Param loadErrorHubInfo = new LoadErrorHub.Param(); -- -- // lock for load job -- // lock is private and must use after db lock -- private ReentrantReadWriteLock lock; -- -- static { -- Set pendingDestStates = Sets.newHashSet(); -- pendingDestStates.add(JobState.ETL); -- pendingDestStates.add(JobState.CANCELLED); -- STATE_CHANGE_MAP.put(JobState.PENDING, pendingDestStates); -- -- Set etlDestStates = Sets.newHashSet(); -- etlDestStates.add(JobState.LOADING); -- etlDestStates.add(JobState.CANCELLED); -- STATE_CHANGE_MAP.put(JobState.ETL, etlDestStates); -- -- Set loadingDestStates = Sets.newHashSet(); -- loadingDestStates.add(JobState.QUORUM_FINISHED); -- loadingDestStates.add(JobState.CANCELLED); -- STATE_CHANGE_MAP.put(JobState.LOADING, loadingDestStates); -- -- Set quorumFinishedDestStates = Sets.newHashSet(); -- quorumFinishedDestStates.add(JobState.FINISHED); -- STATE_CHANGE_MAP.put(JobState.QUORUM_FINISHED, quorumFinishedDestStates); -- -- // system dpp config -- Gson gson = new Gson(); -- try { -- Map defaultConfig = -- (HashMap) gson.fromJson(Config.dpp_default_config_str, HashMap.class); -- dppDefaultConfig = DppConfig.create(defaultConfig); -- -- Map> clusterToConfig = -- (HashMap>) gson.fromJson(Config.dpp_config_str, HashMap.class); -- for (Entry> entry : clusterToConfig.entrySet()) { -- String cluster = entry.getKey(); -- DppConfig dppConfig = dppDefaultConfig.getCopiedDppConfig(); -- dppConfig.update(DppConfig.create(entry.getValue())); -- dppConfig.check(); -- -- clusterToDppConfig.put(cluster, dppConfig); -- } -- -- if (!clusterToDppConfig.containsKey(Config.dpp_default_cluster)) { -- throw new LoadException("Default cluster not exist"); -- } -- } catch (Throwable e) { -- LOG.error("dpp default config ill-formed", e); -- System.exit(-1); -- } -- } -- -- public Load() { -- idToLoadJob = Maps.newHashMap(); -- dbToLoadJobs = Maps.newHashMap(); -- dbLabelToLoadJobs = Maps.newHashMap(); -- idToPendingLoadJob = Maps.newLinkedHashMap(); -- idToEtlLoadJob = Maps.newLinkedHashMap(); -- idToLoadingLoadJob = Maps.newLinkedHashMap(); -- idToQuorumFinishedLoadJob = Maps.newLinkedHashMap(); -- 
loadingPartitionIds = Sets.newHashSet(); -- dbToMiniLabels = Maps.newHashMap(); -- dbToDeleteInfos = Maps.newHashMap(); -- partitionUnderDelete = Sets.newHashSet(); -- idToQuorumFinishedDeleteJob = Maps.newLinkedHashMap(); -- lock = new ReentrantReadWriteLock(true); -- } -- -- public void readLock() { -- lock.readLock().lock(); -- } -- -- public void readUnlock() { -- lock.readLock().unlock(); -- } -- -- private void writeLock() { -- lock.writeLock().lock(); -- } -- -- private void writeUnlock() { -- lock.writeLock().unlock(); -- } -- -- public boolean addLoadJob(TMiniLoadRequest request) throws DdlException { -- // get params -- String fullDbName = request.getDb(); -- String tableName = request.getTbl(); -- String label = request.getLabel(); -- long timestamp = 0; -- if (request.isSetTimestamp()) { -- timestamp = request.getTimestamp(); -- } -- TNetworkAddress beAddr = request.getBackend(); -- String filePathsValue = request.getFiles().get(0); -- Map params = request.getProperties(); -- -- // create load stmt -- // label name -- LabelName labelName = new LabelName(fullDbName, label); -- -- // data descriptions -- // file paths -- if (Strings.isNullOrEmpty(filePathsValue)) { -- throw new DdlException("File paths are not specified"); -- } -- List filePaths = Arrays.asList(filePathsValue.split(",")); -- -- // partitions | column names | separator | line delimiter -- List partitionNames = null; -- List columnNames = null; -- ColumnSeparator columnSeparator = null; -- List hllColumnPairList = null; -- String lineDelimiter = null; -- if (params != null) { -- String specifiedPartitions = params.get(LoadStmt.KEY_IN_PARAM_PARTITIONS); -- if (!Strings.isNullOrEmpty(specifiedPartitions)) { -- partitionNames = Arrays.asList(specifiedPartitions.split(",")); -- } -- String specifiedColumns = params.get(LoadStmt.KEY_IN_PARAM_COLUMNS); -- if (!Strings.isNullOrEmpty(specifiedColumns)) { -- columnNames = Arrays.asList(specifiedColumns.split(",")); -- } -- -- final String hll = params.get(LoadStmt.KEY_IN_PARAM_HLL); -- if (!Strings.isNullOrEmpty(hll)) { -- hllColumnPairList = Arrays.asList(hll.split(":")); -- } -- -- String columnSeparatorStr = params.get(LoadStmt.KEY_IN_PARAM_COLUMN_SEPARATOR); -- if (columnSeparatorStr != null) { -- columnSeparator = new ColumnSeparator(columnSeparatorStr); -- try { -- columnSeparator.analyze(); -- } catch (AnalysisException e) { -- throw new DdlException(e.getMessage()); -- } -- } -- lineDelimiter = params.get(LoadStmt.KEY_IN_PARAM_LINE_DELIMITER); -- } -- -- -- DataDescription dataDescription = new DataDescription(tableName, partitionNames, filePaths, columnNames, -- columnSeparator, false, null); -- dataDescription.setLineDelimiter(lineDelimiter); -- dataDescription.setBeAddr(beAddr); -- // parse hll param pair -- if (hllColumnPairList != null) { -- for (int i = 0; i < hllColumnPairList.size(); i++) { -- final String pairStr = hllColumnPairList.get(i); -- final List pairList = Arrays.asList(pairStr.split(",")); -- if (pairList.size() != 2) { -- throw new DdlException("hll param format error"); -- } -- -- final String resultColumn = pairList.get(0); -- final String hashColumn = pairList.get(1); -- final Pair> pair = new Pair>( -- DataDescription.FUNCTION_HASH_HLL, -- Arrays.asList(hashColumn)); -- dataDescription.addColumnMapping(resultColumn, pair); -- } -- } -- -- List dataDescriptions = Lists.newArrayList(dataDescription); -- -- // job properties -- Map properties = Maps.newHashMap(); -- if (params != null) { -- String maxFilterRatio = 
params.get(LoadStmt.MAX_FILTER_RATIO_PROPERTY); -- if (!Strings.isNullOrEmpty(maxFilterRatio)) { -- properties.put(LoadStmt.MAX_FILTER_RATIO_PROPERTY, maxFilterRatio); -- } -- String timeout = params.get(LoadStmt.TIMEOUT_PROPERTY); -- if (!Strings.isNullOrEmpty(timeout)) { -- properties.put(LoadStmt.TIMEOUT_PROPERTY, timeout); -- } -- } -- LoadStmt stmt = new LoadStmt(labelName, dataDescriptions, null, null, properties); -- -- // try to register mini label -- if (!registerMiniLabel(fullDbName, label, timestamp)) { -- return false; -- } -- -- try { -- addLoadJob(stmt, EtlJobType.MINI, timestamp); -- } finally { -- deregisterMiniLabel(fullDbName, label); -- } -- -- return true; -- } -- -- public void addLoadJob(LoadStmt stmt, EtlJobType etlJobType, long timestamp) throws DdlException { -- // get db -- String dbName = stmt.getLabel().getDbName(); -- Database db = Catalog.getInstance().getDb(dbName); -- if (db == null) { -- throw new DdlException("Database[" + dbName + "] does not exist"); -- } -- -- // create job -- LoadJob job = createLoadJob(stmt, etlJobType, db, timestamp); -- addLoadJob(job, db); -- } -- -- // for insert select from or create as stmt -- public void addLoadJob(String label, String dbName, -- long tableId, List fileList, long timestamp) throws DdlException { -- // get db and table -- Database db = Catalog.getInstance().getDb(dbName); -- if (db == null) { -- throw new DdlException("Database[" + dbName + "] does not exist"); -- } -- -- OlapTable table = null; -- db.readLock(); -- try { -- table = (OlapTable) db.getTable(tableId); -- } finally { -- db.readUnlock(); -- } -- if (table == null) { -- throw new DdlException("Table[" + tableId + "] does not exist"); -- } -- -- // create job -- DataDescription desc = new DataDescription(table.getName(), null, Lists.newArrayList(""), -- null, null, false, null); -- LoadStmt stmt = new LoadStmt(new LabelName(dbName, label), Lists.newArrayList(desc), null, null, null); -- LoadJob job = createLoadJob(stmt, EtlJobType.INSERT, db, timestamp); -- -- // add schema hash -- db.readLock(); -- try { -- for (Map.Entry entry : table.getIndexIdToSchemaHash().entrySet()) { -- job.getTableLoadInfo(tableId).addIndexSchemaHash(entry.getKey(), entry.getValue()); -- } -- } finally { -- db.readUnlock(); -- } -- -- // file size use -1 temporarily -- Map fileMap = Maps.newHashMap(); -- for (String filePath : fileList) { -- fileMap.put(filePath, -1L); -- } -- -- // update job info to etl finish -- EtlStatus status = job.getEtlJobStatus(); -- status.setState(TEtlState.FINISHED); -- status.setFileMap(fileMap); -- job.setState(JobState.ETL); -- -- // add load job -- addLoadJob(job, db); -- } -- -- // This is a final step of all addLoadJob() methods -- private void addLoadJob(LoadJob job, Database db) throws DdlException { -- // check cluster capacity -- Catalog.getCurrentSystemInfo().checkClusterCapacity(db.getClusterName()); -- // check db quota -- db.checkQuota(); -- -- // check if table is in restore process -- db.readLock(); -- try { -- for (Long tblId : job.getIdToTableLoadInfo().keySet()) { -- Table tbl = db.getTable(tblId); -- if (tbl != null && tbl.getType() == TableType.OLAP -- && ((OlapTable) tbl).getState() == OlapTableState.RESTORE) { -- throw new DdlException("Table " + tbl.getName() + " is in restore process. 
" -- + "Can not load into it"); -- } -- } -- } finally { -- db.readUnlock(); -- } -- -- writeLock(); -- try { -- unprotectAddLoadJob(job, false /* not replay */); -- MetricRepo.COUNTER_LOAD_ADD.increase(1L); -- Catalog.getInstance().getEditLog().logLoadStart(job); -- } finally { -- writeUnlock(); -- } -- LOG.info("add load job. job: {}", job); -- } -- -- private LoadJob createLoadJob(LoadStmt stmt, EtlJobType etlJobType, -- Database db, long timestamp) throws DdlException { -- // get params -- String label = stmt.getLabel().getLabelName(); -- List dataDescriptions = stmt.getDataDescriptions(); -- Map properties = stmt.getProperties(); -- -- // check params -- try { -- FeNameFormat.checkLabel(label); -- } catch (AnalysisException e) { -- throw new DdlException(e.getMessage()); -- } -- if (dataDescriptions == null || dataDescriptions.isEmpty()) { -- throw new DdlException("No data file in load statement."); -- } -- -- // create job -- LoadJob job = new LoadJob(label); -- job.setEtlJobType(etlJobType); -- job.setDbId(db.getId()); -- job.setTimestamp(timestamp); -- job.setBrokerDesc(stmt.getBrokerDesc()); -- -- // resource info -- if (ConnectContext.get() != null) { -- job.setResourceInfo(ConnectContext.get().toResourceCtx()); -- job.setExecMemLimit(ConnectContext.get().getSessionVariable().getMaxExecMemByte()); -- } -- -- // job properties -- if (properties != null) { -- if (properties.containsKey(LoadStmt.TIMEOUT_PROPERTY)) { -- try { -- job.setTimeoutSecond(Integer.parseInt(properties.get(LoadStmt.TIMEOUT_PROPERTY))); -- } catch (NumberFormatException e) { -- throw new DdlException("Timeout is not INT", e); -- } -- } -- -- if (properties.containsKey(LoadStmt.MAX_FILTER_RATIO_PROPERTY)) { -- try { -- job.setMaxFilterRatio(Double.parseDouble(properties.get(LoadStmt.MAX_FILTER_RATIO_PROPERTY))); -- } catch (NumberFormatException e) { -- throw new DdlException("Max filter ratio is not DOUBLE", e); -- } -- } -- -- if (properties.containsKey(LoadStmt.LOAD_DELETE_FLAG_PROPERTY)) { -- String flag = properties.get(LoadStmt.LOAD_DELETE_FLAG_PROPERTY); -- if (flag.equalsIgnoreCase("true") || flag.equalsIgnoreCase("false")) { -- job.setDeleteFlag(Boolean.parseBoolean(flag)); -- } else { -- throw new DdlException("Value of delete flag is invalid"); -- } -- } -- -- if (properties.containsKey(LoadStmt.EXEC_MEM_LIMIT)) { -- try { -- job.setExecMemLimit(Long.parseLong(properties.get(LoadStmt.EXEC_MEM_LIMIT))); -- } catch (NumberFormatException e) { -- throw new DdlException("Execute memory limit is not Long", e); -- } -- } -- } -- -- // job table load info -- Map idToTableLoadInfo = Maps.newHashMap(); -- // tableId partitionId sources -- Map>> tableToPartitionSources = Maps.newHashMap(); -- for (DataDescription dataDescription : dataDescriptions) { -- // create source -- createSource(db, dataDescription, tableToPartitionSources, job.getDeleteFlag()); -- job.addTableName(dataDescription.getTableName()); -- } -- for (Entry>> tableEntry : tableToPartitionSources.entrySet()) { -- long tableId = tableEntry.getKey(); -- Map> partitionToSources = tableEntry.getValue(); -- -- Map idToPartitionLoadInfo = Maps.newHashMap(); -- for (Entry> partitionEntry : partitionToSources.entrySet()) { -- PartitionLoadInfo info = new PartitionLoadInfo(partitionEntry.getValue()); -- idToPartitionLoadInfo.put(partitionEntry.getKey(), info); -- } -- idToTableLoadInfo.put(tableId, new TableLoadInfo(idToPartitionLoadInfo)); -- } -- job.setIdToTableLoadInfo(idToTableLoadInfo); -- -- if (etlJobType == EtlJobType.BROKER) { -- 
PullLoadSourceInfo sourceInfo = new PullLoadSourceInfo(); -- for (DataDescription dataDescription : dataDescriptions) { -- BrokerFileGroup fileGroup = new BrokerFileGroup(dataDescription, stmt.getBrokerDesc()); -- fileGroup.parse(db); -- sourceInfo.addFileGroup(fileGroup); -- } -- job.setPullLoadSourceInfo(sourceInfo); -- LOG.info("Source info is {}", sourceInfo); -- } -- -- if (etlJobType == EtlJobType.MINI) { -- // mini etl tasks -- Map idToEtlTask = Maps.newHashMap(); -- long etlTaskId = 0; -- db.readLock(); -- try { -- for (DataDescription dataDescription : dataDescriptions) { -- String tableName = dataDescription.getTableName(); -- OlapTable table = (OlapTable) db.getTable(tableName); -- if (table == null) { -- throw new DdlException("Table[" + tableName + "] does not exist"); -- } -- -- TNetworkAddress beAddress = dataDescription.getBeAddr(); -- Backend backend = Catalog.getCurrentSystemInfo().getBackendWithBePort(beAddress.getHostname(), -- beAddress.getPort()); -- if (!Catalog.getCurrentSystemInfo().checkBackendAvailable(backend.getId())) { -- throw new DdlException("Etl backend is null or not available"); -- } -- -- MiniEtlTaskInfo taskInfo = new MiniEtlTaskInfo(etlTaskId++, backend.getId(), table.getId()); -- idToEtlTask.put(taskInfo.getId(), taskInfo); -- } -- } finally { -- db.readUnlock(); -- } -- job.setMiniEtlTasks(idToEtlTask); -- job.setPrority(TPriority.HIGH); -- -- if (job.getTimeoutSecond() == 0) { -- // set default timeout -- job.setTimeoutSecond(Config.mini_load_default_timeout_second); -- } -- -- } else if (etlJobType == EtlJobType.HADOOP) { -- // hadoop dpp cluster config -- // default dpp config -- DppConfig dppConfig = dppDefaultConfig.getCopiedDppConfig(); -- -- // get dpp config by cluster -- // 1. from user -- String cluster = stmt.getCluster(); -- if (cluster == null && properties != null) { -- cluster = properties.get(LoadStmt.CLUSTER_PROPERTY); -- } -- -- Pair clusterInfo = Catalog.getInstance().getAuth().getLoadClusterInfo( -- stmt.getUser(), cluster); -- cluster = clusterInfo.first; -- DppConfig clusterConfig = clusterInfo.second; -- -- // 2. 
from system -- if (cluster == null || clusterConfig == null) { -- if (cluster == null) { -- cluster = Config.dpp_default_cluster; -- } -- -- clusterConfig = clusterToDppConfig.get(cluster); -- if (clusterConfig == null) { -- throw new DdlException("Load cluster[" + cluster + "] does not exist"); -- } -- } -- -- dppConfig.update(clusterConfig); -- -- try { -- // parse user define hadoop and bos configs -- dppConfig.updateHadoopConfigs(properties); -- -- // check and set cluster info -- dppConfig.check(); -- job.setClusterInfo(cluster, dppConfig); -- job.setPrority(dppConfig.getPriority()); -- } catch (LoadException e) { -- throw new DdlException(e.getMessage()); -- } -- -- if (job.getTimeoutSecond() == 0) { -- // set default timeout -- job.setTimeoutSecond(Config.hadoop_load_default_timeout_second); -- } -- } else if (etlJobType == EtlJobType.BROKER) { -- if (job.getTimeoutSecond() == 0) { -- // set default timeout -- job.setTimeoutSecond(Config.pull_load_task_default_timeout_second); -- } -- } else if (etlJobType == EtlJobType.INSERT) { -- job.setPrority(TPriority.HIGH); -- } -- -- // job id -- job.setId(Catalog.getInstance().getNextId()); -- -- return job; -- } -- -- private void createSource(Database db, DataDescription dataDescription, -- Map>> tableToPartitionSources, boolean deleteFlag) throws DdlException { -- Source source = new Source(dataDescription.getFilePathes()); -- long tableId = -1; -- Set sourcePartitionIds = Sets.newHashSet(); -- -- // source column names and partitions -- String tableName = dataDescription.getTableName(); -- Map>> columnToFunction = null; -- db.readLock(); -- try { -- Table table = db.getTable(tableName); -- if (table == null) { -- throw new DdlException("Table [" + tableName + "] does not exist"); -- } -- tableId = table.getId(); -- if (table.getType() != TableType.OLAP) { -- throw new DdlException("Table [" + tableName + "] is not olap table"); -- } -- -- if (((OlapTable) table).getState() == OlapTableState.RESTORE) { -- throw new DdlException("Table [" + tableName + "] is under restore"); -- } -- -- if (((OlapTable) table).getKeysType() != KeysType.AGG_KEYS && dataDescription.isNegative()) { -- throw new DdlException("Load for AGG_KEYS table should not specify NEGATIVE"); -- } -- -- if (((OlapTable) table).getKeysType() != KeysType.UNIQUE_KEYS && deleteFlag) { -- throw new DdlException("Delete flag can only be used for UNIQUE_KEYS table"); -- } -- -- // get table schema -- List tableSchema = table.getBaseSchema(); -- Map nameToTableColumn = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); -- for (Column column : tableSchema) { -- nameToTableColumn.put(column.getName(), column); -- } -- -- // source columns -- List columnNames = Lists.newArrayList(); -- List assignColumnNames = dataDescription.getColumnNames(); -- if (assignColumnNames == null) { -- // use table columns -- for (Column column : tableSchema) { -- columnNames.add(column.getName()); -- } -- } else { -- // convert column to schema format -- for (String assignCol : assignColumnNames) { -- if (nameToTableColumn.containsKey(assignCol)) { -- columnNames.add(nameToTableColumn.get(assignCol).getName()); -- } else { -- columnNames.add(assignCol); -- } -- } -- } -- source.setColumnNames(columnNames); -- -- // check default value -- Map>> assignColumnToFunction = dataDescription.getColumnMapping(); -- for (Column column : tableSchema) { -- String columnName = column.getName(); -- if (columnNames.contains(columnName)) { -- continue; -- } -- -- if (assignColumnToFunction != null && 
assignColumnToFunction.containsKey(columnName)) { -- continue; -- } -- -- if (column.getDefaultValue() != null || column.isAllowNull()) { -- continue; -- } -- -- if (deleteFlag && !column.isKey()) { -- List args = Lists.newArrayList(); -- args.add("0"); -- Pair> functionPair = new Pair>("default_value", args); -- assignColumnToFunction.put(columnName, functionPair); -- continue; -- } -- -- throw new DdlException("Column has no default value. column: " + columnName); -- } -- -- // check hll -- for (Column column : tableSchema) { -- if (column.getColumnType().getType() == PrimitiveType.HLL) { -- if (assignColumnToFunction != null && !assignColumnToFunction.containsKey(column.getName())) { -- throw new DdlException("Hll column is not assigned. column:" + column.getName()); -- } -- } -- } -- // check mapping column exist in table -- // check function -- // convert mapping column and func arg columns to schema format -- Map columnNameMap = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); -- for (String columnName : columnNames) { -- columnNameMap.put(columnName, columnName); -- } -- if (assignColumnToFunction != null) { -- columnToFunction = Maps.newHashMap(); -- for (Entry>> entry : assignColumnToFunction.entrySet()) { -- String mappingColumnName = entry.getKey(); -- if (!nameToTableColumn.containsKey(mappingColumnName)) { -- throw new DdlException("Mapping column is not in table. column: " + mappingColumnName); -- } -- -- Column mappingColumn = nameToTableColumn.get(mappingColumnName); -- Pair> function = entry.getValue(); -- try { -- DataDescription.validateMappingFunction(function.first, function.second, columnNameMap, -- mappingColumn, dataDescription.isPullLoad()); -- } catch (AnalysisException e) { -- throw new DdlException(e.getMessage()); -- } -- -- columnToFunction.put(mappingColumn.getName(), function); -- } -- } -- -- // partitions of this source -- OlapTable olapTable = (OlapTable) table; -- List partitionNames = dataDescription.getPartitionNames(); -- if (partitionNames == null || partitionNames.isEmpty()) { -- partitionNames = new ArrayList(); -- for (Partition partition : olapTable.getPartitions()) { -- partitionNames.add(partition.getName()); -- } -- } -- for (String partitionName : partitionNames) { -- Partition partition = olapTable.getPartition(partitionName); -- if (partition == null) { -- throw new DdlException("Partition [" + partitionName + "] does not exist"); -- } -- sourcePartitionIds.add(partition.getId()); -- } -- } finally { -- db.readUnlock(); -- } -- -- // column separator -- String columnSeparator = dataDescription.getColumnSeparator(); -- if (!Strings.isNullOrEmpty(columnSeparator)) { -- source.setColumnSeparator(columnSeparator); -- } -- -- // line delimiter -- String lineDelimiter = dataDescription.getLineDelimiter(); -- if (!Strings.isNullOrEmpty(lineDelimiter)) { -- source.setLineDelimiter(lineDelimiter); -- } -- -- // source negative -- source.setNegative(dataDescription.isNegative()); -- -- // column mapping functions -- if (columnToFunction != null) { -- source.setColumnToFunction(columnToFunction); -- } -- -- // add source to table partition map -- Map> partitionToSources = null; -- if (tableToPartitionSources.containsKey(tableId)) { -- partitionToSources = tableToPartitionSources.get(tableId); -- } else { -- partitionToSources = Maps.newHashMap(); -- tableToPartitionSources.put(tableId, partitionToSources); -- } -- for (long partitionId : sourcePartitionIds) { -- List sources = null; -- if (partitionToSources.containsKey(partitionId)) { -- sources = 
partitionToSources.get(partitionId); -- } else { -- sources = new ArrayList(); -- partitionToSources.put(partitionId, sources); -- } -- sources.add(source); -- } -- } -- -- public void unprotectAddLoadJob(LoadJob job, boolean isReplay) throws DdlException { -- long jobId = job.getId(); -- long dbId = job.getDbId(); -- String label = job.getLabel(); -- -- long timestamp = job.getTimestamp(); -- if (!isReplay && getAllUnfinishedLoadJob() > Config.max_unfinished_load_job) { -- throw new DdlException( -- "Number of unfinished load jobs exceed the max number: " + Config.max_unfinished_load_job); -- } -- -- // check label exist -- boolean checkMini = true; -- if (job.getEtlJobType() == EtlJobType.MINI) { -- // already registered, do not need check -- checkMini = false; -- } -- checkLabelUsed(dbId, label, timestamp, checkMini); -- -- // add job -- Map> labelToLoadJobs = null; -- if (dbLabelToLoadJobs.containsKey(dbId)) { -- labelToLoadJobs = dbLabelToLoadJobs.get(dbId); -- } else { -- labelToLoadJobs = Maps.newHashMap(); -- dbLabelToLoadJobs.put(dbId, labelToLoadJobs); -- } -- List labelLoadJobs = null; -- if (labelToLoadJobs.containsKey(label)) { -- labelLoadJobs = labelToLoadJobs.get(label); -- } else { -- labelLoadJobs = Lists.newArrayList(); -- labelToLoadJobs.put(label, labelLoadJobs); -- } -- -- List dbLoadJobs = null; -- if (dbToLoadJobs.containsKey(dbId)) { -- dbLoadJobs = dbToLoadJobs.get(dbId); -- } else { -- dbLoadJobs = Lists.newArrayList(); -- dbToLoadJobs.put(dbId, dbLoadJobs); -- } -- idToLoadJob.put(jobId, job); -- dbLoadJobs.add(job); -- labelLoadJobs.add(job); -- -- switch (job.getState()) { -- case PENDING: -- idToPendingLoadJob.put(jobId, job); -- break; -- case ETL: -- idToEtlLoadJob.put(jobId, job); -- break; -- case LOADING: -- idToLoadingLoadJob.put(jobId, job); -- // recover loadingPartitionIds -- recoverLoadingPartitions(job); -- break; -- case QUORUM_FINISHED: -- // The state QUORUM_FINISHED could only occur when loading image file -- idToQuorumFinishedLoadJob.put(jobId, job); -- break; -- case FINISHED: -- break; -- case CANCELLED: -- break; -- default: -- // Impossible to be other state -- Preconditions.checkNotNull(null, "Should not be here"); -- } -- } -- -- private long getAllUnfinishedLoadJob() { -- return idToPendingLoadJob.size() + idToEtlLoadJob.size() + idToLoadingLoadJob.size() -- + idToQuorumFinishedLoadJob.size(); -- } -- -- public void replayAddLoadJob(LoadJob job) throws DdlException { -- writeLock(); -- try { -- unprotectAddLoadJob(job, true /* replay */); -- } finally { -- writeUnlock(); -- } -- } -- -- public void unprotectEtlLoadJob(LoadJob job) { -- long jobId = job.getId(); -- idToPendingLoadJob.remove(jobId); -- idToEtlLoadJob.put(jobId, job); -- -- replaceLoadJob(job); -- } -- -- public void replayEtlLoadJob(LoadJob job) throws DdlException { -- writeLock(); -- try { -- unprotectEtlLoadJob(job); -- } finally { -- writeUnlock(); -- } -- } -- -- public void unprotectLoadingLoadJob(LoadJob job) { -- long jobId = job.getId(); -- idToEtlLoadJob.remove(jobId); -- idToLoadingLoadJob.put(jobId, job); -- -- // recover loadingPartitionIds -- recoverLoadingPartitions(job); -- -- replaceLoadJob(job); -- } -- -- public void replayLoadingLoadJob(LoadJob job) throws DdlException { -- writeLock(); -- try { -- unprotectLoadingLoadJob(job); -- } finally { -- writeUnlock(); -- } -- } -- -- -- public boolean registerMiniLabel( -- String fullDbName, String label, long timestamp) throws DdlException { -- Database db = Catalog.getInstance().getDb(fullDbName); -- if (db 
== null) { -- throw new DdlException("Db does not exist. name: " + fullDbName); -- } -- -- long dbId = db.getId(); -- writeLock(); -- try { -- if (!checkLabelUsed(dbId, label, timestamp, true)) { -- return false; -- } -- -- Set miniLabels = null; -- if (dbToMiniLabels.containsKey(dbId)) { -- miniLabels = dbToMiniLabels.get(dbId); -- } else { -- miniLabels = Sets.newHashSet(); -- dbToMiniLabels.put(dbId, miniLabels); -- } -- miniLabels.add(label); -- } finally { -- writeUnlock(); -- } -- -- return true; -- } -- -- public void deregisterMiniLabel(String fullDbName, String label) throws DdlException { -- Database db = Catalog.getInstance().getDb(fullDbName); -- if (db == null) { -- throw new DdlException("Db does not exist. name: " + fullDbName); -- } -- -- long dbId = db.getId(); -- writeLock(); -- try { -- if (!dbToMiniLabels.containsKey(dbId)) { -- return; -- } -- -- Set miniLabels = dbToMiniLabels.get(dbId); -- miniLabels.remove(label); -- if (miniLabels.isEmpty()) { -- dbToMiniLabels.remove(dbId); -- } -- } finally { -- writeUnlock(); -- } -- } -- -- public void checkLabelUsed(String fullDbName, String label, long timestamp) throws DdlException { -- Database db = Catalog.getInstance().getDb(fullDbName); -- if (db == null) { -- throw new DdlException("Db does not exist. name: " + fullDbName); -- } -- -- readLock(); -- try { -- checkLabelUsed(db.getId(), label, timestamp, true); -- } finally { -- readUnlock(); -- } -- } -- -- private boolean checkLabelUsed(long dbId, String label, -- long timestamp, boolean checkMini) throws DdlException { -- final String labelUsedMsg = "Same data label[" + label + "] already used"; -- -- // check dbLabelToLoadJobs -- if (dbLabelToLoadJobs.containsKey(dbId)) { -- Map> labelToLoadJobs = dbLabelToLoadJobs.get(dbId); -- if (labelToLoadJobs.containsKey(label)) { -- List labelLoadJobs = labelToLoadJobs.get(label); -- for (LoadJob oldJob : labelLoadJobs) { -- JobState oldJobState = oldJob.getState(); -- if (oldJobState != JobState.CANCELLED) { -- if (timestamp == oldJob.getTimestamp()) { -- return false; -- } else { -- throw new DdlException(labelUsedMsg); -- } -- } -- } -- } -- } -- -- // check dbToMiniLabel -- if (checkMini) { -- if (dbToMiniLabels.containsKey(dbId)) { -- Set uncommittedLabels = dbToMiniLabels.get(dbId); -- if (uncommittedLabels.contains(label)) { -- throw new DdlException(labelUsedMsg); -- } -- } -- } -- -- return true; -- } -- -- public boolean cancelLoadJob(CancelLoadStmt stmt) throws DdlException { -- // get params -- String dbName = stmt.getDbName(); -- String label = stmt.getLabel(); -- -- // get load job and check state -- Database db = Catalog.getInstance().getDb(dbName); -- if (db == null) { -- throw new DdlException("Db does not exist. 
name: " + dbName); -- } -- LoadJob job = null; -- readLock(); -- try { -- Map> labelToLoadJobs = dbLabelToLoadJobs.get(db.getId()); -- if (labelToLoadJobs == null) { -- throw new DdlException("Load job does not exist"); -- } -- -- List loadJobs = labelToLoadJobs.get(label); -- if (loadJobs == null) { -- throw new DdlException("Load job does not exist"); -- } -- // only the last one should be running -- job = loadJobs.get(loadJobs.size() - 1); -- JobState state = job.getState(); -- if (state == JobState.CANCELLED) { -- throw new DdlException("Load job has been cancelled"); -- } else if (state == JobState.QUORUM_FINISHED || state == JobState.FINISHED) { -- throw new DdlException("Load job has been finished"); -- } -- } finally { -- readUnlock(); -- } -- -- // check auth here, cause we need table info -- Set tableNames = job.getTableNames(); -- if (tableNames.isEmpty()) { -- // forward compatibility -- if (!Catalog.getCurrentCatalog().getAuth().checkDbPriv(ConnectContext.get(), dbName, -- PrivPredicate.LOAD)) { -- ErrorReport.reportDdlException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "CANCEL LOAD"); -- } -- } else { -- for (String tblName : tableNames) { -- if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, tblName, -- PrivPredicate.LOAD)) { -- ErrorReport.reportDdlException(ErrorCode.ERR_TABLEACCESS_DENIED_ERROR, "CANCEL LOAD", -- ConnectContext.get().getQualifiedUser(), -- ConnectContext.get().getRemoteIP(), tblName); -- } -- } -- } -- -- // cancel job -- if (!cancelLoadJob(job, CancelType.USER_CANCEL, "user cancel")) { -- throw new DdlException("Cancel load job fail"); -- } -- -- return true; -- } -- -- public boolean cancelLoadJob(LoadJob job, CancelType cancelType, String msg) { -- // update job to cancelled -- JobState srcState = job.getState(); -- if (!updateLoadJobState(job, JobState.CANCELLED, cancelType, msg)) { -- LOG.warn("cancel load job failed. job: {}", job); -- return false; -- } -- -- // clear -- if (job.getHadoopDppConfig() != null) { -- clearJob(job, srcState); -- } -- -- if (job.getBrokerDesc() != null) { -- if (srcState == JobState.ETL) { -- // Cancel job id -- Catalog.getInstance().getPullLoadJobMgr().cancelJob(job.getId()); -- } -- } -- LOG.info("cancel load job success. job: {}", job); -- return true; -- } -- -- public void unprotectCancelLoadJob(LoadJob job) { -- long jobId = job.getId(); -- LoadJob oldJob = idToLoadJob.get(jobId); -- if (oldJob == null) { -- LOG.warn("cancel job does not exist. 
id: {}", jobId); -- return; -- } -- -- switch (oldJob.getState()) { -- case PENDING: -- idToPendingLoadJob.remove(jobId); -- break; -- case ETL: -- idToEtlLoadJob.remove(jobId); -- break; -- case LOADING: -- idToLoadingLoadJob.remove(jobId); -- // remove loading partitions -- removeLoadingPartitions(oldJob); -- break; -- default: -- LOG.warn("cancel job has wrong src state: {}", oldJob.getState().name()); -- return; -- } -- -- replaceLoadJob(job); -- } -- -- public void replayCancelLoadJob(LoadJob job) { -- writeLock(); -- try { -- unprotectCancelLoadJob(job); -- } finally { -- writeUnlock(); -- } -- } -- -- public void removeDeleteJobAndSetState(AsyncDeleteJob job) { -- job.clearTasks(); -- writeLock(); -- try { -- idToQuorumFinishedDeleteJob.remove(job.getJobId()); -- -- List deleteInfos = dbToDeleteInfos.get(job.getDbId()); -- Preconditions.checkNotNull(deleteInfos); -- -- for (DeleteInfo deleteInfo : deleteInfos) { -- if (deleteInfo.getJobId() == job.getJobId()) { -- deleteInfo.getAsyncDeleteJob().setState(DeleteState.FINISHED); -- LOG.info("replay set async delete job to finished: {}", job.getJobId()); -- } -- } -- -- } finally { -- writeUnlock(); -- } -- } -- -- public List getQuorumFinishedDeleteJobs() { -- List jobs = Lists.newArrayList(); -- Collection stateJobs = null; -- readLock(); -- try { -- stateJobs = idToQuorumFinishedDeleteJob.values(); -- if (stateJobs != null) { -- jobs.addAll(stateJobs); -- } -- } finally { -- readUnlock(); -- } -- return jobs; -- } -- -- public int getLoadJobNumber() { -- readLock(); -- try { -- return idToLoadJob.size(); -- } finally { -- readUnlock(); -- } -- } -- -- public Map getIdToLoadJob() { -- return idToLoadJob; -- } -- -- public Map> getDbToLoadJobs() { -- return dbToLoadJobs; -- } -- -- public Map> getDbToDeleteInfos() { -- return dbToDeleteInfos; -- } -- -- public List getDbLoadJobs(long dbId) { -- readLock(); -- try { -- return dbToLoadJobs.get(dbId); -- } finally { -- readUnlock(); -- } -- } -- -- public List getLoadJobs(JobState jobState) { -- List jobs = new ArrayList(); -- Collection stateJobs = null; -- readLock(); -- try { -- switch (jobState) { -- case PENDING: -- stateJobs = idToPendingLoadJob.values(); -- break; -- case ETL: -- stateJobs = idToEtlLoadJob.values(); -- break; -- case LOADING: -- stateJobs = idToLoadingLoadJob.values(); -- break; -- case QUORUM_FINISHED: -- stateJobs = idToQuorumFinishedLoadJob.values(); -- break; -- default: -- break; -- } -- if (stateJobs != null) { -- jobs.addAll(stateJobs); -- } -- } finally { -- readUnlock(); -- } -- return jobs; -- } -- -- public int getLoadJobNum(JobState jobState, long dbId) { -- readLock(); -- try { -- List loadJobs = this.dbToLoadJobs.get(dbId); -- if (loadJobs == null) { -- return 0; -- } -- -- int jobNum = 0; -- for (LoadJob job : loadJobs) { -- if (job.getState() == jobState) { -- ++jobNum; -- } -- } -- return jobNum; -- } finally { -- readUnlock(); -- } -- } -- -- public LoadJob getLoadJob(long jobId) { -- readLock(); -- try { -- return idToLoadJob.get(jobId); -- } finally { -- readUnlock(); -- } -- } -- -- public AsyncDeleteJob getAsyncDeleteJob(long jobId) { -- readLock(); -- try { -- return idToQuorumFinishedDeleteJob.get(jobId); -- } finally { -- readUnlock(); -- } -- } -- -- public List getCopiedAsyncDeleteJobs() { -- readLock(); -- try { -- return Lists.newArrayList(idToQuorumFinishedDeleteJob.values()); -- } finally { -- readUnlock(); -- } -- } -- -- public LinkedList> getLoadJobInfosByDb(long dbId, String dbName, String labelValue, -- boolean accurateMatch, Set 
states, ArrayList orderByPairs) { -- LinkedList> loadJobInfos = new LinkedList>(); -- readLock(); -- try { -- List loadJobs = this.dbToLoadJobs.get(dbId); -- if (loadJobs == null) { -- return loadJobInfos; -- } -- -- long start = System.currentTimeMillis(); -- LOG.debug("begin to get load job info, size: {}", loadJobs.size()); -- for (LoadJob loadJob : loadJobs) { -- // filter first -- String label = loadJob.getLabel(); -- JobState state = loadJob.getState(); -- -- if (labelValue != null) { -- if (accurateMatch) { -- if (!label.equals(labelValue)) { -- continue; -- } -- } else { -- if (!label.contains(labelValue)) { -- continue; -- } -- } -- } -- -- if (states != null) { -- if (!states.contains(state)) { -- continue; -- } -- } -- -- // check auth -- Set tableNames = loadJob.getTableNames(); -- if (tableNames.isEmpty()) { -- // forward compatibility -- if (!Catalog.getCurrentCatalog().getAuth().checkDbPriv(ConnectContext.get(), dbName, -- PrivPredicate.SHOW)) { -- continue; -- } -- } else { -- boolean auth = true; -- for (String tblName : tableNames) { -- if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, -- tblName, PrivPredicate.SHOW)) { -- auth = false; -- break; -- } -- } -- if (!auth) { -- continue; -- } -- } -- -- List jobInfo = new ArrayList(); -- -- // jobId -- jobInfo.add(loadJob.getId()); -- // label -- jobInfo.add(label); -- // state -- jobInfo.add(state.name()); -- -- // progress -- switch (loadJob.getState()) { -- case PENDING: -- jobInfo.add("ETL:0%; LOAD:0%"); -- break; -- case ETL: -- jobInfo.add("ETL:" + loadJob.getProgress() + "%; LOAD:0%"); -- break; -- case LOADING: -- jobInfo.add("ETL:100%; LOAD:" + loadJob.getProgress() + "%"); -- break; -- case QUORUM_FINISHED: -- jobInfo.add("ETL:100%; LOAD:100%"); -- break; -- case FINISHED: -- jobInfo.add("ETL:100%; LOAD:100%"); -- break; -- case CANCELLED: -- jobInfo.add("ETL:N/A; LOAD:N/A"); -- break; -- default: -- jobInfo.add("ETL:N/A; LOAD:N/A"); -- break; -- } -- -- // etl info -- EtlStatus status = loadJob.getEtlJobStatus(); -- if (status == null || status.getState() == TEtlState.CANCELLED) { -- jobInfo.add("N/A"); -- } else { -- Map counters = status.getCounters(); -- List info = Lists.newArrayList(); -- for (String key : counters.keySet()) { -- // XXX: internal etl job return all counters -- if (key.equalsIgnoreCase("HDFS bytes read") -- || key.equalsIgnoreCase("Map input records") -- || key.startsWith("dpp.") -- || loadJob.getEtlJobType() == EtlJobType.MINI) { -- info.add(key + "=" + counters.get(key)); -- } -- } // end for counters -- if (info.isEmpty()) { -- jobInfo.add("N/A"); -- } else { -- jobInfo.add(StringUtils.join(info, "; ")); -- } -- } -- -- // task info -- jobInfo.add("cluster:" + loadJob.getHadoopCluster() -- + "; timeout(s):" + loadJob.getTimeoutSecond() -- + "; max_filter_ratio:" + loadJob.getMaxFilterRatio()); -- -- // error msg -- if (loadJob.getState() == JobState.CANCELLED) { -- FailMsg failMsg = loadJob.getFailMsg(); -- jobInfo.add("type:" + failMsg.getCancelType() + "; msg:" + failMsg.getMsg()); -- } else { -- jobInfo.add("N/A"); -- } -- -- // create time -- jobInfo.add(TimeUtils.longToTimeString(loadJob.getCreateTimeMs())); -- // etl start time -- jobInfo.add(TimeUtils.longToTimeString(loadJob.getEtlStartTimeMs())); -- // etl end time -- jobInfo.add(TimeUtils.longToTimeString(loadJob.getEtlFinishTimeMs())); -- // load start time -- jobInfo.add(TimeUtils.longToTimeString(loadJob.getLoadStartTimeMs())); -- // load end time -- 
jobInfo.add(TimeUtils.longToTimeString(loadJob.getLoadFinishTimeMs())); -- // tracking url -- jobInfo.add(status.getTrackingUrl()); -- -- loadJobInfos.add(jobInfo); -- } // end for loadJobs -- -- LOG.debug("finished to get load job info, cost: {}", (System.currentTimeMillis() - start)); -- } finally { -- readUnlock(); -- } -- -- ListComparator> comparator = null; -- if (orderByPairs != null) { -- OrderByPair[] orderByPairArr = new OrderByPair[orderByPairs.size()]; -- comparator = new ListComparator>(orderByPairs.toArray(orderByPairArr)); -- } else { -- // sort by id asc -- comparator = new ListComparator>(0); -- } -- Collections.sort(loadJobInfos, comparator); -- return loadJobInfos; -- } -- -- public LoadJob getLatestJobIdByLabel(long dbId, String labelValue) { -- LoadJob job = null; -- long jobId = 0; -- try { -- readLock(); -- List loadJobs = this.dbToLoadJobs.get(dbId); -- if (loadJobs == null) { -- return null; -- } -- -- for (LoadJob loadJob : loadJobs) { -- String label = loadJob.getLabel(); -- -- if (labelValue != null) { -- if (!label.equals(labelValue)) { -- continue; -- } -- } -- -- long currJobId = loadJob.getId(); -- -- if (currJobId > jobId) { -- jobId = currJobId; -- job = loadJob; -- } -- } -- } finally { -- readUnlock(); -- } -- -- return job; -- } -- -- public List> getLoadJobUnfinishedInfo(long jobId) { -- LinkedList> infos = new LinkedList>(); -- TabletInvertedIndex invertedIndex = Catalog.getCurrentInvertedIndex(); -- readLock(); -- try { -- LoadJob loadJob = getLoadJob(jobId); -- if (loadJob == null -- || (loadJob.getState() != JobState.LOADING && loadJob.getState() != JobState.QUORUM_FINISHED)) { -- return infos; -- } -- -- long dbId = loadJob.getDbId(); -- Database db = Catalog.getInstance().getDb(dbId); -- if (db == null) { -- return infos; -- } -- -- db.readLock(); -- try { -- Map tabletMap = loadJob.getIdToTabletLoadInfo(); -- for (long tabletId : tabletMap.keySet()) { -- TabletMeta tabletMeta = invertedIndex.getTabletMeta(tabletId); -- if (tabletMeta == null) { -- // tablet may be dropped during loading -- continue; -- } -- -- long tableId = tabletMeta.getTableId(); -- -- OlapTable table = (OlapTable) db.getTable(tableId); -- if (table == null) { -- continue; -- } -- -- long partitionId = tabletMeta.getPartitionId(); -- Partition partition = table.getPartition(partitionId); -- if (partition == null) { -- continue; -- } -- -- long indexId = tabletMeta.getIndexId(); -- MaterializedIndex index = partition.getIndex(indexId); -- if (index == null) { -- continue; -- } -- -- Tablet tablet = index.getTablet(tabletId); -- if (tablet == null) { -- continue; -- } -- -- PartitionLoadInfo partitionLoadInfo = loadJob.getPartitionLoadInfo(tableId, partitionId); -- long version = partitionLoadInfo.getVersion(); -- long versionHash = partitionLoadInfo.getVersionHash(); -- -- for (Replica replica : tablet.getReplicas()) { -- if (replica.checkVersionCatchUp(version, versionHash)) { -- continue; -- } -- -- List info = Lists.newArrayList(); -- info.add(replica.getBackendId()); -- info.add(tabletId); -- info.add(replica.getId()); -- info.add(replica.getVersion()); -- info.add(replica.getVersionHash()); -- info.add(partitionId); -- info.add(version); -- info.add(versionHash); -- -- infos.add(info); -- } -- } // end for tablet -- -- } finally { -- db.readUnlock(); -- } -- -- } finally { -- readUnlock(); -- } -- -- // sort by version, backendId -- ListComparator> comparator = new ListComparator>(3, 0); -- Collections.sort(infos, comparator); -- -- return infos; -- } -- -- public 
LoadErrorHub.Param getLoadErrorHubInfo() { -- return loadErrorHubInfo; -- } -- -- public void setLoadErrorHubInfo(LoadErrorHub.Param info) { -- this.loadErrorHubInfo = info; -- } -- -- // Note: althrough this.loadErrorHubInfo is volatile, no need to lock. -- // but editlog need be locked -- public void changeLoadErrorHubInfo(LoadErrorHub.Param info) { -- writeLock(); -- try { -- this.loadErrorHubInfo = info; -- Catalog.getInstance().getEditLog().logSetLoadErrorHub(info); -- } finally { -- writeUnlock(); -- } -- } -- -- public static class JobInfo { -- public String dbName; -- public Set tblNames = Sets.newHashSet(); -- public String label; -- public String clusterName; -- public JobState state; -- public String failMsg; -- public String trackingUrl; -- -- public JobInfo(String dbName, String label, String clusterName) { -- this.dbName = dbName; -- this.label = label; -- this.clusterName = clusterName; -- } -- } -- -- // Get job state -- // result saved in info -- public void getJobInfo(JobInfo info) throws DdlException { -- String fullDbName = ClusterNamespace.getFullName(info.clusterName, info.dbName); -- info.dbName = fullDbName; -- Database db = Catalog.getInstance().getDb(fullDbName); -- if (db == null) { -- throw new DdlException("Unknown database(" + info.dbName + ")"); -- } -- readLock(); -- try { -- Map> labelToLoadJobs = dbLabelToLoadJobs.get(db.getId()); -- if (labelToLoadJobs == null) { -- throw new DdlException("No jobs belong to database(" + info.dbName + ")"); -- } -- List loadJobs = labelToLoadJobs.get(info.label); -- if (loadJobs == null) { -- throw new DdlException("Unknown job(" + info.label + ")"); -- } -- // only the last one should be running -- LoadJob job = loadJobs.get(loadJobs.size() - 1); -- -- if (!job.getTableNames().isEmpty()) { -- info.tblNames.addAll(job.getTableNames()); -- } -- -- info.state = job.getState(); -- if (info.state == JobState.QUORUM_FINISHED) { -- info.state = JobState.FINISHED; -- } -- -- info.failMsg = job.getFailMsg().getMsg(); -- info.trackingUrl = job.getEtlJobStatus().getTrackingUrl(); -- } finally { -- readUnlock(); -- } -- } -- -- public void unprotectQuorumLoadJob(LoadJob job, Database db) { -- // remove loading partitions -- removeLoadingPartitions(job); -- -- // Update database information first -- Map replicaInfos = job.getReplicaPersistInfos(); -- if (replicaInfos != null) { -- for (ReplicaPersistInfo info : replicaInfos.values()) { -- OlapTable table = (OlapTable) db.getTable(info.getTableId()); -- if (table == null) { -- LOG.warn("the table[{}] is missing", info.getIndexId()); -- continue; -- } -- Partition partition = table.getPartition(info.getPartitionId()); -- if (partition == null) { -- LOG.warn("the partition[{}] is missing", info.getIndexId()); -- continue; -- } -- MaterializedIndex index = partition.getIndex(info.getIndexId()); -- if (index == null) { -- LOG.warn("the index[{}] is missing", info.getIndexId()); -- continue; -- } -- Tablet tablet = index.getTablet(info.getTabletId()); -- if (tablet == null) { -- LOG.warn("the tablet[{}] is missing", info.getTabletId()); -- continue; -- } -- -- Replica replica = tablet.getReplicaById(info.getReplicaId()); -- if (replica == null) { -- LOG.warn("the replica[{}] is missing", info.getReplicaId()); -- continue; -- } -- replica.updateInfo(info.getVersion(), info.getVersionHash(), -- info.getDataSize(), info.getRowCount()); -- } -- } -- -- long jobId = job.getId(); -- Map idToTableLoadInfo = job.getIdToTableLoadInfo(); -- if (idToTableLoadInfo != null) { -- for (Entry tableEntry : 
idToTableLoadInfo.entrySet()) { -- long tableId = tableEntry.getKey(); -- OlapTable table = (OlapTable) db.getTable(tableId); -- TableLoadInfo tableLoadInfo = tableEntry.getValue(); -- for (Entry entry : tableLoadInfo.getIdToPartitionLoadInfo().entrySet()) { -- long partitionId = entry.getKey(); -- Partition partition = table.getPartition(partitionId); -- PartitionLoadInfo partitionLoadInfo = entry.getValue(); -- if (!partitionLoadInfo.isNeedLoad()) { -- continue; -- } -- updatePartitionVersion(partition, partitionLoadInfo.getVersion(), -- partitionLoadInfo.getVersionHash(), jobId); -- -- // update table row count -- for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { -- long tableRowCount = 0L; -- for (Tablet tablet : materializedIndex.getTablets()) { -- long tabletRowCount = 0L; -- for (Replica replica : tablet.getReplicas()) { -- long replicaRowCount = replica.getRowCount(); -- if (replicaRowCount > tabletRowCount) { -- tabletRowCount = replicaRowCount; -- } -- } -- tableRowCount += tabletRowCount; -- } -- materializedIndex.setRowCount(tableRowCount); -- } // end for indices -- } // end for partitions -- } // end for tables -- } -- -- idToLoadingLoadJob.remove(jobId); -- idToQuorumFinishedLoadJob.put(jobId, job); -- -- replaceLoadJob(job); -- } -- -- public void replayQuorumLoadJob(LoadJob job, Catalog catalog) throws DdlException { -- // TODO: need to call this.writeLock()? -- Database db = catalog.getDb(job.getDbId()); -- db.writeLock(); -- try { -- writeLock(); -- try { -- unprotectQuorumLoadJob(job, db); -- } finally { -- writeUnlock(); -- } -- } finally { -- db.writeUnlock(); -- } -- } -- -- public void unprotectFinishLoadJob(LoadJob job, Database db) { -- long jobId = job.getId(); -- idToQuorumFinishedLoadJob.remove(jobId); -- -- // Update database information -- Map replicaInfos = job.getReplicaPersistInfos(); -- if (replicaInfos != null) { -- for (ReplicaPersistInfo info : replicaInfos.values()) { -- OlapTable table = (OlapTable) db.getTable(info.getTableId()); -- if (table == null) { -- LOG.warn("the table[{}] is missing", info.getIndexId()); -- continue; -- } -- Partition partition = table.getPartition(info.getPartitionId()); -- if (partition == null) { -- LOG.warn("the partition[{}] is missing", info.getIndexId()); -- continue; -- } -- MaterializedIndex index = partition.getIndex(info.getIndexId()); -- if (index == null) { -- LOG.warn("the index[{}] is missing", info.getIndexId()); -- continue; -- } -- Tablet tablet = index.getTablet(info.getTabletId()); -- if (tablet == null) { -- LOG.warn("the tablet[{}] is missing", info.getTabletId()); -- continue; -- } -- -- Replica replica = tablet.getReplicaById(info.getReplicaId()); -- if (replica == null) { -- LOG.warn("the replica[{}] is missing", info.getReplicaId()); -- continue; -- } -- replica.updateInfo(info.getVersion(), info.getVersionHash(), -- info.getDataSize(), info.getRowCount()); -- } -- } -- -- replaceLoadJob(job); -- } -- -- public void replayFinishLoadJob(LoadJob job, Catalog catalog) { -- // TODO: need to call this.writeLock()? 
-- Database db = catalog.getDb(job.getDbId()); -- db.writeLock(); -- try { -- writeLock(); -- try { -- unprotectFinishLoadJob(job, db); -- } finally { -- writeUnlock(); -- } -- } finally { -- db.writeUnlock(); -- } -- } -- -- public void replayClearRollupInfo(ReplicaPersistInfo info, Catalog catalog) { -- Database db = catalog.getDb(info.getDbId()); -- db.writeLock(); -- try { -- OlapTable olapTable = (OlapTable) db.getTable(info.getTableId()); -- Partition partition = olapTable.getPartition(info.getPartitionId()); -- MaterializedIndex index = partition.getIndex(info.getIndexId()); -- index.clearRollupIndexInfo(); -- } finally { -- db.writeUnlock(); -- } -- } -- -- private void replaceLoadJob(LoadJob job) { -- long jobId = job.getId(); -- -- // Replace LoadJob in idToLoadJob -- if (!idToLoadJob.containsKey(jobId)) { -- // This may happen when we drop db while there are still load jobs running -- LOG.warn("Does not find load job in idToLoadJob. JobId : {}", jobId); -- return; -- } -- idToLoadJob.put(jobId, job); -- -- // Replace LoadJob in dbToLoadJobs -- List jobs = dbToLoadJobs.get(job.getDbId()); -- if (jobs == null) { -- LOG.warn("Does not find db in dbToLoadJobs. DbId : {}", job.getDbId()); -- return; -- } -- int pos = 0; -- for (LoadJob oneJob : jobs) { -- if (oneJob.getId() == jobId) { -- break; -- } -- pos++; -- } -- if (pos == jobs.size()) { -- LOG.warn("Does not find load job for db. DbId : {}, jobId : {}", job.getDbId(), jobId); -- return; -- } -- jobs.remove(pos); -- jobs.add(pos, job); -- -- // Replace LoadJob in dbLabelToLoadJobs -- if (dbLabelToLoadJobs.get(job.getDbId()) == null) { -- LOG.warn("Does not find db in dbLabelToLoadJobs. DbId : {}", job.getDbId()); -- return; -- } -- jobs = dbLabelToLoadJobs.get(job.getDbId()).get(job.getLabel()); -- if (jobs == null) { -- LOG.warn("Does not find label for db. label : {}, DbId : {}", job.getLabel(), job.getDbId()); -- return; -- } -- pos = 0; -- for (LoadJob oneJob : jobs) { -- if (oneJob.getId() == jobId) { -- break; -- } -- pos++; -- } -- if (pos == jobs.size()) { -- LOG.warn("Does not find load job for label. label : {}, DbId : {}", job.getLabel(), job.getDbId()); -- return; -- } -- jobs.remove(pos); -- jobs.add(pos, job); -- } -- -- // remove all db jobs from dbToLoadJobs and dbLabelToLoadJobs -- // only remove finished or cancelled job from idToLoadJob -- // LoadChecker will update other state jobs to cancelled or finished, -- // and they will be removed by removeOldLoadJobs periodically -- public void removeDbLoadJob(long dbId) { -- writeLock(); -- try { -- if (dbToLoadJobs.containsKey(dbId)) { -- List dbLoadJobs = dbToLoadJobs.remove(dbId); -- for (LoadJob job : dbLoadJobs) { -- JobState state = job.getState(); -- if (state == JobState.CANCELLED || state == JobState.FINISHED) { -- idToLoadJob.remove(job.getId()); -- } -- } -- } -- if (dbLabelToLoadJobs.containsKey(dbId)) { -- dbLabelToLoadJobs.remove(dbId); -- } -- } finally { -- writeUnlock(); -- } -- } -- -- // Added by ljb. Remove old load jobs from idToLoadJob, dbToLoadJobs and dbLabelToLoadJobs -- // This function is called periodically. 
every Configure.label_keep_max_second seconds -- public void removeOldLoadJobs() { -- long currentTimeMs = System.currentTimeMillis(); -- -- writeLock(); -- try { -- Iterator> iter = idToLoadJob.entrySet().iterator(); -- while (iter.hasNext()) { -- Map.Entry entry = iter.next(); -- LoadJob job = entry.getValue(); -- if ((currentTimeMs - job.getCreateTimeMs()) / 1000 > Config.label_keep_max_second -- && (job.getState() == JobState.FINISHED || job.getState() == JobState.CANCELLED)) { -- long dbId = job.getDbId(); -- String label = job.getLabel(); -- -- // Remove job from idToLoadJob -- iter.remove(); -- -- // Remove job from dbToLoadJobs -- List loadJobs = dbToLoadJobs.get(dbId); -- if (loadJobs != null) { -- loadJobs.remove(job); -- if (loadJobs.size() == 0) { -- dbToLoadJobs.remove(dbId); -- } -- } -- -- // Remove job from dbLabelToLoadJobs -- Map> mapLabelToJobs = dbLabelToLoadJobs.get(dbId); -- if (mapLabelToJobs != null) { -- loadJobs = mapLabelToJobs.get(label); -- if (loadJobs != null) { -- loadJobs.remove(job); -- if (loadJobs.size() == 0) { -- mapLabelToJobs.remove(label); -- if (mapLabelToJobs.size() == 0) { -- dbLabelToLoadJobs.remove(dbId); -- } -- } -- } -- } -- } -- } -- } finally { -- writeUnlock(); -- } -- } -- -- // clear dpp output and kill etl job -- public void clearJob(LoadJob job, JobState srcState) { -- JobState state = job.getState(); -- if (state != JobState.CANCELLED && state != JobState.FINISHED) { -- LOG.warn("job state error. state: {}", state); -- return; -- } -- -- EtlJobType etlJobType = job.getEtlJobType(); -- switch (etlJobType) { -- case HADOOP: -- DppScheduler dppScheduler = new DppScheduler(job.getHadoopDppConfig()); -- // kill etl job -- if (state == JobState.CANCELLED && srcState == JobState.ETL) { -- try { -- dppScheduler.killEtlJob(job.getHadoopEtlJobId()); -- } catch (Exception e) { -- LOG.warn("kill etl job error", e); -- } -- } -- -- // delete all dirs releated to job label, use "" instead of job.getEtlOutputDir() -- // hdfs://host:port/outputPath/dbId/loadLabel/ -- DppConfig dppConfig = job.getHadoopDppConfig(); -- String outputPath = DppScheduler.getEtlOutputPath(dppConfig.getFsDefaultName(), -- dppConfig.getOutputPath(), job.getDbId(), job.getLabel(), ""); -- try { -- dppScheduler.deleteEtlOutputPath(outputPath); -- } catch (Exception e) { -- LOG.warn("delete etl output path error", e); -- } -- break; -- case MINI: -- for (MiniEtlTaskInfo taskInfo : job.getMiniEtlTasks().values()) { -- long backendId = taskInfo.getBackendId(); -- Backend backend = Catalog.getCurrentSystemInfo().getBackend(backendId); -- if (backend == null) { -- LOG.warn("backend does not exist. id: {}", backendId); -- break; -- } -- -- long dbId = job.getDbId(); -- Database db = Catalog.getInstance().getDb(dbId); -- if (db == null) { -- LOG.warn("db does not exist. id: {}", dbId); -- break; -- } -- -- AgentClient client = new AgentClient(backend.getHost(), backend.getBePort()); -- client.deleteEtlFiles(dbId, job.getId(), db.getFullName(), job.getLabel()); -- } -- break; -- case INSERT: -- break; -- case BROKER: -- break; -- default: -- LOG.warn("unknown etl job type. 
type: {}, job id: {}", etlJobType.name(), job.getId()); -- break; -- } -- } -- -- public boolean updateLoadJobState(LoadJob job, JobState destState) { -- return updateLoadJobState(job, destState, CancelType.UNKNOWN, null); -- } -- -- public boolean updateLoadJobState(LoadJob job, JobState destState, CancelType cancelType, String msg) { -- boolean result = true; -- JobState srcState = null; -- -- long jobId = job.getId(); -- long dbId = job.getDbId(); -- Database db = Catalog.getInstance().getDb(dbId); -- String errMsg = msg; -- if (db == null) { -- // if db is null, update job to cancelled -- errMsg = "db does not exist. id: " + dbId; -- LOG.warn(errMsg); -- writeLock(); -- try { -- processCancelled(job, cancelType, errMsg); -- } finally { -- writeUnlock(); -- } -- } else { -- db.writeLock(); -- try { -- writeLock(); -- try { -- // check state -- srcState = job.getState(); -- if (!STATE_CHANGE_MAP.containsKey(srcState)) { -- LOG.warn("src state error. src state: {}", srcState.name()); -- return false; -- } -- Set destStates = STATE_CHANGE_MAP.get(srcState); -- if (!destStates.contains(destState)) { -- LOG.warn("state change error. src state: {}, dest state: {}", -- srcState.name(), destState.name()); -- return false; -- } -- -- switch (destState) { -- case ETL: -- idToPendingLoadJob.remove(jobId); -- idToEtlLoadJob.put(jobId, job); -- job.setProgress(0); -- job.setEtlStartTimeMs(System.currentTimeMillis()); -- job.setState(destState); -- Catalog.getInstance().getEditLog().logLoadEtl(job); -- break; -- case LOADING: -- idToEtlLoadJob.remove(jobId); -- idToLoadingLoadJob.put(jobId, job); -- job.setProgress(0); -- job.setLoadStartTimeMs(System.currentTimeMillis()); -- job.setState(destState); -- Catalog.getInstance().getEditLog().logLoadLoading(job); -- break; -- case QUORUM_FINISHED: -- if (processQuorumFinished(job, db)) { -- // Write edit log -- Catalog.getInstance().getEditLog().logLoadQuorum(job); -- } else { -- errMsg = "process loading finished fail"; -- processCancelled(job, cancelType, errMsg); -- } -- break; -- case FINISHED: -- MetricRepo.COUNTER_LOAD_FINISHED.increase(1L); -- idToQuorumFinishedLoadJob.remove(jobId); -- job.setState(destState); -- -- // clear push tasks -- for (PushTask pushTask : job.getPushTasks()) { -- AgentTaskQueue.removePushTask(pushTask.getBackendId(), pushTask.getSignature(), -- pushTask.getVersion(), pushTask.getVersionHash(), -- pushTask.getPushType()); -- } -- // Clear the Map and Set in this job, reduce the memory cost for finished load job. 
-- job.clearRedundantInfoForHistoryJob(); -- // Write edit log -- Catalog.getInstance().getEditLog().logLoadDone(job); -- break; -- case CANCELLED: -- processCancelled(job, cancelType, errMsg); -- break; -- default: -- Preconditions.checkState(false, "wrong job state: " + destState.name()); -- break; -- } -- } finally { -- writeUnlock(); -- } -- } finally { -- db.writeUnlock(); -- } -- } -- -- // check current job state -- if (destState != job.getState()) { -- result = false; -- } -- return result; -- } -- -- private boolean processQuorumFinished(LoadJob job, Database db) { -- long jobId = job.getId(); -- // remove partition from loading set -- removeLoadingPartitions(job); -- -- // check partition exist -- Map idToTableLoadInfo = job.getIdToTableLoadInfo(); -- for (Entry tableEntry : idToTableLoadInfo.entrySet()) { -- long tableId = tableEntry.getKey(); -- OlapTable table = (OlapTable) db.getTable(tableId); -- if (table == null) { -- LOG.warn("table does not exist, id: {}", tableId); -- return false; -- } -- -- TableLoadInfo tableLoadInfo = tableEntry.getValue(); -- for (Entry partitionEntry : tableLoadInfo.getIdToPartitionLoadInfo().entrySet()) { -- long partitionId = partitionEntry.getKey(); -- PartitionLoadInfo partitionLoadInfo = partitionEntry.getValue(); -- if (!partitionLoadInfo.isNeedLoad()) { -- continue; -- } -- -- Partition partition = table.getPartition(partitionId); -- if (partition == null) { -- LOG.warn("partition does not exist, id: {}", partitionId); -- return false; -- } -- } -- } -- -- // update partition version and index row count -- for (Entry tableEntry : idToTableLoadInfo.entrySet()) { -- long tableId = tableEntry.getKey(); -- OlapTable table = (OlapTable) db.getTable(tableId); -- -- TableLoadInfo tableLoadInfo = tableEntry.getValue(); -- for (Entry entry : tableLoadInfo.getIdToPartitionLoadInfo().entrySet()) { -- long partitionId = entry.getKey(); -- Partition partition = table.getPartition(partitionId); -- PartitionLoadInfo partitionLoadInfo = entry.getValue(); -- if (!partitionLoadInfo.isNeedLoad()) { -- continue; -- } -- -- updatePartitionVersion(partition, partitionLoadInfo.getVersion(), -- partitionLoadInfo.getVersionHash(), jobId); -- -- for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { -- long tableRowCount = 0L; -- for (Tablet tablet : materializedIndex.getTablets()) { -- long tabletRowCount = 0L; -- for (Replica replica : tablet.getReplicas()) { -- long replicaRowCount = replica.getRowCount(); -- if (replicaRowCount > tabletRowCount) { -- tabletRowCount = replicaRowCount; -- } -- } -- tableRowCount += tabletRowCount; -- } -- materializedIndex.setRowCount(tableRowCount); -- } -- } -- } -- -- // When start up or checkpoint, Job may stay in pending queue. So remove it. -- idToPendingLoadJob.remove(jobId); -- -- idToLoadingLoadJob.remove(jobId); -- idToQuorumFinishedLoadJob.put(jobId, job); -- job.setProgress(100); -- job.setLoadFinishTimeMs(System.currentTimeMillis()); -- job.setState(JobState.QUORUM_FINISHED); -- return true; -- } -- -- private void updatePartitionVersion(Partition partition, long version, long versionHash, long jobId) { -- long partitionId = partition.getId(); -- partition.setCommittedVersion(version); -- partition.setCommittedVersionHash(versionHash); -- LOG.info("update partition version success. 
version: {}, version hash: {}, job id: {}, partition id: {}", -- version, versionHash, jobId, partitionId); -- } -- -- private boolean processCancelled(LoadJob job, CancelType cancelType, String msg) { -- long jobId = job.getId(); -- JobState srcState = job.getState(); -- CancelType tmpCancelType = CancelType.UNKNOWN; -- switch (srcState) { -- case PENDING: -- idToPendingLoadJob.remove(jobId); -- tmpCancelType = CancelType.ETL_SUBMIT_FAIL; -- break; -- case ETL: -- idToEtlLoadJob.remove(jobId); -- tmpCancelType = CancelType.ETL_RUN_FAIL; -- break; -- case LOADING: -- // remove partition from loading set -- removeLoadingPartitions(job); -- -- idToLoadingLoadJob.remove(jobId); -- tmpCancelType = CancelType.LOAD_RUN_FAIL; -- break; -- case QUORUM_FINISHED: -- idToQuorumFinishedLoadJob.remove(jobId); -- tmpCancelType = CancelType.LOAD_RUN_FAIL; -- break; -- default: -- Preconditions.checkState(false, "wrong job state: " + srcState.name()); -- break; -- } -- -- // set failMsg and state -- CancelType newCancelType = cancelType; -- if (newCancelType == CancelType.UNKNOWN) { -- newCancelType = tmpCancelType; -- } -- FailMsg failMsg = new FailMsg(newCancelType, msg); -- job.setFailMsg(failMsg); -- job.setLoadFinishTimeMs(System.currentTimeMillis()); -- job.setState(JobState.CANCELLED); -- -- // clear push tasks -- if (srcState == JobState.LOADING || srcState == JobState.QUORUM_FINISHED) { -- for (PushTask pushTask : job.getPushTasks()) { -- AgentTaskQueue.removePushTask(pushTask.getBackendId(), pushTask.getSignature(), -- pushTask.getVersion(), pushTask.getVersionHash(), -- pushTask.getPushType()); -- } -- } -- -- // Clear the Map and Set in this job, reduce the memory cost of canceled load job. -- job.clearRedundantInfoForHistoryJob(); -- // Write edit log -- Catalog.getInstance().getEditLog().logLoadCancel(job); -- -- return true; -- } -- -- public boolean addLoadingPartitions(Set partitionIds) { -- writeLock(); -- try { -- for (long partitionId : partitionIds) { -- if (loadingPartitionIds.contains(partitionId)) { -- LOG.info("partition {} is loading", partitionId); -- return false; -- } -- } -- loadingPartitionIds.addAll(partitionIds); -- return true; -- } finally { -- writeUnlock(); -- } -- } -- -- private void recoverLoadingPartitions(LoadJob job) { -- for (TableLoadInfo tableLoadInfo : job.getIdToTableLoadInfo().values()) { -- Map idToPartitionLoadInfo = tableLoadInfo.getIdToPartitionLoadInfo(); -- for (Entry entry : idToPartitionLoadInfo.entrySet()) { -- PartitionLoadInfo partitionLoadInfo = entry.getValue(); -- if (partitionLoadInfo.isNeedLoad()) { -- loadingPartitionIds.add(entry.getKey()); -- } -- } -- } -- } -- -- public void removeLoadingPartitions(Set partitionIds) { -- writeLock(); -- try { -- loadingPartitionIds.removeAll(partitionIds); -- } finally { -- writeUnlock(); -- } -- } -- -- private void removeLoadingPartitions(LoadJob job) { -- for (TableLoadInfo tableLoadInfo : job.getIdToTableLoadInfo().values()) { -- Map idToPartitionLoadInfo = tableLoadInfo.getIdToPartitionLoadInfo(); -- for (Entry entry : idToPartitionLoadInfo.entrySet()) { -- PartitionLoadInfo partitionLoadInfo = entry.getValue(); -- if (partitionLoadInfo.isNeedLoad()) { -- loadingPartitionIds.remove(entry.getKey()); -- } -- } -- } -- } -- -- public boolean checkPartitionLoadFinished(long partitionId, List quorumFinishedLoadJobs) { -- readLock(); -- try { -- for (JobState state : JobState.values()) { -- if (state == JobState.FINISHED || state == JobState.CANCELLED) { -- continue; -- } -- -- // we check PENDING / ETL 
/ LOADING -- List loadJobs = this.getLoadJobs(state); -- for (LoadJob loadJob : loadJobs) { -- Preconditions.checkNotNull(loadJob.getIdToTableLoadInfo()); -- for (TableLoadInfo tableLoadInfo : loadJob.getIdToTableLoadInfo().values()) { -- if (tableLoadInfo.getIdToPartitionLoadInfo().containsKey(partitionId)) { -- if (state == JobState.QUORUM_FINISHED) { -- if (quorumFinishedLoadJobs != null) { -- quorumFinishedLoadJobs.add(loadJob); -- } else { -- return false; -- } -- } else { -- return false; -- } -- } -- } -- } -- } -- return true; -- } finally { -- readUnlock(); -- } -- } -- -- public void unprotectAddDeleteInfo(DeleteInfo deleteInfo) { -- long dbId = deleteInfo.getDbId(); -- List deleteInfos = dbToDeleteInfos.get(dbId); -- if (deleteInfos == null) { -- deleteInfos = Lists.newArrayList(); -- dbToDeleteInfos.put(dbId, deleteInfos); -- } -- deleteInfos.add(deleteInfo); -- -- if (deleteInfo.getAsyncDeleteJob() != null && deleteInfo.getState() == DeleteState.QUORUM_FINISHED) { -- AsyncDeleteJob asyncDeleteJob = deleteInfo.getAsyncDeleteJob(); -- idToQuorumFinishedDeleteJob.put(asyncDeleteJob.getJobId(), asyncDeleteJob); -- LOG.info("unprotected add asyncDeleteJob when load image: {}", asyncDeleteJob.getJobId()); -- } -- } -- -- public void unprotectDelete(DeleteInfo deleteInfo, Database db) { -- OlapTable table = (OlapTable) db.getTable(deleteInfo.getTableId()); -- Partition partition = table.getPartition(deleteInfo.getPartitionId()); -- updatePartitionVersion(partition, deleteInfo.getPartitionVersion(), deleteInfo.getPartitionVersionHash(), -1); -- -- List replicaInfos = deleteInfo.getReplicaPersistInfos(); -- if (replicaInfos != null) { -- for (ReplicaPersistInfo info : replicaInfos) { -- MaterializedIndex index = partition.getIndex(info.getIndexId()); -- Tablet tablet = index.getTablet(info.getTabletId()); -- Replica replica = tablet.getReplicaById(info.getReplicaId()); -- replica.updateInfo(info.getVersion(), info.getVersionHash(), -- info.getDataSize(), info.getRowCount()); -- } -- } -- -- // add to deleteInfos -- if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_11) { -- unprotectAddDeleteInfo(deleteInfo); -- } -- -- if (deleteInfo.getAsyncDeleteJob() != null) { -- AsyncDeleteJob asyncDeleteJob = deleteInfo.getAsyncDeleteJob(); -- idToQuorumFinishedDeleteJob.put(asyncDeleteJob.getJobId(), asyncDeleteJob); -- LOG.info("unprotected add asyncDeleteJob: {}", asyncDeleteJob.getJobId()); -- } -- } -- -- public void replayFinishAsyncDeleteJob(AsyncDeleteJob deleteJob, Catalog catalog) { -- Database db = catalog.getDb(deleteJob.getDbId()); -- db.writeLock(); -- try { -- writeLock(); -- try { -- // Update database information -- Map replicaInfos = deleteJob.getReplicaPersistInfos(); -- if (replicaInfos != null) { -- for (ReplicaPersistInfo info : replicaInfos.values()) { -- OlapTable table = (OlapTable) db.getTable(info.getTableId()); -- if (table == null) { -- LOG.warn("the table[{}] is missing", info.getIndexId()); -- continue; -- } -- Partition partition = table.getPartition(info.getPartitionId()); -- if (partition == null) { -- LOG.warn("the partition[{}] is missing", info.getIndexId()); -- continue; -- } -- MaterializedIndex index = partition.getIndex(info.getIndexId()); -- if (index == null) { -- LOG.warn("the index[{}] is missing", info.getIndexId()); -- continue; -- } -- Tablet tablet = index.getTablet(info.getTabletId()); -- if (tablet == null) { -- LOG.warn("the tablet[{}] is missing", info.getTabletId()); -- continue; -- } -- -- Replica replica = 
tablet.getReplicaById(info.getReplicaId()); -- if (replica == null) { -- LOG.warn("the replica[{}] is missing", info.getReplicaId()); -- continue; -- } -- replica.updateInfo(info.getVersion(), info.getVersionHash(), -- info.getDataSize(), info.getRowCount()); -- } -- } -- } finally { -- writeUnlock(); -- } -- } finally { -- db.writeUnlock(); -- } -- -- removeDeleteJobAndSetState(deleteJob); -- LOG.info("unprotected finish asyncDeleteJob: {}", deleteJob.getJobId()); -- } -- -- public void replayDelete(DeleteInfo deleteInfo, Catalog catalog) { -- Database db = catalog.getDb(deleteInfo.getDbId()); -- db.writeLock(); -- try { -- writeLock(); -- try { -- unprotectDelete(deleteInfo, db); -- } finally { -- writeUnlock(); -- } -- } finally { -- db.writeUnlock(); -- } -- } -- -- private void checkDelete(OlapTable table, Partition partition, List conditions, -- long checkVersion, long checkVersionHash, List deleteConditions, -- Map> asyncTabletIdToBackends, boolean preCheck) -- throws DdlException { -- // check partition state -- PartitionState state = partition.getState(); -- if (state != PartitionState.NORMAL) { -- // ErrorReport.reportDdlException(ErrorCode.ERR_BAD_PARTITION_STATE, partition.getName(), state.name()); -- throw new DdlException("Partition[" + partition.getName() + "]' state is not NORNAL: " + state.name()); -- } -- -- // check running load job -- List quorumFinishedLoadJobs = Lists.newArrayList(); -- if (!checkPartitionLoadFinished(partition.getId(), quorumFinishedLoadJobs)) { -- // ErrorReport.reportDdlException(ErrorCode.ERR_PARTITION_HAS_LOADING_JOBS, partition.getName()); -- throw new DdlException("Partition[" + partition.getName() + "] has unfinished load jobs"); -- } -- -- // get running async delete job -- List asyncDeleteJobs = getCopiedAsyncDeleteJobs(); -- -- // check condition column is key column and condition value -- Map nameToColumn = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); -- for (Column column : table.getBaseSchema()) { -- nameToColumn.put(column.getName(), column); -- } -- for (Predicate condition : conditions) { -- SlotRef slotRef = null; -- if (condition instanceof BinaryPredicate) { -- BinaryPredicate binaryPredicate = (BinaryPredicate) condition; -- slotRef = (SlotRef) binaryPredicate.getChild(0); -- } else if (condition instanceof IsNullPredicate) { -- IsNullPredicate isNullPredicate = (IsNullPredicate) condition; -- slotRef = (SlotRef) isNullPredicate.getChild(0); -- } -- String columnName = slotRef.getColumnName(); -- if (!nameToColumn.containsKey(columnName)) { -- ErrorReport.reportDdlException(ErrorCode.ERR_BAD_FIELD_ERROR, columnName, table.getName()); -- } -- -- Column column = nameToColumn.get(columnName); -- if (!column.isKey()) { -- // ErrorReport.reportDdlException(ErrorCode.ERR_NOT_KEY_COLUMN, columnName); -- throw new DdlException("Column[" + columnName + "] is not key column"); -- } -- -- if (condition instanceof BinaryPredicate) { -- String value = null; -- try { -- BinaryPredicate binaryPredicate = (BinaryPredicate) condition; -- value = ((LiteralExpr) binaryPredicate.getChild(1)).getStringValue(); -- LiteralExpr.create(value, Type.fromPrimitiveType(column.getDataType())); -- } catch (AnalysisException e) { -- // ErrorReport.reportDdlException(ErrorCode.ERR_INVALID_VALUE, value); -- throw new DdlException("Invalid column value[" + value + "]"); -- } -- } -- -- // set schema column name -- slotRef.setCol(column.getName()); -- } -- -- long tableId = table.getId(); -- long partitionId = partition.getId(); -- Map> indexIdToSchema = 
table.getIndexIdToSchema(); -- for (MaterializedIndex index : partition.getMaterializedIndices()) { -- // check table has condition column -- Map indexNameToColumn = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); -- for (Column column : indexIdToSchema.get(index.getId())) { -- indexNameToColumn.put(column.getName(), column); -- } -- String indexName = table.getIndexNameById(index.getId()); -- for (Predicate condition : conditions) { -- String columnName = null; -- if (condition instanceof BinaryPredicate) { -- BinaryPredicate binaryPredicate = (BinaryPredicate) condition; -- columnName = ((SlotRef) binaryPredicate.getChild(0)).getColumnName(); -- } else if (condition instanceof IsNullPredicate) { -- IsNullPredicate isNullPredicate = (IsNullPredicate) condition; -- columnName = ((SlotRef) isNullPredicate.getChild(0)).getColumnName(); -- } -- Column column = indexNameToColumn.get(columnName); -- if (column == null) { -- ErrorReport.reportDdlException(ErrorCode.ERR_BAD_FIELD_ERROR, columnName, indexName); -- } -- -- if (table.getKeysType() == KeysType.DUP_KEYS && !column.isKey()) { -- throw new DdlException("Column[" + columnName + "] is not key column in index[" + indexName + "]"); -- } -- } -- -- // check replica version and backend alive -- short replicationNum = table.getPartitionInfo().getReplicationNum(partition.getId()); -- for (Tablet tablet : index.getTablets()) { -- Set needAsyncBackendIds = Sets.newHashSet(); -- for (Replica replica : tablet.getReplicas()) { -- if (!Catalog.getCurrentSystemInfo().checkBackendAvailable(replica.getBackendId())) { -- LOG.warn("backend[{}] is not alive when delete check. pre: {}", -- replica.getBackendId(), preCheck); -- needAsyncBackendIds.add(replica.getBackendId()); -- continue; -- } -- -- // check replica version. -- // here is a little bit confused. the main idea is -- // 1. check if replica catch up the version -- // 2. if not catch up and this is pre check, make sure there will be right quorum finished load jobs -- // to fill the version gap between 'replica committed version' and 'partition committed version'. -- // 3. if not catch up and this is after check -- // 1) if diff version == 1, some sync delete task may failed. add async delete task. -- // 2) if diff version > 1, make sure there will be right quorum finished load jobs -- // to fill the version gap between 'replica committed version' and 'delete version - 1'. -- // if ok, add async delete task. -- if (!replica.checkVersionCatchUp(checkVersion, checkVersionHash)) { -- long replicaVersion = replica.getVersion(); -- if (replicaVersion == checkVersion) { -- // in this case, version is same but version hash is not. -- // which mean the current replica version is a non-committed version. -- // so the replica's committed version should be the previous one. 
-- --replicaVersion; -- } -- -- // the *diffVersion* is number of versions need to be check -- // for now: -- // *replicaVersion* : the 'committed version' of the replica -- // *checkVersion* : -- // 1) if preCheck, this is partition committed version -- // 2) if not preCheck, this is delete version -- long diffVersion = checkVersion - replicaVersion; -- Preconditions.checkState(diffVersion > 0); -- for (int i = 1; i <= diffVersion; i++) { -- boolean find = false; -- long theVersion = replicaVersion + i; -- for (LoadJob loadJob : quorumFinishedLoadJobs) { -- if (theVersion == loadJob.getPartitionLoadInfo(tableId, partitionId).getVersion()) { -- find = true; -- break; -- } -- } -- -- for (AsyncDeleteJob deleteJob : asyncDeleteJobs) { -- if (tableId == deleteJob.getTableId() && partitionId == deleteJob.getPartitionId() -- && theVersion == deleteJob.getPartitionVersion()) { -- find = true; -- break; -- } -- } -- -- if (!find) { -- if (theVersion == checkVersion && !preCheck) { -- // the sync delete task of this replica may failed. -- // add async delete task after. -- continue; -- } else { -- // this should not happend. add log to observe. -- LOG.error("replica version does not catch up with version: {}-{}. " -- + "replica: {}-{}-{}-{}", -- checkVersion, checkVersionHash, replica.getId(), tablet.getId(), -- replica.getBackendId(), replica.getState()); -- throw new DdlException("Replica[" + tablet.getId() + "-" + replica.getId() -- + "] is not catch up with version: " + checkVersion + "-" -- + replica.getVersion()); -- } -- } -- } -- -- needAsyncBackendIds.add(replica.getBackendId()); -- } // end check replica version -- } // end for replicas -- -- if (replicationNum - needAsyncBackendIds.size() < replicationNum / 2 + 1) { -- String backendsStr = Joiner.on(", ").join(needAsyncBackendIds); -- LOG.warn("too many unavailable replica in tablet[{}], backends:[{}]", tablet.getId(), backendsStr); -- throw new DdlException("Too many replicas are not available. Wait 10 mins and try again." -- + " if still not work, contact Palo RD"); -- } -- -- if (!needAsyncBackendIds.isEmpty()) { -- LOG.info("add tablet[{}] to async delete. 
backends: {}", -- tablet.getId(), needAsyncBackendIds); -- asyncTabletIdToBackends.put(tablet.getId(), needAsyncBackendIds); -- } -- } // end for tablets -- } // end for indices -- -- if (deleteConditions == null) { -- return; -- } -- -- // save delete conditions -- for (Predicate condition : conditions) { -- if (condition instanceof BinaryPredicate) { -- BinaryPredicate binaryPredicate = (BinaryPredicate) condition; -- SlotRef slotRef = (SlotRef) binaryPredicate.getChild(0); -- String columnName = slotRef.getColumnName(); -- StringBuilder sb = new StringBuilder(); -- sb.append(columnName).append(" ").append(binaryPredicate.getOp().name()).append(" \"") -- .append(((LiteralExpr) binaryPredicate.getChild(1)).getStringValue()).append("\""); -- deleteConditions.add(sb.toString()); -- } else if (condition instanceof IsNullPredicate) { -- IsNullPredicate isNullPredicate = (IsNullPredicate) condition; -- SlotRef slotRef = (SlotRef) isNullPredicate.getChild(0); -- String columnName = slotRef.getColumnName(); -- StringBuilder sb = new StringBuilder(); -- sb.append(columnName); -- if (isNullPredicate.isNotNull()) { -- sb.append(" IS NOT NULL"); -- } else { -- sb.append(" IS NULL"); -- } -- deleteConditions.add(sb.toString()); -- } -- } -- } -- -- private void checkAndAddRunningSyncDeleteJob(long partitionId, String partitionName) throws DdlException { -- // check if there are synchronized delete job under going -- writeLock(); -- try { -- checkHasRunningSyncDeleteJob(partitionId, partitionName); -- partitionUnderDelete.add(partitionId); -- } finally { -- writeUnlock(); -- } -- } -- -- private void checkHasRunningSyncDeleteJob(long partitionId, String partitionName) throws DdlException { -- // check if there are synchronized delete job under going -- readLock(); -- try { -- if (partitionUnderDelete.contains(partitionId)) { -- throw new DdlException("Partition[" + partitionName + "] has running delete job. See 'SHOW DELETE'"); -- } -- } finally { -- readUnlock(); -- } -- } -- -- private void checkHasRunningAsyncDeleteJob(long partitionId, String partitionName) throws DdlException { -- readLock(); -- try { -- for (AsyncDeleteJob job : idToQuorumFinishedDeleteJob.values()) { -- if (job.getPartitionId() == partitionId) { -- throw new DdlException("Partition[" + partitionName + "] has running delete job. " -- + "See 'SHOW DELETE'"); -- } -- } -- } finally { -- readUnlock(); -- } -- } -- -- public void checkHashRunningDeleteJob(long partitionId, String partitionName) throws DdlException { -- checkHasRunningSyncDeleteJob(partitionId, partitionName); -- checkHasRunningAsyncDeleteJob(partitionId, partitionName); -- } -- -- public void delete(DeleteStmt stmt) throws DdlException { -- String dbName = stmt.getDbName(); -- String tableName = stmt.getTableName(); -- String partitionName = stmt.getPartitionName(); -- List conditions = stmt.getDeleteConditions(); -- Database db = Catalog.getInstance().getDb(dbName); -- if (db == null) { -- throw new DdlException("Db does not exist. name: " + dbName); -- } -- -- DeleteInfo deleteInfo = null; -- -- long tableId = -1; -- long partitionId = -1; -- long committedVersion = -1; -- long committedVersionHash = -1; -- long newVersion = -1; -- long newVersionHash = -1; -- AgentBatchTask deleteBatchTask = null; -- int totalReplicaNum = 0; -- Map> asyncTabletIdToBackends = Maps.newHashMap(); -- db.readLock(); -- try { -- Table table = db.getTable(tableName); -- if (table == null) { -- throw new DdlException("Table does not exist. 
name: " + tableName); -- } -- -- if (table.getType() != TableType.OLAP) { -- throw new DdlException("Not olap type table. type: " + table.getType().name()); -- } -- OlapTable olapTable = (OlapTable) table; -- -- if (olapTable.getState() != OlapTableState.NORMAL) { -- throw new DdlException("Table's state is not normal: " + tableName); -- } -- -- tableId = olapTable.getId(); -- Partition partition = olapTable.getPartition(partitionName); -- if (partition == null) { -- throw new DdlException("Partition does not exist. name: " + partitionName); -- } -- partitionId = partition.getId(); -- -- // pre check -- committedVersion = partition.getCommittedVersion(); -- committedVersionHash = partition.getCommittedVersionHash(); -- checkDelete(olapTable, partition, conditions, committedVersion, committedVersionHash, -- null, asyncTabletIdToBackends, true); -- -- newVersion = committedVersion + 1; -- newVersionHash = Math.abs(new Random().nextLong()); -- deleteInfo = new DeleteInfo(db.getId(), tableId, tableName, -- partition.getId(), partitionName, -- newVersion, newVersionHash, null); -- -- checkAndAddRunningSyncDeleteJob(deleteInfo.getPartitionId(), partitionName); -- -- // create sync delete tasks -- deleteBatchTask = new AgentBatchTask(); -- for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { -- int schemaHash = olapTable.getSchemaHashByIndexId(materializedIndex.getId()); -- for (Tablet tablet : materializedIndex.getTablets()) { -- long tabletId = tablet.getId(); -- for (Replica replica : tablet.getReplicas()) { -- -- if (asyncTabletIdToBackends.containsKey(tabletId) -- && asyncTabletIdToBackends.get(tabletId).contains(replica.getBackendId())) { -- continue; -- } -- -- AgentTask pushTask = new PushTask(null, replica.getBackendId(), db.getId(), -- tableId, partition.getId(), -- materializedIndex.getId(), tabletId, replica.getId(), -- schemaHash, newVersion, -- newVersionHash, null, -1L, 0, -1L, TPushType.DELETE, -- conditions, false, TPriority.HIGH); -- if (AgentTaskQueue.addTask(pushTask)) { -- deleteBatchTask.addTask(pushTask); -- ++totalReplicaNum; -- } -- } -- } -- } -- } finally { -- db.readUnlock(); -- } -- -- // send tasks to backends -- MarkedCountDownLatch countDownLatch = new MarkedCountDownLatch(totalReplicaNum); -- for (AgentTask task : deleteBatchTask.getAllTasks()) { -- countDownLatch.addMark(task.getBackendId(), task.getSignature()); -- ((PushTask) task).setCountDownLatch(countDownLatch); -- } -- AgentTaskExecutor.submit(deleteBatchTask); -- long timeout = Config.tablet_delete_timeout_second * 1000L * totalReplicaNum; -- boolean ok = false; -- try { -- ok = countDownLatch.await(timeout, TimeUnit.MILLISECONDS); -- } catch (InterruptedException e) { -- LOG.warn("InterruptedException: ", e); -- ok = false; -- } -- -- if (!ok) { -- // sync delete failed for unknown reason. -- // use async delete to try to make up after. -- LOG.warn("sync delete failed. try async delete. table: {}, partition: {}", tableName, partitionName); -- } -- -- Partition partition = null; -- try { -- // after check -- db.writeLock(); -- try { -- OlapTable table = (OlapTable) db.getTable(tableName); -- if (table == null) { -- throw new DdlException("Table does not exist. name: " + tableName); -- } -- -- partition = table.getPartition(partitionName); -- if (partition == null) { -- throw new DdlException("Partition does not exist. name: " + partitionName); -- } -- -- // after check -- // 1. 
check partition committed version first -- if (partition.getCommittedVersion() > committedVersion -- || (committedVersion == partition.getCommittedVersion() -- && committedVersionHash != partition.getCommittedVersionHash())) { -- LOG.warn("before delete version: {}-{}. after delete version: {}-{}", -- committedVersion, committedVersionHash, -- partition.getCommittedVersion(), partition.getCommittedVersionHash()); -- throw new DdlException("There may have some load job done during delete job. Try again"); -- } -- -- // 2. after check -- List deleteConditions = Lists.newArrayList(); -- checkDelete(table, partition, conditions, newVersion, newVersionHash, deleteConditions, -- asyncTabletIdToBackends, false); -- deleteInfo.setDeleteConditions(deleteConditions); -- -- // update partition's version -- updatePartitionVersion(partition, newVersion, newVersionHash, -1); -- -- for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { -- long indexId = materializedIndex.getId(); -- for (Tablet tablet : materializedIndex.getTablets()) { -- long tabletId = tablet.getId(); -- for (Replica replica : tablet.getReplicas()) { -- ReplicaPersistInfo info = -- ReplicaPersistInfo.createForCondDelete(indexId, -- tabletId, -- replica.getId(), -- replica.getVersion(), -- replica.getVersionHash(), -- replica.getDataSize(), -- replica.getRowCount()); -- deleteInfo.addReplicaPersistInfo(info); -- } -- } -- } -- -- writeLock(); -- try { -- // handle async delete jobs -- if (!asyncTabletIdToBackends.isEmpty()) { -- AsyncDeleteJob asyncDeleteJob = new AsyncDeleteJob(db.getId(), tableId, partition.getId(), -- newVersion, newVersionHash, -- conditions); -- for (Long tabletId : asyncTabletIdToBackends.keySet()) { -- asyncDeleteJob.addTabletId(tabletId); -- } -- deleteInfo.setAsyncDeleteJob(asyncDeleteJob); -- idToQuorumFinishedDeleteJob.put(asyncDeleteJob.getJobId(), asyncDeleteJob); -- LOG.info("finished create async delete job: {}", asyncDeleteJob.getJobId()); -- } -- -- // save delete info -- List deleteInfos = dbToDeleteInfos.get(db.getId()); -- if (deleteInfos == null) { -- deleteInfos = Lists.newArrayList(); -- dbToDeleteInfos.put(db.getId(), deleteInfos); -- } -- deleteInfos.add(deleteInfo); -- } finally { -- writeUnlock(); -- } -- -- // Write edit log -- Catalog.getInstance().getEditLog().logFinishSyncDelete(deleteInfo); -- LOG.info("delete job finished at: {}. 
table: {}, partition: {}", -- TimeUtils.longToTimeString(System.currentTimeMillis()), tableName, partitionName); -- } finally { -- db.writeUnlock(); -- } -- } catch (Exception e) { -- // cancel delete -- // need not save cancel delete task in AgentTaskQueue -- AgentBatchTask cancelDeleteBatchTask = new AgentBatchTask(); -- for (AgentTask task : deleteBatchTask.getAllTasks()) { -- PushTask pushTask = (PushTask) task; -- CancelDeleteTask cancelDeleteTask = -- new CancelDeleteTask(task.getBackendId(), task.getDbId(), task.getTableId(), -- task.getPartitionId(), task.getIndexId(), task.getTabletId(), -- pushTask.getSchemaHash(), pushTask.getVersion(), -- pushTask.getVersionHash()); -- cancelDeleteBatchTask.addTask(cancelDeleteTask); -- } -- if (cancelDeleteBatchTask.getTaskNum() > 0) { -- AgentTaskExecutor.submit(cancelDeleteBatchTask); -- } -- -- String failMsg = "delete fail, " + e.getMessage(); -- LOG.warn(failMsg); -- throw new DdlException(failMsg); -- } finally { -- // clear tasks -- List tasks = deleteBatchTask.getAllTasks(); -- for (AgentTask task : tasks) { -- PushTask pushTask = (PushTask) task; -- AgentTaskQueue.removePushTask(pushTask.getBackendId(), pushTask.getSignature(), -- pushTask.getVersion(), pushTask.getVersionHash(), -- pushTask.getPushType()); -- } -- -- writeLock(); -- try { -- partitionUnderDelete.remove(partitionId); -- } finally { -- writeUnlock(); -- } -- } -- } -- -- public List> getAsyncDeleteJobInfo(long jobId) { -- LinkedList> infos = new LinkedList>(); -- readLock(); -- try { -- AsyncDeleteJob job = idToQuorumFinishedDeleteJob.get(jobId); -- if (job == null) { -- return infos; -- } -- -- for (Long tabletId : job.getTabletIds()) { -- List info = Lists.newArrayList(); -- info.add(tabletId); -- infos.add(info); -- } -- } finally { -- readUnlock(); -- } -- -- return infos; -- } -- -- public int getDeleteInfoNum(long dbId) { -- readLock(); -- try { -- if (dbToDeleteInfos.containsKey(dbId)) { -- return dbToDeleteInfos.get(dbId).size(); -- } -- return 0; -- } finally { -- readUnlock(); -- } -- } -- -- public List> getDeleteInfosByDb(long dbId, boolean forUser) { -- LinkedList> infos = new LinkedList>(); -- Database db = Catalog.getInstance().getDb(dbId); -- if (db == null) { -- return infos; -- } -- -- String dbName = db.getFullName(); -- readLock(); -- try { -- List deleteInfos = dbToDeleteInfos.get(dbId); -- if (deleteInfos == null) { -- return infos; -- } -- -- for (DeleteInfo deleteInfo : deleteInfos) { -- if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, -- deleteInfo.getTableName(), -- PrivPredicate.LOAD)) { -- continue; -- } -- -- List info = Lists.newArrayList(); -- if (!forUser) { -- info.add(deleteInfo.getJobId()); -- info.add(deleteInfo.getTableId()); -- } -- info.add(deleteInfo.getTableName()); -- if (!forUser) { -- info.add(deleteInfo.getPartitionId()); -- } -- info.add(deleteInfo.getPartitionName()); -- -- info.add(TimeUtils.longToTimeString(deleteInfo.getCreateTimeMs())); -- String conds = Joiner.on(", ").join(deleteInfo.getDeleteConditions()); -- info.add(conds); -- -- if (!forUser) { -- info.add(deleteInfo.getPartitionVersion()); -- info.add(deleteInfo.getPartitionVersionHash()); -- } -- -- info.add(deleteInfo.getState().name()); -- infos.add(info); -- } -- -- } finally { -- readUnlock(); -- } -- -- // sort by createTimeMs -- int sortIndex; -- if (!forUser) { -- sortIndex = 4; -- } else { -- sortIndex = 2; -- } -- ListComparator> comparator = new ListComparator>(sortIndex); -- Collections.sort(infos, comparator); -- 
return infos; -- } -- -- public void removeOldDeleteJobs() { -- long currentTimeMs = System.currentTimeMillis(); -- -- writeLock(); -- try { -- Iterator>> iter1 = dbToDeleteInfos.entrySet().iterator(); -- while (iter1.hasNext()) { -- Map.Entry> entry = iter1.next(); -- Iterator iter2 = entry.getValue().iterator(); -- while (iter2.hasNext()) { -- DeleteInfo deleteInfo = iter2.next(); -- if ((currentTimeMs - deleteInfo.getCreateTimeMs()) / 1000 > Config.label_keep_max_second) { -- iter2.remove(); -- } -- } -- -- if (entry.getValue().isEmpty()) { -- iter1.remove(); -- } -- } -- } finally { -- writeUnlock(); -- } -- } -- -- public void removeDbDeleteJob(long dbId) { -- writeLock(); -- try { -- dbToDeleteInfos.remove(dbId); -- } finally { -- writeUnlock(); -- } -- } -- -- public LoadJob getLastestFinishedLoadJob(long dbId) { -- LoadJob job = null; -- readLock(); -- try { -- long maxTime = Long.MIN_VALUE; -- List jobs = dbToLoadJobs.get(dbId); -- if (jobs != null) { -- for (LoadJob loadJob : jobs) { -- if (loadJob.getState() != JobState.QUORUM_FINISHED && loadJob.getState() != JobState.FINISHED) { -- continue; -- } -- if (loadJob.getLoadFinishTimeMs() > maxTime) { -- maxTime = loadJob.getLoadFinishTimeMs(); -- job = loadJob; -- } -- } -- } -- } finally { -- readUnlock(); -- } -- -- return job; -- } -- -- public DeleteInfo getLastestFinishedDeleteInfo(long dbId) { -- DeleteInfo deleteInfo = null; -- readLock(); -- try { -- long maxTime = Long.MIN_VALUE; -- List deleteInfos = dbToDeleteInfos.get(dbId); -- if (deleteInfos != null) { -- for (DeleteInfo info : deleteInfos) { -- if (info.getCreateTimeMs() > maxTime) { -- maxTime = info.getCreateTimeMs(); -- deleteInfo = info; -- } -- } -- } -- } finally { -- readUnlock(); -- } -- return deleteInfo; -- } -- -- public Integer getLoadJobNumByTypeAndState(EtlJobType type, JobState state) { -- int num = 0; -- readLock(); -- try { -- Map jobMap = null; -- if (state == null || state == JobState.CANCELLED || state == JobState.FINISHED) { -- jobMap = idToLoadJob; -- } else { -- switch (state) { -- case PENDING: -- jobMap = idToPendingLoadJob; -- break; -- case ETL: -- jobMap = idToEtlLoadJob; -- break; -- case LOADING: -- jobMap = idToLoadingLoadJob; -- break; -- case QUORUM_FINISHED: -- jobMap = idToQuorumFinishedLoadJob; -- break; -- default: -- break; -- } -- } -- Preconditions.checkNotNull(jobMap); -- -- for (LoadJob job : jobMap.values()) { -- if (job.getEtlJobType() == type) { -- if (state != null && job.getState() != state) { -- continue; -- } -- ++num; -- } -- } -- -- } finally { -- readUnlock(); -- } -- return num; -- } --} -- diff --git a/fe/src/main/java/com/baidu/palo/load/LoadChecker.java b/fe/src/main/java/com/baidu/palo/load/LoadChecker.java index ae5a1d8579..336e50f3ac 100644 --- a/fe/src/main/java/com/baidu/palo/load/LoadChecker.java +++ b/fe/src/main/java/com/baidu/palo/load/LoadChecker.java @@ -15,48 +15,54 @@ package com.baidu.palo.load; +import com.baidu.palo.alter.RollupJob; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Database; -import com.baidu.palo.catalog.MaterializedIndex; -import com.baidu.palo.catalog.MaterializedIndex.IndexState; -import com.baidu.palo.catalog.OlapTable; -import com.baidu.palo.catalog.Partition; import com.baidu.palo.catalog.Replica; import com.baidu.palo.catalog.Replica.ReplicaState; +import com.baidu.palo.transaction.GlobalTransactionMgr; +import com.baidu.palo.transaction.TabletCommitInfo; +import com.baidu.palo.transaction.TransactionCommitFailedException; +import 
com.baidu.palo.transaction.TransactionState; +import com.baidu.palo.transaction.TransactionStatus; +import com.baidu.palo.catalog.MaterializedIndex.IndexState; +import com.baidu.palo.catalog.OlapTable; import com.baidu.palo.catalog.Tablet; +import com.baidu.palo.catalog.MaterializedIndex; +import com.baidu.palo.catalog.Partition; +import com.baidu.palo.catalog.TabletInvertedIndex; import com.baidu.palo.clone.Clone; import com.baidu.palo.clone.CloneJob.JobPriority; import com.baidu.palo.clone.CloneJob.JobType; import com.baidu.palo.common.Config; +import com.baidu.palo.common.MetaNotFoundException; import com.baidu.palo.common.util.Daemon; import com.baidu.palo.load.AsyncDeleteJob.DeleteState; import com.baidu.palo.load.FailMsg.CancelType; import com.baidu.palo.load.LoadJob.EtlJobType; import com.baidu.palo.load.LoadJob.JobState; -import com.baidu.palo.persist.ReplicaPersistInfo; import com.baidu.palo.task.AgentBatchTask; -import com.baidu.palo.task.AgentTask; import com.baidu.palo.task.AgentTaskExecutor; import com.baidu.palo.task.AgentTaskQueue; import com.baidu.palo.task.HadoopLoadEtlTask; +import com.baidu.palo.task.MiniLoadEtlTask; +import com.baidu.palo.task.MiniLoadPendingTask; import com.baidu.palo.task.HadoopLoadPendingTask; import com.baidu.palo.task.InsertLoadEtlTask; import com.baidu.palo.task.MasterTask; import com.baidu.palo.task.MasterTaskExecutor; -import com.baidu.palo.task.MiniLoadEtlTask; -import com.baidu.palo.task.MiniLoadPendingTask; import com.baidu.palo.task.PullLoadEtlTask; import com.baidu.palo.task.PullLoadPendingTask; import com.baidu.palo.task.PushTask; import com.baidu.palo.thrift.TPriority; import com.baidu.palo.thrift.TPushType; - -import com.google.common.base.Preconditions; +import com.baidu.palo.thrift.TTaskType; import com.google.common.collect.Maps; -import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.LogManager; +import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -231,11 +237,6 @@ public class LoadChecker extends Daemon { private void runOneLoadingJob(LoadJob job) { // check timeout Load load = Catalog.getInstance().getLoadInstance(); - if (checkTimeout(job)) { - load.cancelLoadJob(job, CancelType.TIMEOUT, "loading timeout to cancel"); - return; - } - // get db long dbId = job.getDbId(); Database db = Catalog.getInstance().getDb(dbId); @@ -244,6 +245,38 @@ public class LoadChecker extends Daemon { return; } + if (job.getTransactionId() < 0) { + LOG.warn("cancel load job {} because it is an old type job, user should resubmit it", job); + load.cancelLoadJob(job, CancelType.UNKNOWN, "cancelled because system is during upgrade, user should resubmit it"); + return; + } + // check if the job is aborted in transaction manager + TransactionState state = Catalog.getCurrentGlobalTransactionMgr() + .getTransactionState(job.getTransactionId()); + if (state == null) { + LOG.warn("cancel load job {} because could not find transaction state", job); + load.cancelLoadJob(job, CancelType.UNKNOWN, "transaction state lost"); + return; + } + if (state.getTransactionStatus() == TransactionStatus.ABORTED) { + load.cancelLoadJob(job, CancelType.LOAD_RUN_FAIL, + "job is aborted in transaction manager [" + state + "]"); + return; + } else if (state.getTransactionStatus() == TransactionStatus.COMMITTED) { + // if job is committed and then fe restart, the progress is not persisted, so that set it here + job.setProgress(100); + LOG.debug("job {} is already committed, 
just wait it to be visible, transaction state {}", job, state); + return; + } else if (state.getTransactionStatus() == TransactionStatus.VISIBLE) { + // if the transaction is already visible (e.g. after fe restarts), just finish the load job + load.updateLoadJobState(job, JobState.FINISHED); + return; + } + + if (checkTimeout(job)) { + load.cancelLoadJob(job, CancelType.TIMEOUT, "loading timeout to cancel"); + return; + } // submit push tasks to backends Set jobTotalTablets = submitPushTasks(job, db); if (jobTotalTablets == null) { @@ -251,17 +284,57 @@ public class LoadChecker extends Daemon { return; } - // update load progress - Set quorumTablets = job.getQuorumTablets(); - job.setProgress(quorumTablets.size() * 100 / jobTotalTablets.size()); - - // check job quorum finished - if (quorumTablets.containsAll(jobTotalTablets)) { - if (load.updateLoadJobState(job, JobState.QUORUM_FINISHED)) { - LOG.info("load job quorum finished. job: {}", job); + // yiguolei: for real time load we use fully finished replicas + Set fullTablets = job.getFullTablets(); + if (state.isRunning()) { + job.setProgress(fullTablets.size() * 100 / jobTotalTablets.size()); + } else { + job.setProgress(100); + } + + long stragglerTimeout = job.isSyncDeleteJob() ? job.getDeleteJobTimeout() / 2 + : Config.load_straggler_wait_second * 1000; + if (job.getQuorumTablets().containsAll(jobTotalTablets)) { + // commit the job to the transaction manager without caring about the result; + // if the commit fails, it is retried on later rounds until the job times out + if (job.getQuorumFinishTimeMs() < 0) { + job.setQuorumFinishTimeMs(System.currentTimeMillis()); + } else if (System.currentTimeMillis() - job.getQuorumFinishTimeMs() > stragglerTimeout + || job.getFullTablets().containsAll(jobTotalTablets)) { + tryCommitJob(job, db); + } } } + + private void tryCommitJob(LoadJob job, Database db) { + // check transaction state + Load load = Catalog.getInstance().getLoadInstance(); + GlobalTransactionMgr globalTransactionMgr = Catalog.getCurrentGlobalTransactionMgr(); + TransactionState transactionState = globalTransactionMgr.getTransactionState(job.getTransactionId()); + List tabletCommitInfos = new ArrayList(); + // when a be finishes a load task, fe updates the job's finished task info; lock here to prevent + // concurrency problems + db.writeLock(); + try { + TabletInvertedIndex invertedIndex = Catalog.getCurrentInvertedIndex(); + for (Replica replica : job.getFinishedReplicas()) { + // the inverted index also contains replicas of rolling-up indexes + Long tabletId = invertedIndex.getTabletIdByReplica(replica.getId()); + if (tabletId == null) { + LOG.warn("could not find tablet id for replica {}, the tablet may have been dropped", replica); + continue; + } + tabletCommitInfos.add(new TabletCommitInfo(tabletId, replica.getBackendId())); + } + globalTransactionMgr.commitTransaction(job.getDbId(), job.getTransactionId(), tabletCommitInfos); + } catch (MetaNotFoundException | TransactionCommitFailedException e) { + LOG.warn("errors while committing transaction [{}], cancel the job {}, reason is {}", + transactionState.getTransactionId(), job, e); + load.cancelLoadJob(job, CancelType.UNKNOWN, transactionState.getReason()); + } finally { + db.writeUnlock(); + } + } private Set submitPushTasks(LoadJob job, Database db) { Map tabletLoadInfos = job.getIdToTabletLoadInfo(); @@ -269,7 +342,6 @@ public class LoadChecker extends Daemon { AgentBatchTask batchTask = new AgentBatchTask(); Set jobTotalTablets = new HashSet(); - long currentTimeMs = System.currentTimeMillis();
Map idToTableLoadInfo = job.getIdToTableLoadInfo(); for (Entry tableEntry : idToTableLoadInfo.entrySet()) { long tableId = tableEntry.getKey(); @@ -282,14 +354,20 @@ public class LoadChecker extends Daemon { } if (table == null) { LOG.warn("table does not exist. id: {}", tableId); - if (job.getState() == JobState.QUORUM_FINISHED) { - continue; - } else { - return null; + // if the table is dropped during load, the job fails + return null; + } + TableLoadInfo tableLoadInfo = tableEntry.getValue(); + // check if the job was submitted during rollup + RollupJob rollupJob = (RollupJob) Catalog.getInstance().getRollupHandler().getAlterJob(tableId); + boolean autoLoadToTwoTablet = true; + if (rollupJob != null && job.getTransactionId() > 0) { + long rollupIndexId = rollupJob.getRollupIndexId(); + if (tableLoadInfo.containsIndex(rollupIndexId)) { + autoLoadToTwoTablet = false; } } - - TableLoadInfo tableLoadInfo = tableEntry.getValue(); + for (Entry partitionEntry : tableLoadInfo.getIdToPartitionLoadInfo().entrySet()) { long partitionId = partitionEntry.getKey(); PartitionLoadInfo partitionLoadInfo = partitionEntry.getValue(); @@ -302,119 +380,69 @@ public class LoadChecker extends Daemon { Partition partition = table.getPartition(partitionId); if (partition == null) { LOG.warn("partition does not exist. id: {}", partitionId); - if (job.getState() == JobState.QUORUM_FINISHED) { - continue; - } else { - return null; - } + // if the partition is dropped during load, the job fails + return null; } short replicationNum = table.getPartitionInfo().getReplicationNum(partition.getId()); - long version = partitionLoadInfo.getVersion(); - long versionHash = partitionLoadInfo.getVersionHash(); // check all indices (base + roll up (not include ROLLUP state index)) List indices = partition.getMaterializedIndices(); for (MaterializedIndex index : indices) { long indexId = index.getId(); + // if the index is in rollup, do not load into it; be will automatically convert the data if (index.getState() == IndexState.ROLLUP) { - // XXX(cmy): this should not happend anymore. - // index with ROLLUP state is all in RollupJob instance - // observe and then remove LOG.error("skip table under rollup[{}]", indexId); continue; } + // 1. the load job's etl is started before rollup finished + // 2. rollup job comes into finishing state, add rollup index to catalog + // 3. load job's etl finished, begin to load + // 4. load will send data to the new rollup index, but could not get its schema hash, so the load will fail + if (!tableLoadInfo.containsIndex(indexId)) { + if (rollupJob.getRollupIndexId() == indexId) { + continue; + } else { + // if the index is not under rollup and not contained in the table load info, it is a fatal error + // return null, which will cancel the load job + LOG.warn("could not find index {} in table load info, and could not find " + + "it in rollup job, it is a fatal error", indexId); + return null; + } + } // add to jobTotalTablets first. for (Tablet tablet : index.getTablets()) { - jobTotalTablets.add(tablet.getId()); - } - - // after rollup finished, we should push the next version to rollup index first to clear the - // relationship between base and rollup index. - // so here we check if rollup index is push finished before sending push task - // to base index.
- long rollupIndexId = index.getRollupIndexId(); - if (rollupIndexId != -1L) { - MaterializedIndex rollupIndex = partition.getIndex(rollupIndexId); - if (rollupIndex != null) { - long rollupFinishedVersion = index.getRollupFinishedVersion(); - Preconditions.checkState(rollupFinishedVersion != -1L); - if (version == rollupFinishedVersion + 1) { - // this materializedIndex is a base index. - // and this push version is the next version after rollup finished. - // check if rollup index is push finished with this version - boolean pushFinished = true; - for (Tablet tablet : rollupIndex.getTablets()) { - for (Replica replica : tablet.getReplicas()) { - if (!replica.checkVersionCatchUp(version, versionHash)) { - LOG.debug("waiting for rollup replica[{}] push " - + "next version[{}] version hash[{}]. " - + "current version[{}] version hash[{}]. " - + "tablet[{}]", - replica.getId(), version, versionHash, - replica.getVersion(), replica.getVersionHash(), - tablet.getId()); - pushFinished = false; - break; - } - } // end for replicas - if (!pushFinished) { - break; - } - } // end for tablets - - if (!pushFinished) { - // skip this base index - continue; - } else { - // rollup index is push finished - // clear rollup info in base index - LOG.info("clear rollup index[{}] info in base index[{}]" - + " after finished pushing version[{}] in partition[{}]", - rollupIndex.getId(), indexId, version, partitionId); - index.clearRollupIndexInfo(); - // log - ReplicaPersistInfo info = - ReplicaPersistInfo.createForClearRollupInfo(db.getId(), - tableId, - partitionId, - indexId); - Catalog.getInstance().getEditLog().logClearRollupIndexInfo(info); - } - } else { - // XXX(cmy): - // this should not happend. add log to observe - LOG.error("base index[{}], rollup index[{}], push version[{}], rollup version[{}]", - indexId, rollupIndexId, version, rollupFinishedVersion); - index.clearRollupIndexInfo(); - // log - ReplicaPersistInfo info = - ReplicaPersistInfo.createForClearRollupInfo(db.getId(), - tableId, - partitionId, - indexId); - Catalog.getInstance().getEditLog().logClearRollupIndexInfo(info); - } - } else { - // this can only happend when rollup index has been dropped. 
- // do nothing - Preconditions.checkState(true); + // the job is submmitted before rollup finished and try to finish after rollup finished + // then the job's tablet load info does not contain the new rollup index's tablet + // not deal with this case because the finished replica will include new rollup index's replica + // and check it at commit time + if (tabletLoadInfos.containsKey(tablet.getId())) { + jobTotalTablets.add(tablet.getId()); } - } // end for handling rollup + } int schemaHash = tableLoadInfo.getIndexSchemaHash(indexId); short quorumNum = (short) (replicationNum / 2 + 1); for (Tablet tablet : index.getTablets()) { long tabletId = tablet.getId(); - // get tablet file path TabletLoadInfo tabletLoadInfo = tabletLoadInfos.get(tabletId); + // the tabletinfo maybe null, in this case: + // the job is submmitted before rollup finished and try to finish after rollup finished + // then the job's tablet load info does not contain the new rollup index's tablet + // not deal with this case because the finished replica will include new rollup index's replica + // and check it at commit time + if (tabletLoadInfo == null) { + continue; + } String filePath = tabletLoadInfo.getFilePath(); long fileSize = tabletLoadInfo.getFileSize(); // get push type TPushType type = TPushType.LOAD; - if (job.getDeleteFlag()) { + if (job.isSyncDeleteJob()) { + type = TPushType.DELETE; + } else if (job.getDeleteFlag()) { type = TPushType.LOAD_DELETE; } @@ -424,60 +452,31 @@ public class LoadChecker extends Daemon { for (Replica replica : tablet.getReplicas()) { long replicaId = replica.getId(); allReplicas.add(replicaId); - + // yiguolei: real time load do not need check replica state and version, version hashs // check replica state and replica version - ReplicaState state = replica.getState(); - boolean checkByState = (state == ReplicaState.NORMAL - || state == ReplicaState.SCHEMA_CHANGE); - long replicaVersion = replica.getVersion(); - long replicaVersionHash = replica.getVersionHash(); - // rules: - // 1. replica's version is the previous version of this load, and version hash is valid - // ATTN: we don't save the previous committed version hash, - // so... there is no way we can handle this situation... - // will fix it in trunk - // or - // 2. replica's version is equal to load version, but version hash is not equal. 
- boolean checkByVersion = (replicaVersion == version - 1) - || (replicaVersion == version && replicaVersionHash != versionHash); - - if (checkByState && checkByVersion) { - if (!tabletLoadInfo.isReplicaSent(replicaId)) { - AgentTask pushTask = new PushTask(job.getResourceInfo(), - replica.getBackendId(), db.getId(), tableId, - partitionId, indexId, - tabletId, replicaId, schemaHash, - version, versionHash, filePath, fileSize, 0, - job.getId(), type, null, - needDecompress, job.getPriority()); - if (AgentTaskQueue.addTask(pushTask)) { - batchTask.addTask(pushTask); - job.addPushTask((PushTask) pushTask); - tabletLoadInfo.addSentReplica(replicaId); - } + if (!tabletLoadInfo.isReplicaSent(replicaId)) { + PushTask pushTask = new PushTask(job.getResourceInfo(), + replica.getBackendId(), db.getId(), tableId, + partitionId, indexId, + tabletId, replicaId, schemaHash, + -1, 0, filePath, fileSize, 0, + job.getId(), type, job.getConditions(), + needDecompress, job.getPriority(), + TTaskType.REALTIME_PUSH, + job.getTransactionId(), + Catalog.getCurrentGlobalTransactionMgr().getTransactionIDGenerator().getNextTransactionId()); + pushTask.setIsSchemaChanging(autoLoadToTwoTablet); + if (AgentTaskQueue.addTask(pushTask)) { + batchTask.addTask(pushTask); + job.addPushTask((PushTask) pushTask); + tabletLoadInfo.addSentReplica(replicaId); } - } else if (replicaVersion > version - || (replicaVersion == version && replicaVersionHash == versionHash)) { + } + // yiguolei: wait here to check if quorum finished, should exclude the replica that is in clone state + // for example, there are 3 replicas, A normal B normal C clone, if A and C finish loading, we should not commit + // because commit will failed, then the job is failed + if (job.isReplicaFinished(replicaId) && replica.getLastFailedVersion() < 0) { finishedReplicas.add(replicaId); - // add replica persist info - long dataSize = replica.getDataSize(); - long rowCount = replica.getRowCount(); - ReplicaPersistInfo info = ReplicaPersistInfo.createForLoad(tableId, partitionId, - indexId, tabletId, - replicaId, - replicaVersion, - replicaVersionHash, - dataSize, rowCount); - job.addReplicaPersistInfos(info); - } else { - if (replicaVersion != version || replicaVersionHash != versionHash) { - LOG.warn("replica version is lower than job. replica: {}-{}-{}-{}-{}, " - + "replica state: {}, replica version: {}, replica version hash: {}," - + " job version: {}, job version hash: {}, backend id: {}", - db.getId(), tableId, partitionId, tabletId, replicaId, - state, replicaVersion, replicaVersionHash, - version, versionHash, replica.getBackendId()); - } } } // end for replicas @@ -485,8 +484,9 @@ public class LoadChecker extends Daemon { LOG.error("invalid situation. tablet is empty. id: {}", tabletId); } - // check tablet push statis - if (finishedReplicas.size() >= quorumNum) { + // check tablet push states + // quorum tablets and full tablets should be in tabletload infos or the process will > 100% + if (finishedReplicas.size() >= quorumNum && tabletLoadInfos.containsKey(tabletId)) { job.addQuorumTablet(tabletId); if (finishedReplicas.size() == allReplicas.size()) { job.addFullTablet(tabletId); @@ -540,195 +540,11 @@ public class LoadChecker extends Daemon { load.cancelLoadJob(job, CancelType.LOAD_RUN_FAIL, "db does not exist. 
id: " + dbId); return; } - - // submit push tasks to backends - Set jobTotalTablets = submitPushTasks(job, db); - if (jobTotalTablets == null) { - LOG.warn("submit push tasks fail"); + // if the job is quorum finished, just set it to finished and clear related etl job + if (load.updateLoadJobState(job, JobState.FINISHED)) { + load.clearJob(job, JobState.QUORUM_FINISHED); return; } - - // check job finished - boolean isJobFinished = false; - if (job.getFullTablets().containsAll(jobTotalTablets)) { - // db lock and check - LOG.info("all tablets have been finished, check all replicas version. job id: {}", job.getId()); - db.writeLock(); - try { - boolean allReplicaFinished = true; - Map idToTableLoadInfo = job.getIdToTableLoadInfo(); - OUTER_LOOP: - for (Entry tableEntry : idToTableLoadInfo.entrySet()) { - long tableId = tableEntry.getKey(); - OlapTable table = (OlapTable) db.getTable(tableId); - if (table == null) { - LOG.warn("table does not exist. id: {}", tableId); - continue; - } - - TableLoadInfo tableLoadInfo = tableEntry.getValue(); - for (Entry partitionEntry : - tableLoadInfo.getIdToPartitionLoadInfo().entrySet()) { - long partitionId = partitionEntry.getKey(); - PartitionLoadInfo partitionLoadInfo = partitionEntry.getValue(); - if (!partitionLoadInfo.isNeedLoad()) { - LOG.debug("partition does not have data to load. table id: {}, partition id: {}", - tableId, partitionId); - continue; - } - - long version = partitionLoadInfo.getVersion(); - long versionHash = partitionLoadInfo.getVersionHash(); - - // all tables - Partition partition = table.getPartition(partitionId); - if (partition == null) { - LOG.warn("partition does not exist. id: " + partitionId); - continue; - } - List indices = partition.getMaterializedIndices(); - for (MaterializedIndex materializedIndex : indices) { - if (materializedIndex.getState() == IndexState.ROLLUP) { - LOG.debug("index state is rollup. id: {}", materializedIndex.getId()); - continue; - } - - for (Tablet tablet : materializedIndex.getTablets()) { - for (Replica replica : tablet.getReplicas()) { - // check version - long replicaVersion = replica.getVersion(); - long replicaVersionHash = replica.getVersionHash(); - if ((replicaVersion == version && replicaVersionHash != versionHash) - || replicaVersion < version) { - LOG.warn("replica does not catch up job version. replica: {}-{}-{}-{}-{}" - + ". replica version: {}, replica version hash: {}" - + ", partition version: {}, partition version hash: {}," - + " backend id: {}", dbId, tableId, partitionId, tablet.getId(), - replica.getId(), replicaVersion, replicaVersionHash, - version, versionHash, replica.getBackendId()); - allReplicaFinished = false; - break OUTER_LOOP; - } - } // end for replicas - } // end for tablets - } // end for indices - } // end for partitions - } // end for tables - - if (allReplicaFinished) { - if (load.updateLoadJobState(job, JobState.FINISHED)) { - isJobFinished = true; - LOG.info("load job finished. job: {}", job); - } - } - } finally { - db.writeUnlock(); - } - - // clear - if (isJobFinished) { - load.clearJob(job, JobState.QUORUM_FINISHED); - return; - } - } - - // handle if job is stay in QUORUM_FINISHED for a long time - // maybe this job cannot be done. 
try to use Clone to finish this job - long currentTimeMs = System.currentTimeMillis(); - if ((currentTimeMs - job.getCreateTimeMs()) / 1000 > Config.quorum_load_job_max_second) { - LOG.warn("load job {} stay in QUORUM_FINISHED for a long time.", job.getId()); - - db.readLock(); - try { - Map idToTableLoadInfo = job.getIdToTableLoadInfo(); - for (Entry tableEntry : idToTableLoadInfo.entrySet()) { - long tableId = tableEntry.getKey(); - OlapTable table = (OlapTable) db.getTable(tableId); - if (table == null) { - continue; - } - - TableLoadInfo tableLoadInfo = tableEntry.getValue(); - for (Entry partitionEntry : tableLoadInfo.getIdToPartitionLoadInfo() - .entrySet()) { - long partitionId = partitionEntry.getKey(); - PartitionLoadInfo partitionLoadInfo = partitionEntry.getValue(); - if (!partitionLoadInfo.isNeedLoad()) { - continue; - } - - long version = partitionLoadInfo.getVersion(); - long versionHash = partitionLoadInfo.getVersionHash(); - - // all tables - Partition partition = table.getPartition(partitionId); - if (partition == null) { - continue; - } - List indices = partition.getMaterializedIndices(); - for (MaterializedIndex materializedIndex : indices) { - if (materializedIndex.getState() == IndexState.ROLLUP) { - continue; - } - - Clone clone = Catalog.getInstance().getCloneInstance(); - for (Tablet tablet : materializedIndex.getTablets()) { - if (clone.containsTablet(tablet.getId())) { - // this tablet already has a clone job - continue; - } - for (Replica replica : tablet.getReplicas()) { - // check version - long replicaVersion = replica.getVersion(); - long replicaVersionHash = replica.getVersionHash(); - boolean checkByState = (replica.getState() == ReplicaState.NORMAL - || replica.getState() == ReplicaState.SCHEMA_CHANGE); - boolean checkByVersion = - (replicaVersion == version - 1 - || (replicaVersion == version && replicaVersionHash != versionHash)); - - if (checkByState && checkByVersion) { - // find a dest backend - Set tabletBeIds = tablet.getBackendIds(); - List destBeId = null; - int tryTime = tabletBeIds.size() + 1; - LOG.debug("tryTime: {}, tablet be: {}", tryTime, tabletBeIds); - do { - destBeId = Catalog.getCurrentSystemInfo().seqChooseBackendIds( - 1, true, false, db.getClusterName()); - LOG.debug("descBeId: {}", destBeId); - --tryTime; - if (destBeId == null) { - break; - } - } while (tabletBeIds.contains(destBeId.get(0)) && tryTime > 0); - - if (destBeId != null && !tabletBeIds.contains(destBeId.get(0))) { - if (clone.addCloneJob(dbId, tableId, partitionId, - materializedIndex.getId(), - tablet.getId(), destBeId.get(0), - JobType.MIGRATION, JobPriority.HIGH, - Config.clone_job_timeout_second * 1000L)) { - LOG.info("try use Clone to finish load job: {}. Tablet: {}, desc be: {}" - + "lower version replica: {} " - + "with version {} in be: {}", - job.getId(), tablet.getId(), destBeId.get(0), - replica.getId(), replicaVersion, replica.getBackendId()); - } - } else { - LOG.warn("failed to choose be to do Clone. Load job: {}. 
tablet: {}", - job.getId(), tablet.getId()); - } - } - } // end for replicas - } // end for tablets - } // end for indices - } // end for partitions - } // end for tables - } finally { - db.readUnlock(); - } - } } private void runOneQuorumFinishedDeleteJob(AsyncDeleteJob job) { @@ -741,91 +557,13 @@ public class LoadChecker extends Daemon { } db.readLock(); try { - long tableId = job.getTableId(); - OlapTable olapTable = (OlapTable) db.getTable(tableId); - if (olapTable == null) { - load.removeDeleteJobAndSetState(job); - return; - } - - long partitionId = job.getPartitionId(); - Partition partition = olapTable.getPartition(partitionId); - if (partition == null) { - load.removeDeleteJobAndSetState(job); - return; - } - - boolean allReplicaFinished = true; - long jobId = job.getJobId(); - long version = job.getPartitionVersion(); - long versionHash = job.getPartitionVersionHash(); - Set tabletIds = job.getTabletIds(); - AgentBatchTask batchTask = new AgentBatchTask(); - for (MaterializedIndex index : partition.getMaterializedIndices()) { - long indexId = index.getId(); - int schemaHash = olapTable.getSchemaHashByIndexId(indexId); - for (Tablet tablet : index.getTablets()) { - long tabletId = tablet.getId(); - if (!tabletIds.contains(tabletId)) { - continue; - } - for (Replica replica : tablet.getReplicas()) { - // check version - long replicaVersion = replica.getVersion(); - long replicaVersionHash = replica.getVersionHash(); - if ((replicaVersion == version && replicaVersionHash != versionHash) - || replicaVersion < version) { - LOG.warn("delete job:{}. replica does not catch up job version. replica: {}-{}-{}-{}-{}" - + ". replica version: {}, replica version hash: {}" - + ", partition version: {}, partition version hash: {}," - + " backend id: {}, replica state: {}", - jobId, dbId, tableId, partitionId, tabletId, replica.getId(), - replicaVersion, replicaVersionHash, version, versionHash, - replica.getBackendId(), replica.getState()); - allReplicaFinished = false; - - if (replica.getState() != ReplicaState.NORMAL) { - allReplicaFinished = false; - continue; - } - - if (replicaVersion != version && replicaVersion != version - 1) { - continue; - } - - if (!job.hasSend(replica.getId())) { - PushTask task = new PushTask(null, replica.getBackendId(), - dbId, tableId, partitionId, - indexId, tabletId, replica.getId(), - schemaHash, version, - versionHash, null, -1, - -1, job.getJobId(), TPushType.DELETE, - job.getConditions(), false, TPriority.HIGH); - task.setIsSyncDelete(false); - task.setAsyncDeleteJobId(job.getJobId()); - if (AgentTaskQueue.addTask(task)) { - batchTask.addTask(task); - job.setIsSend(replica.getId(), task); - } - } - } - } - } - } // end for indices - - AgentTaskExecutor.submit(batchTask); - - if (allReplicaFinished) { - // clear and finish delete job - job.clearTasks(); - job.setState(DeleteState.FINISHED); - - // log - Catalog.getInstance().getEditLog().logFinishAsyncDelete(job); - load.removeDeleteJobAndSetState(job); - LOG.info("delete job {} finished", job.getJobId()); - } - + // if the delete job is quorum finished, just set it to finished + job.clearTasks(); + job.setState(DeleteState.FINISHED); + // log + Catalog.getInstance().getEditLog().logFinishAsyncDelete(job); + load.removeDeleteJobAndSetState(job); + LOG.info("delete job {} finished", job.getJobId()); } finally { db.readUnlock(); } diff --git a/fe/src/main/java/com/baidu/palo/load/LoadJob.java b/fe/src/main/java/com/baidu/palo/load/LoadJob.java index c016ec08a1..b6ffc24f08 100644 --- 
a/fe/src/main/java/com/baidu/palo/load/LoadJob.java +++ b/fe/src/main/java/com/baidu/palo/load/LoadJob.java @@ -15,18 +15,29 @@ package com.baidu.palo.load; +import com.baidu.palo.analysis.BinaryPredicate; +import com.baidu.palo.analysis.BinaryPredicate.Operator; import com.baidu.palo.analysis.BrokerDesc; +import com.baidu.palo.analysis.IsNullPredicate; +import com.baidu.palo.analysis.LiteralExpr; +import com.baidu.palo.analysis.Predicate; +import com.baidu.palo.analysis.SlotRef; +import com.baidu.palo.analysis.StringLiteral; import com.baidu.palo.catalog.Catalog; +import com.baidu.palo.catalog.Replica; +import com.baidu.palo.common.Config; import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.io.Text; import com.baidu.palo.common.io.Writable; import com.baidu.palo.load.FailMsg.CancelType; import com.baidu.palo.persist.ReplicaPersistInfo; import com.baidu.palo.task.PushTask; +import com.baidu.palo.thrift.TEtlState; import com.baidu.palo.thrift.TPriority; import com.baidu.palo.thrift.TResourceInfo; import com.google.common.base.Strings; +import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; @@ -36,8 +47,10 @@ import org.apache.logging.log4j.Logger; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.Collection; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; @@ -59,7 +72,8 @@ public class LoadJob implements Writable { HADOOP, MINI, INSERT, - BROKER + BROKER, + DELETE } private static final int DEFAULT_TIMEOUT_S = 0; @@ -69,6 +83,8 @@ public class LoadJob implements Writable { private long id; private long dbId; private String label; + // when this job is a real time load job, the job is attach with a transaction + private long transactionId = -1; long timestamp; private int timeoutSecond; private double maxFilterRatio; @@ -90,6 +106,8 @@ public class LoadJob implements Writable { private long etlFinishTimeMs; private long loadStartTimeMs; private long loadFinishTimeMs; + // not serialize it + private long quorumFinishTimeMs; private FailMsg failMsg; private EtlJobType etlJobType; @@ -101,6 +119,11 @@ public class LoadJob implements Writable { private Set fullTablets; private Set pushTasks; private Map replicaPersistInfos; + + private Map finishedReplicas; + + private List conditions = null; + private DeleteInfo deleteInfo; private TResourceInfo resourceInfo; @@ -109,7 +132,7 @@ public class LoadJob implements Writable { private long execMemLimit; // save table names for auth check - private Set tableNames; + private Set tableNames = Sets.newHashSet(); public LoadJob() { this(""); @@ -118,11 +141,65 @@ public class LoadJob implements Writable { public LoadJob(String label) { this(label, DEFAULT_TIMEOUT_S, DEFAULT_MAX_FILTER_RATIO); } + + // convert an async delete job to load job + public LoadJob(long id, long dbId, long tableId, long partitionId, String label, + Map indexIdToSchemaHash, List deleteConditions, + DeleteInfo deleteInfo) { + this.id = id; + this.dbId = dbId; + this.label = label; + this.transactionId = -1; + this.timestamp = -1; + this.timeoutSecond = DEFAULT_TIMEOUT_S; + this.deleteFlag = true; + this.state = JobState.LOADING; + this.progress = 0; + this.createTimeMs = System.currentTimeMillis(); + this.etlStartTimeMs = -1; + this.etlFinishTimeMs = -1; + this.loadStartTimeMs = -1; + this.loadFinishTimeMs = -1; + this.quorumFinishTimeMs = -1; + this.failMsg = new 
FailMsg(CancelType.UNKNOWN, ""); + this.etlJobType = EtlJobType.DELETE; + EtlStatus etlStatus = new EtlStatus(); + etlStatus.setState(TEtlState.FINISHED); + // has to use hadoop etl job info, because replay thread use hadoop job info + HadoopEtlJobInfo hadoopEtlJobInfo = new HadoopEtlJobInfo(); + hadoopEtlJobInfo.setCluster(""); + hadoopEtlJobInfo.setEtlOutputDir(""); + this.etlJobInfo = hadoopEtlJobInfo; + this.etlJobInfo.setJobStatus(etlStatus); + this.idToTableLoadInfo = Maps.newHashMap();; + this.idToTabletLoadInfo = Maps.newHashMap();; + this.quorumTablets = new HashSet(); + this.fullTablets = new HashSet(); + this.pushTasks = new HashSet(); + this.replicaPersistInfos = Maps.newHashMap(); + this.resourceInfo = null; + this.priority = TPriority.NORMAL; + this.execMemLimit = DEFAULT_EXEC_MEM_LIMIT; + this.finishedReplicas = Maps.newHashMap(); + + // generate table load info + PartitionLoadInfo partitionLoadInfo = new PartitionLoadInfo(null); + Map idToPartitionLoadInfo = new HashMap<>(); + idToPartitionLoadInfo.put(partitionId, partitionLoadInfo); + TableLoadInfo tableLoadInfo = new TableLoadInfo(idToPartitionLoadInfo); + tableLoadInfo.addAllSchemaHash(indexIdToSchemaHash); + idToTableLoadInfo.put(tableId, tableLoadInfo); + + // add delete conditions to load job + this.conditions = deleteConditions; + this.deleteInfo = deleteInfo; + } public LoadJob(String label, int timeoutSecond, double maxFilterRatio) { this.id = -1; this.dbId = -1; this.label = label; + this.transactionId = -1; this.timestamp = -1; this.timeoutSecond = timeoutSecond; this.maxFilterRatio = maxFilterRatio; @@ -134,6 +211,7 @@ public class LoadJob implements Writable { this.etlFinishTimeMs = -1; this.loadStartTimeMs = -1; this.loadFinishTimeMs = -1; + this.quorumFinishTimeMs = -1; this.failMsg = new FailMsg(CancelType.UNKNOWN, ""); this.etlJobType = EtlJobType.HADOOP; this.etlJobInfo = new HadoopEtlJobInfo(); @@ -146,7 +224,7 @@ public class LoadJob implements Writable { this.resourceInfo = null; this.priority = TPriority.NORMAL; this.execMemLimit = DEFAULT_EXEC_MEM_LIMIT; - this.tableNames = Sets.newHashSet(); + this.finishedReplicas = Maps.newHashMap(); } public void addTableName(String tableName) { @@ -172,6 +250,14 @@ public class LoadJob implements Writable { public void setDbId(long dbId) { this.dbId = dbId; } + + public long getTransactionId() { + return transactionId; + } + + public void setTransactionId(long transactionId) { + this.transactionId = transactionId; + } public String getLabel() { return label; @@ -291,6 +377,14 @@ public class LoadJob implements Writable { break; } } + + public long getQuorumFinishTimeMs() { + return quorumFinishTimeMs; + } + + public void setQuorumFinishTimeMs(long quorumFinishTimeMs) { + this.quorumFinishTimeMs = quorumFinishTimeMs; + } public FailMsg getFailMsg() { return failMsg; @@ -476,6 +570,10 @@ public class LoadJob implements Writable { public Set getQuorumTablets() { return quorumTablets; } + + public void clearQuorumTablets() { + quorumTablets.clear(); + } public void addFullTablet(long tabletId) { fullTablets.add(tabletId); @@ -510,7 +608,43 @@ public class LoadJob implements Writable { public TResourceInfo getResourceInfo() { return resourceInfo; } + + public boolean addFinishedReplica(Replica replica) { + finishedReplicas.put(replica.getId(), replica); + return true; + } + + public boolean isReplicaFinished(long replicaId) { + return finishedReplicas.containsKey(replicaId); + } + + public Collection getFinishedReplicas() { + return finishedReplicas.values(); + } + 
public List getConditions() { + return conditions; + } + + public boolean isSyncDeleteJob() { + if (conditions != null) { + return true; + } + return false; + } + + public DeleteInfo getDeleteInfo() { + return deleteInfo; + } + + public long getDeleteJobTimeout() { + long timeout = Math.max(idToTabletLoadInfo.size() + * Config.tablet_delete_timeout_second * 1000L, + 60000L); + timeout = Math.min(timeout, 300000L); + return timeout; + } + @Override public String toString() { return "LoadJob [id=" + id + ", dbId=" + dbId + ", label=" + label + ", timeoutSecond=" + timeoutSecond @@ -518,7 +652,8 @@ public class LoadJob implements Writable { + ", progress=" + progress + ", createTimeMs=" + createTimeMs + ", etlStartTimeMs=" + etlStartTimeMs + ", etlFinishTimeMs=" + etlFinishTimeMs + ", loadStartTimeMs=" + loadStartTimeMs + ", loadFinishTimeMs=" + loadFinishTimeMs + ", failMsg=" + failMsg + ", etlJobType=" + etlJobType - + ", etlJobInfo=" + etlJobInfo + ", priority=" + priority + "]"; + + ", etlJobInfo=" + etlJobInfo + ", priority=" + priority + ", transactionId=" + transactionId + + ", quorumFinishTimeMs=" + quorumFinishTimeMs +"]"; } public void clearRedundantInfoForHistoryJob() { @@ -657,6 +792,45 @@ public class LoadJob implements Writable { } out.writeLong(execMemLimit); + out.writeLong(transactionId); + + if (conditions != null) { + out.writeBoolean(true); + count = conditions.size(); + out.writeInt(count); + for (Predicate predicate : conditions) { + if (predicate instanceof BinaryPredicate) { + BinaryPredicate binaryPredicate = (BinaryPredicate) predicate; + SlotRef slotRef = (SlotRef) binaryPredicate.getChild(0); + String columnName = slotRef.getColumnName(); + Text.writeString(out, columnName); + Text.writeString(out, binaryPredicate.getOp().name()); + String value = ((LiteralExpr) binaryPredicate.getChild(1)).getStringValue(); + Text.writeString(out, value); + } else if (predicate instanceof IsNullPredicate) { + IsNullPredicate isNullPredicate = (IsNullPredicate) predicate; + SlotRef slotRef = (SlotRef) isNullPredicate.getChild(0); + String columnName = slotRef.getColumnName(); + Text.writeString(out, columnName); + Text.writeString(out, "IS"); + String value = null; + if (isNullPredicate.isNotNull()) { + value = "NOT NULL"; + } else { + value = "NULL"; + } + Text.writeString(out, value); + } + } + } else { + out.writeBoolean(false); + } + if (deleteInfo != null) { + out.writeBoolean(true); + deleteInfo.write(out); + } else { + out.writeBoolean(false); + } out.writeInt(tableNames.size()); for (String tableName : tableNames) { @@ -776,6 +950,38 @@ public class LoadJob implements Writable { this.execMemLimit = in.readLong(); } + if (version >= FeMetaVersion.VERSION_45) { + this.transactionId = in.readLong(); + if (in.readBoolean()) { + count = in.readInt(); + conditions = Lists.newArrayList(); + for (int i = 0; i < count; i++) { + String key = Text.readString(in); + String opStr = Text.readString(in); + if (opStr.equalsIgnoreCase("IS")) { + String value = Text.readString(in); + IsNullPredicate predicate; + if (value.equalsIgnoreCase("NOT NULL")) { + predicate = new IsNullPredicate(new SlotRef(null, key), true); + } else { + predicate = new IsNullPredicate(new SlotRef(null, key), true); + } + conditions.add(predicate); + } else { + Operator op = Operator.valueOf(opStr); + String value = Text.readString(in); + BinaryPredicate predicate = new BinaryPredicate(op, new SlotRef(null, key), + new StringLiteral(value)); + conditions.add(predicate); + } + } + } + if (in.readBoolean()) { + 
this.deleteInfo = new DeleteInfo(); + this.deleteInfo.readFields(in); + } + } + if (version >= FeMetaVersion.VERSION_43) { int size = in.readInt(); for (int i = 0; i < size; i++) { diff --git a/fe/src/main/java/com/baidu/palo/load/TableLoadInfo.java b/fe/src/main/java/com/baidu/palo/load/TableLoadInfo.java index 3555cce3a8..229c279c52 100644 --- a/fe/src/main/java/com/baidu/palo/load/TableLoadInfo.java +++ b/fe/src/main/java/com/baidu/palo/load/TableLoadInfo.java @@ -37,6 +37,13 @@ public class TableLoadInfo implements Writable { this.idToPartitionLoadInfo = idToPartitionLoadInfo; this.indexIdToSchemaHash = Maps.newHashMap(); } + + public boolean containsIndex(long indexId) { + if (indexIdToSchemaHash.containsKey(indexId)) { + return true; + } + return false; + } public Map getIdToPartitionLoadInfo() { return idToPartitionLoadInfo; diff --git a/fe/src/main/java/com/baidu/palo/master/MasterImpl.java b/fe/src/main/java/com/baidu/palo/master/MasterImpl.java index 59361f48b6..7ba7f99626 100644 --- a/fe/src/main/java/com/baidu/palo/master/MasterImpl.java +++ b/fe/src/main/java/com/baidu/palo/master/MasterImpl.java @@ -24,11 +24,13 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Database; import com.baidu.palo.catalog.MaterializedIndex; import com.baidu.palo.catalog.OlapTable; +import com.baidu.palo.catalog.OlapTable.OlapTableState; import com.baidu.palo.catalog.Partition; import com.baidu.palo.catalog.Partition.PartitionState; import com.baidu.palo.catalog.Replica; import com.baidu.palo.catalog.Tablet; import com.baidu.palo.catalog.TabletInvertedIndex; +import com.baidu.palo.catalog.TabletMeta; import com.baidu.palo.common.MetaNotFoundException; import com.baidu.palo.load.AsyncDeleteJob; import com.baidu.palo.load.LoadJob; @@ -37,9 +39,12 @@ import com.baidu.palo.system.Backend; import com.baidu.palo.task.AgentTask; import com.baidu.palo.task.AgentTaskQueue; import com.baidu.palo.task.CheckConsistencyTask; +import com.baidu.palo.task.ClearAlterTask; +import com.baidu.palo.task.ClearTransactionTask; import com.baidu.palo.task.CloneTask; import com.baidu.palo.task.CreateReplicaTask; import com.baidu.palo.task.CreateRollupTask; +import com.baidu.palo.task.PublishVersionTask; import com.baidu.palo.task.DirMoveTask; import com.baidu.palo.task.DownloadTask; import com.baidu.palo.task.PushTask; @@ -142,6 +147,20 @@ public class MasterImpl { Preconditions.checkState(request.isSetReport_version()); finishPush(task, request); break; + case REALTIME_PUSH: + checkHasTabletInfo(request); + Preconditions.checkState(request.isSetReport_version()); + finishRealtimePush(task, request); + break; + case PUBLISH_VERSION: + finishPublishVersion(task, request); + break; + case CLEAR_ALTER_TASK: + finishClearAlterTask(task, request); + break; + case CLEAR_TRANSACTION_TASK: + finishClearTransactionTask(task, request); + break; case DROP: finishDropReplica(task); break; @@ -176,6 +195,9 @@ public class MasterImpl { case MOVE: finishMoveDirTask(task, request); break; + case RECOVER_TABLET: + finishRecoverTablet(task); + break; default: break; } @@ -217,6 +239,165 @@ public class MasterImpl { AgentTaskQueue.removeTask(task.getBackendId(), TTaskType.CREATE, task.getSignature()); } + private void finishRealtimePush(AgentTask task, TFinishTaskRequest request) { + List finishTabletInfos = request.getFinish_tablet_infos(); + Preconditions.checkState(finishTabletInfos != null && !finishTabletInfos.isEmpty()); + + PushTask pushTask = (PushTask) task; + + long dbId = pushTask.getDbId(); + long backendId = 
pushTask.getBackendId(); + long signature = task.getSignature(); + Database db = Catalog.getInstance().getDb(dbId); + if (db == null) { + AgentTaskQueue.removeTask(backendId, TTaskType.REALTIME_PUSH, signature); + return; + } + + long tableId = pushTask.getTableId(); + long partitionId = pushTask.getPartitionId(); + long pushIndexId = pushTask.getIndexId(); + long pushTabletId = pushTask.getTabletId(); + // push finish type: + // numOfFinishTabletInfos tabletId schemaHash + // Normal: 1 / / + // SchemaChangeHandler 2 same diff + // RollupHandler 2 diff diff + // + // reuse enum 'PartitionState' here as 'push finish type' + PartitionState pushState = null; + if (finishTabletInfos.size() == 1) { + pushState = PartitionState.NORMAL; + } else if (finishTabletInfos.size() == 2) { + if (finishTabletInfos.get(0).getTablet_id() == finishTabletInfos.get(1).getTablet_id()) { + pushState = PartitionState.SCHEMA_CHANGE; + } else { + pushState = PartitionState.ROLLUP; + } + } else { + LOG.warn("invalid push report infos. finishTabletInfos' size: " + finishTabletInfos.size()); + return; + } + LOG.debug("push report state: {}", pushState.name()); + + db.writeLock(); + try { + OlapTable olapTable = (OlapTable) db.getTable(tableId); + if (olapTable == null) { + throw new MetaNotFoundException("cannot find table[" + tableId + "] when push finished"); + } + + Partition partition = olapTable.getPartition(partitionId); + if (partition == null) { + throw new MetaNotFoundException("cannot find partition[" + partitionId + "] when push finished"); + } + + MaterializedIndex pushIndex = partition.getIndex(pushIndexId); + if (pushIndex == null) { + // yiguolei: if index is dropped during load, it is not a failure. + // throw exception here and cause the job to cancel the task + throw new MetaNotFoundException("cannot find index[" + pushIndex + "] when push finished"); + } + + // should be done before addReplicaPersistInfos and countDownLatch + long reportVersion = request.getReport_version(); + Catalog.getCurrentSystemInfo().updateBackendReportVersion(task.getBackendId(), reportVersion, + task.getDbId()); + // handle load job + // TODO yiguolei: why delete should check request version and task version? + long loadJobId = pushTask.getLoadJobId(); + LoadJob job = Catalog.getInstance().getLoadInstance().getLoadJob(loadJobId); + if (job == null) { + throw new MetaNotFoundException("cannot find load job, job[" + loadJobId + "]"); + } + for (TTabletInfo tTabletInfo : finishTabletInfos) { + checkReplica(olapTable, partition, backendId, pushIndexId, pushTabletId, + tTabletInfo, pushState); + Replica replica = findRelatedReplica(olapTable, partition, + backendId, tTabletInfo); + // if the replica is under schema change, could not find the replica with aim schema hash + if (replica != null) { + job.addFinishedReplica(replica); + } + } + + AgentTaskQueue.removeTask(backendId, TTaskType.REALTIME_PUSH, signature); + LOG.debug("finish push replica. 
tabletId: {}, backendId: {}", pushTabletId, backendId); + } catch (MetaNotFoundException e) { + AgentTaskQueue.removeTask(backendId, TTaskType.REALTIME_PUSH, signature); + LOG.warn("finish push replica error", e); + } finally { + db.writeUnlock(); + } + } + + private void checkReplica(OlapTable olapTable, Partition partition, long backendId, + long pushIndexId, long pushTabletId, TTabletInfo tTabletInfo, PartitionState pushState) + throws MetaNotFoundException { + long tabletId = tTabletInfo.getTablet_id(); + int schemaHash = tTabletInfo.getSchema_hash(); + // during finishing stage, index's schema hash switched, when old schema hash finished + // current index hash != old schema hash and alter job's new schema hash != old schema hash + // the check replcia will failed + // should use tabletid not pushTabletid because in rollup state, the push tabletid != tabletid + // and tabletmeta will not contain rollupindex's schema hash + TabletMeta tabletMeta = Catalog.getCurrentInvertedIndex().getTabletMeta(tabletId); + if (!tabletMeta.containsSchemaHash(schemaHash)) { + throw new MetaNotFoundException("tablet[" + tabletId + + "] schemaHash is not equal to index's switchSchemaHash. " + + tabletMeta.toString()+ " vs. " + schemaHash); + } + } + + private Replica findRelatedReplica(OlapTable olapTable, Partition partition, + long backendId, TTabletInfo tTabletInfo) + throws MetaNotFoundException { + long tabletId = tTabletInfo.getTablet_id(); + long indexId = Catalog.getCurrentInvertedIndex().getIndexId(tabletId); + // both normal index and rollingup index are in inverted index + // this means the index is dropped during load + if (indexId == TabletInvertedIndex.NOT_EXIST_VALUE) { + LOG.warn("tablet[{}] may be dropped. push index[{}]", tabletId, indexId); + return null; + } + MaterializedIndex index = partition.getIndex(indexId); + if (index == null) { + // this means the index is under rollup + RollupHandler rollupHandler = Catalog.getInstance().getRollupHandler(); + AlterJob alterJob = rollupHandler.getAlterJob(olapTable.getId()); + if (alterJob == null && olapTable.getState() == OlapTableState.ROLLUP) { + // this happends when: + // a rollup job is finish and a delete job is the next first job (no load job before) + // and delete task is first send to base tablet, so it will return 2 tablets info. + // the second tablet is rollup tablet and it is no longer exist in alterJobs queue. + // just ignore the rollup tablet info. it will be handled in rollup tablet delete task report. 
+ + // add log to observe + LOG.warn("Cannot find table[{}].", olapTable.getId()); + return null; + } + RollupJob rollupJob = (RollupJob) alterJob; + MaterializedIndex rollupIndex = rollupJob.getRollupIndex(partition.getId()); + + if (rollupIndex == null) { + LOG.warn("could not find index for tablet {}", tabletId); + return null; + } + index = rollupIndex; + } + Tablet tablet = index.getTablet(tabletId); + if (tablet == null) { + LOG.warn("could not find tablet {} in rollup index {} ", tabletId, indexId); + return null; + } + Replica replica = tablet.getReplicaByBackendId(backendId); + if (replica == null) { + LOG.warn("could not find replica with backend {} in tablet {} in rollup index {} ", + backendId, tabletId, indexId); + } + return replica; + } + private void finishPush(AgentTask task, TFinishTaskRequest request) { List finishTabletInfos = request.getFinish_tablet_infos(); Preconditions.checkState(finishTabletInfos != null && !finishTabletInfos.isEmpty()); @@ -242,7 +423,7 @@ public class MasterImpl { Database db = Catalog.getInstance().getDb(dbId); if (db == null) { AgentTaskQueue.removePushTask(backendId, signature, finishVersion, finishVersionHash, - pushTask.getPushType()); + pushTask.getPushType(), pushTask.getTaskType()); return; } @@ -340,18 +521,46 @@ public class MasterImpl { } AgentTaskQueue.removePushTask(backendId, signature, finishVersion, finishVersionHash, - pushTask.getPushType()); + pushTask.getPushType(), pushTask.getTaskType()); LOG.debug("finish push replica. tabletId: {}, backendId: {}", pushTabletId, backendId); } catch (MetaNotFoundException e) { AgentTaskQueue.removePushTask(backendId, signature, finishVersion, finishVersionHash, - pushTask.getPushType()); + pushTask.getPushType(), pushTask.getTaskType()); LOG.warn("finish push replica error", e); } finally { - db.writeUnlock(); } } + private void finishClearAlterTask(AgentTask task, TFinishTaskRequest request) { + ClearAlterTask clearAlterTask = (ClearAlterTask) task; + clearAlterTask.setFinished(); + AgentTaskQueue.removeTask(task.getBackendId(), + task.getTaskType(), + task.getSignature()); + } + + private void finishClearTransactionTask(AgentTask task, TFinishTaskRequest request) { + ClearTransactionTask clearTransactionTask = (ClearTransactionTask) task; + clearTransactionTask.setFinished(); + AgentTaskQueue.removeTask(task.getBackendId(), + task.getTaskType(), + task.getSignature()); + } + + private void finishPublishVersion(AgentTask task, TFinishTaskRequest request) { + List errorTabletIds = null; + if (request.isSetError_tablet_ids()) { + errorTabletIds = request.getError_tablet_ids(); + } + PublishVersionTask publishVersionTask = (PublishVersionTask)task; + publishVersionTask.addErrorTablets(errorTabletIds); + publishVersionTask.setIsFinished(true); + AgentTaskQueue.removeTask(publishVersionTask.getBackendId(), + publishVersionTask.getTaskType(), + publishVersionTask.getSignature()); + } + private ReplicaPersistInfo updateReplicaInfo(OlapTable olapTable, Partition partition, long backendId, long pushIndexId, long pushTabletId, TTabletInfo tTabletInfo, PartitionState pushState) @@ -516,6 +725,10 @@ public class MasterImpl { AgentTaskQueue.removeTask(task.getBackendId(), TTaskType.MOVE, task.getSignature()); } } + + private void finishRecoverTablet(AgentTask task) { + AgentTaskQueue.removeTask(task.getBackendId(), TTaskType.RECOVER_TABLET, task.getSignature()); + } public TMasterResult report(TReportRequest request) throws TException { TMasterResult result = reportHandler.handleReport(request); diff 
--git a/fe/src/main/java/com/baidu/palo/master/ReportHandler.java b/fe/src/main/java/com/baidu/palo/master/ReportHandler.java index bdcaf89fc3..f6149176fc 100644 --- a/fe/src/main/java/com/baidu/palo/master/ReportHandler.java +++ b/fe/src/main/java/com/baidu/palo/master/ReportHandler.java @@ -1,5 +1,4 @@ // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -12,11 +11,12 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - package com.baidu.palo.master; import com.baidu.palo.catalog.Catalog; +import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.Database; +import com.baidu.palo.catalog.KeysType; import com.baidu.palo.catalog.MaterializedIndex; import com.baidu.palo.catalog.OlapTable; import com.baidu.palo.catalog.Partition; @@ -33,18 +33,24 @@ import com.baidu.palo.task.AgentBatchTask; import com.baidu.palo.task.AgentTask; import com.baidu.palo.task.AgentTaskExecutor; import com.baidu.palo.task.AgentTaskQueue; +import com.baidu.palo.task.ClearTransactionTask; +import com.baidu.palo.task.CreateReplicaTask; import com.baidu.palo.task.DropReplicaTask; import com.baidu.palo.task.MasterTask; +import com.baidu.palo.task.PublishVersionTask; import com.baidu.palo.task.PushTask; +import com.baidu.palo.task.RecoverTabletTask; import com.baidu.palo.task.StorageMediaMigrationTask; import com.baidu.palo.thrift.TBackend; import com.baidu.palo.thrift.TDisk; import com.baidu.palo.thrift.TMasterResult; +import com.baidu.palo.thrift.TPartitionVersionInfo; import com.baidu.palo.thrift.TPushType; import com.baidu.palo.thrift.TReportRequest; import com.baidu.palo.thrift.TStatus; import com.baidu.palo.thrift.TStatusCode; import com.baidu.palo.thrift.TStorageMedium; +import com.baidu.palo.thrift.TStorageType; import com.baidu.palo.thrift.TTablet; import com.baidu.palo.thrift.TTabletInfo; import com.baidu.palo.thrift.TTaskType; @@ -70,15 +76,12 @@ public class ReportHandler extends Daemon { private static final Logger LOG = LogManager.getLogger(ReportHandler.class); private BlockingQueue reportQueue = Queues.newLinkedBlockingQueue(); - public ReportHandler() { } - public TMasterResult handleReport(TReportRequest request) throws TException { TMasterResult result = new TMasterResult(); TStatus tStatus = new TStatus(TStatusCode.OK); result.setStatus(tStatus); - // get backend TBackend tBackend = request.getBackend(); String host = tBackend.getHost(); @@ -91,13 +94,12 @@ public class ReportHandler extends Daemon { tStatus.setError_msgs(errorMsgs); return result; } - long beId = backend.getId(); Map> tasks = null; Map disks = null; Map tablets = null; + boolean forceRecovery = false; long reportVersion = -1; - if (request.isSetTasks()) { tasks = request.getTasks(); } @@ -108,9 +110,10 @@ public class ReportHandler extends Daemon { tablets = request.getTablets(); reportVersion = request.getReport_version(); } - - ReportTask reportTask = new ReportTask(beId, tasks, disks, tablets, reportVersion); - + if (request.isSetForce_recovery()) { + forceRecovery = request.isForce_recovery(); + } + ReportTask reportTask = new ReportTask(beId, tasks, disks, tablets, reportVersion, forceRecovery); try { reportQueue.put(reportTask); } catch (InterruptedException e) { @@ -124,46 +127,111 @@ public class ReportHandler extends Daemon { 
LOG.info("receive report from be {}. current queue size: {}", backend.getId(), reportQueue.size()); return result; } - private class ReportTask extends MasterTask { private long beId; private Map> tasks; private Map disks; private Map tablets; private long reportVersion; - + private boolean forceRecovery = false; public ReportTask(long beId, Map> tasks, Map disks, - Map tablets, long reportVersion) { + Map tablets, long reportVersion, + boolean forceRecovery) { this.beId = beId; this.tasks = tasks; this.disks = disks; this.tablets = tablets; this.reportVersion = reportVersion; + this.forceRecovery = forceRecovery; } - @Override protected void exec() { if (tasks != null) { ReportHandler.taskReport(beId, tasks); } - if (disks != null) { ReportHandler.diskReport(beId, disks); } - if (tablets != null) { long backendReportVersion = Catalog.getCurrentSystemInfo().getBackendReportVersion(beId); if (reportVersion < backendReportVersion) { LOG.warn("out of date report version {} from backend[{}]. current report version[{}]", reportVersion, beId, backendReportVersion); } else { - ReportHandler.tabletReport(beId, tablets, reportVersion); + ReportHandler.tabletReport(beId, tablets, reportVersion, forceRecovery); } } } } + private static void tabletReport(long backendId, Map backendTablets, long backendReportVersion, + boolean forceRecovery) { + long start = System.currentTimeMillis(); + LOG.info("backend[{}] reports {} tablet(s). report version: {}", + backendId, backendTablets.size(), backendReportVersion); + + // storage medium map + HashMap storageMediumMap = Catalog.getInstance().getPartitionIdToStorageMediumMap(); + + // db id -> tablet id + ListMultimap tabletSyncMap = LinkedListMultimap.create(); + // db id -> tablet id + ListMultimap tabletDeleteFromMeta = LinkedListMultimap.create(); + // tablet ids which schema hash is valid + Set foundTabletsWithValidSchema = new HashSet(); + // tablet ids which schema hash is invalid + Map foundTabletsWithInvalidSchema = new HashMap(); + // storage medium -> tablet id + ListMultimap tabletMigrationMap = LinkedListMultimap.create(); + + ListMultimap transactionsToPublish = LinkedListMultimap.create(); + ListMultimap transactionsToClear = LinkedListMultimap.create(); + + List tabletRecoverTask = Lists.newArrayList(); + List createReplicaTasks = Lists.newArrayList(); + + // 1. do the diff. find out (intersection) / (be - meta) / (meta - be) + Catalog.getCurrentInvertedIndex().tabletReport(backendId, backendTablets, storageMediumMap, + tabletSyncMap, + tabletDeleteFromMeta, + foundTabletsWithValidSchema, + foundTabletsWithInvalidSchema, + tabletMigrationMap, + transactionsToPublish, + transactionsToClear, + tabletRecoverTask); + + // 2. sync + sync(backendTablets, tabletSyncMap, backendId, backendReportVersion); + + // 3. delete (meta - be) + // BE will automatically drop defective tablets. these tablets should also be dropped in catalog + deleteFromMeta(tabletDeleteFromMeta, backendId, backendReportVersion, createReplicaTasks); + + // 4. handle (be - meta) + deleteFromBackend(backendTablets, foundTabletsWithValidSchema, foundTabletsWithInvalidSchema, backendId); + + // 5. migration (ssd <-> hdd) + // disable migration because stream load does not support migration + // handleMigration(tabletMigrationMap, backendId); + + // 6. send clear transactions to be + handleClearTransactions(transactionsToClear, backendId); + + // 7. send publish version request to be + handleRepublishVersionInfo(transactionsToPublish, backendId); + + // 8. 
send recover request to + handleRecoverTablet(tabletRecoverTask, backendId, forceRecovery); + + // 9. send force create replica task to be + // handleForceCreateReplica(createReplicaTasks, backendId, forceRecovery); + + long end = System.currentTimeMillis(); + LOG.info("tablet report from backend[{}] cost: {}", backendId, (end - start)); + } + private static void taskReport(long backendId, Map> runningTasks) { LOG.info("begin to handle task report from backend {}", backendId); long start = System.currentTimeMillis(); @@ -197,7 +265,6 @@ public class ReportHandler extends Daemon { if (batchTask.getTaskNum() > 0) { AgentTaskExecutor.submit(batchTask); } - LOG.info("finished to handle task report from backend {}, diff task num: {}. cost: {} ms", backendId, batchTask.getTaskNum(), (System.currentTimeMillis() - start)); } @@ -205,7 +272,6 @@ public class ReportHandler extends Daemon { private static void diskReport(long backendId, Map backendDisks) { LOG.info("begin to handle disk report from backend {}", backendId); long start = System.currentTimeMillis(); - Backend backend = Catalog.getCurrentSystemInfo().getBackend(backendId); if (backend == null) { LOG.warn("backend doesn't exist. id: " + backendId); @@ -217,50 +283,6 @@ public class ReportHandler extends Daemon { backendId, (System.currentTimeMillis() - start)); } - private static void tabletReport(long backendId, Map backendTablets, long backendReportVersion) { - LOG.info("begin to handle tablet report from backend {}, tablet num: {}, report version: {}", - backendId, backendTablets.size(), backendReportVersion); - long start = System.currentTimeMillis(); - - // storage medium map - HashMap storageMediumMap = Catalog.getInstance().getPartitionIdToStorageMediumMap(); - - // db id -> tablet id - ListMultimap tabletSyncMap = LinkedListMultimap.create(); - // db id -> tablet id - ListMultimap tabletDeleteFromMeta = LinkedListMultimap.create(); - // tablet ids which schema hash is valid - Set foundTabletsWithValidSchema = new HashSet(); - // tablet ids which schema hash is invalid - Map foundTabletsWithInvalidSchema = new HashMap(); - // storage medium -> tablet id - ListMultimap tabletMigrationMap = LinkedListMultimap.create(); - - // 1. do the diff. find out (intersection) / (be - meta) / (meta - be) - Catalog.getCurrentInvertedIndex().tabletReport(backendId, backendTablets, storageMediumMap, - tabletSyncMap, - tabletDeleteFromMeta, - foundTabletsWithValidSchema, - foundTabletsWithInvalidSchema, - tabletMigrationMap); - - // 2. sync - sync(backendTablets, tabletSyncMap, backendId, backendReportVersion); - - // 3. delete (meta - be) - // BE will automatically drop defective tablets. these tablets should also be dropped in catalog - deleteFromMeta(tabletDeleteFromMeta, backendId, backendReportVersion); - - // 4. handle (be - meta) - deleteFromBackend(backendTablets, foundTabletsWithValidSchema, foundTabletsWithInvalidSchema, backendId); - - // 5. 
migration (ssd <-> hdd) - handleMigration(tabletMigrationMap, backendId); - - LOG.info("finished to handle tablet report from backend {}, cost: {} ms", - backendId, (System.currentTimeMillis() - start)); - } - private static void sync(Map backendTablets, ListMultimap tabletSyncMap, long backendId, long backendReportVersion) { TabletInvertedIndex invertedIndex = Catalog.getCurrentInvertedIndex(); @@ -304,7 +326,13 @@ public class ReportHandler extends Daemon { if (replica == null) { continue; } - + // yiguolei: it is very important here, if the replica is under schema change or rollup + // should ignore the report. + // eg. + // original replica import successfully, but the dest schema change replica failed + // the fe will sync the replica with the original replica, but ignore the schema change replica. + // if the last failed version is changed, then fe will think schema change successfully. + // this is an fatal error. if (replica.getState() == ReplicaState.NORMAL) { long metaVersion = replica.getVersion(); long metaVersionHash = replica.getVersionHash(); @@ -312,7 +340,8 @@ public class ReportHandler extends Daemon { long backendVersionHash = -1L; long rowCount = -1L; long dataSize = -1L; - + // schema change maybe successfully in fe, but not inform be, then be will report two schema hash + // just select the dest schema hash for (TTabletInfo tabletInfo : backendTablets.get(tabletId).getTablet_infos()) { if (tabletInfo.getSchema_hash() == schemaHash) { backendVersion = tabletInfo.getVersion(); @@ -355,7 +384,8 @@ public class ReportHandler extends Daemon { } private static void deleteFromMeta(ListMultimap tabletDeleteFromMeta, long backendId, - long backendReportVersion) { + long backendReportVersion, + List createReplicaTasks) { TabletInvertedIndex invertedIndex = Catalog.getCurrentInvertedIndex(); for (Long dbId : tabletDeleteFromMeta.keySet()) { Database db = Catalog.getInstance().getDb(dbId); @@ -409,7 +439,27 @@ public class ReportHandler extends Daemon { // so we do not delete it. List replicas = tablet.getReplicas(); if (replicas.size() <= 1) { - LOG.error("invalid situation. tablet[{}] has few replica[{}]", tabletId, replicas.size()); + LOG.error("backend [{}] invalid situation. 
tablet[{}] has few replica[{}], " + + "replica num setting is [{}]", + backendId, tabletId, replicas.size(), replicationNum); + // there is a replica in fe, but not in be and there is only one replica in this tablet + // in this case, it means data is lost + // should generate a create replica request to be to create a replica forcibly + if (replicas.size() == 1) { + short shortKeyColumnCount = olapTable.getShortKeyColumnCountByIndexId(indexId); + int schemaHash = olapTable.getSchemaHashByIndexId(indexId); + KeysType keysType = olapTable.getKeysType(); + List columns = olapTable.getSchemaByIndexId(indexId); + Set bfColumns = olapTable.getCopiedBfColumns(); + double bfFpp = olapTable.getBfFpp(); + CreateReplicaTask createReplicaTask = new CreateReplicaTask(backendId, dbId, + tableId, partitionId, indexId, tabletId, shortKeyColumnCount, + schemaHash, partition.getCommittedVersion(), + partition.getCommittedVersionHash(), keysType, + TStorageType.COLUMN, + TStorageMedium.HDD, columns, bfColumns, bfFpp, null); + createReplicaTasks.add(createReplicaTask); + } continue; } @@ -455,7 +505,6 @@ public class ReportHandler extends Daemon { AgentBatchTask batchTask = new AgentBatchTask(); for (Long tabletId : backendTablets.keySet()) { TTablet backendTablet = backendTablets.get(tabletId); - for (TTabletInfo backendTabletInfo : backendTablet.getTablet_infos()) { boolean needDelete = false; if (!foundTabletsWithValidSchema.contains(tabletId)) { @@ -516,6 +565,74 @@ public class ReportHandler extends Daemon { AgentTaskExecutor.submit(batchTask); } + private static void handleRepublishVersionInfo(ListMultimap transactionsToPublish, + long backendId) { + AgentBatchTask batchTask = new AgentBatchTask(); + for (Long transactionId : transactionsToPublish.keySet()) { + PublishVersionTask task = new PublishVersionTask(backendId, + transactionId, + transactionsToPublish.get(transactionId)); + batchTask.addTask(task); + // add to AgentTaskQueue for handling finish report. 
+ AgentTaskQueue.addTask(task); + } + AgentTaskExecutor.submit(batchTask); + } + + private static void handleRecoverTablet(List tabletRecoverTasks, long backendId, + boolean forceRecovery) { + if (tabletRecoverTasks.size() > 0) { + // print a warn log here to indicate the exceptions on the backend + LOG.warn("find {} tablets with report version less than version in meta on backend {}" + + " they need clone or force recovery, force recovery is [{}]", + tabletRecoverTasks.size(), backendId, forceRecovery); + } + if (!forceRecovery) { + return; + } + AgentBatchTask batchTask = new AgentBatchTask(); + for (RecoverTabletTask recoverTask : tabletRecoverTasks) { + batchTask.addTask(recoverTask); + AgentTaskQueue.addTask(recoverTask); + } + + AgentTaskExecutor.submit(batchTask); + } + + private static void handleForceCreateReplica(List createReplicaTasks, + long backendId, boolean forceRecovery) { + // print this warn info to indicate admin the fatal state + if (createReplicaTasks.size() > 0) { + // print a warn log here to indicate the exceptions on the backend + LOG.warn("find {} tablets with only on replica and it is on this backend {}" + + " admin need create the tablet on this backend forcibly, " + + " force recovery is [{}]", + createReplicaTasks.size(), backendId, forceRecovery); + } + if (!forceRecovery) { + return; + } + AgentBatchTask batchTask = new AgentBatchTask(); + for (CreateReplicaTask recoverTask : createReplicaTasks) { + batchTask.addTask(recoverTask); + AgentTaskQueue.addTask(recoverTask); + } + + AgentTaskExecutor.submit(batchTask); + } + + private static void handleClearTransactions(ListMultimap transactionsToClear, long backendId) { + AgentBatchTask batchTask = new AgentBatchTask(); + for (Long transactionId : transactionsToClear.keySet()) { + ClearTransactionTask clearTransactionTask = new ClearTransactionTask(backendId, + transactionId, + transactionsToClear.get(transactionId)); + batchTask.addTask(clearTransactionTask); + AgentTaskQueue.addTask(clearTransactionTask); + } + + AgentTaskExecutor.submit(batchTask); + } private static void addReplica(long tabletId, TTabletInfo backendTabletInfo, long backendId) throws MetaNotFoundException { @@ -578,21 +695,36 @@ public class ReportHandler extends Daemon { final long id = replica.getBackendId(); final Backend backend = Catalog.getCurrentSystemInfo().getBackend(id); if (backend != null && backend.isAlive() && !backend.isDecommissioned() - && replica.getState() == ReplicaState.NORMAL) { + && replica.getState() == ReplicaState.NORMAL) { replicationOnLine++; } } if (replicationOnLine < replicationNum) { long replicaId = Catalog.getInstance().getNextId(); + long lastFailedVersion = -1L; + long lastFailedVersionHash = 0L; + if (version > partition.getNextVersion() - 1) { + // this is a fatal error + throw new MetaNotFoundException("version is invalid. 
tablet[" + version + "-" + versionHash + "]" + + ", partition's max version [" + (partition.getNextVersion() - 1) + "]"); + } else if (version < partition.getCurrentVersion() + || version == partition.getCurrentVersion() + && versionHash != partition.getCurrentVersionHash()) { + lastFailedVersion = partition.getCurrentVersion(); + lastFailedVersionHash = partition.getCurrentVersionHash(); + } Replica replica = new Replica(replicaId, backendId, version, versionHash, - dataSize, rowCount, ReplicaState.NORMAL); + dataSize, rowCount, ReplicaState.NORMAL, + lastFailedVersion, lastFailedVersionHash, version, versionHash); tablet.addReplica(replica); // write edit log ReplicaPersistInfo info = ReplicaPersistInfo.createForAdd(dbId, tableId, partitionId, indexId, tabletId, backendId, replicaId, - version, versionHash, dataSize, rowCount); + version, versionHash, dataSize, rowCount, + lastFailedVersion, lastFailedVersionHash, + version, versionHash); Catalog.getInstance().getEditLog().logAddReplica(info); @@ -606,14 +738,12 @@ public class ReportHandler extends Daemon { return; } } - throw new MetaNotFoundException("replica is enough[" + replicas.size() + "-" + replicationNum + "]"); } } finally { db.writeUnlock(); } } - @Override protected void runOneCycle() { while (true) { diff --git a/fe/src/main/java/com/baidu/palo/persist/EditLog.java b/fe/src/main/java/com/baidu/palo/persist/EditLog.java index 84c4b047af..4edcb784ad 100644 --- a/fe/src/main/java/com/baidu/palo/persist/EditLog.java +++ b/fe/src/main/java/com/baidu/palo/persist/EditLog.java @@ -53,6 +53,7 @@ import com.baidu.palo.mysql.privilege.UserPropertyInfo; import com.baidu.palo.qe.SessionVariable; import com.baidu.palo.system.Backend; import com.baidu.palo.system.Frontend; +import com.baidu.palo.transaction.TransactionState; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -128,6 +129,12 @@ public class EditLog { catalog.setNextId(id + 1); break; } + case OperationType.OP_SAVE_TRANSACTION_ID: { + String idString = ((Text) journal.getData()).toString(); + long id = Long.parseLong(idString); + catalog.getCurrentGlobalTransactionMgr().getTransactionIDGenerator().initTransactionId(id + 1); + break; + } case OperationType.OP_CREATE_DB: { Database db = (Database) journal.getData(); catalog.replayCreateDb(db); @@ -261,6 +268,11 @@ public class EditLog { catalog.getRollupHandler().replayInitJob(job, catalog); break; } + case OperationType.OP_FINISHING_ROLLUP: { + RollupJob job = (RollupJob) journal.getData(); + catalog.getRollupHandler().replayFinishing(job, catalog); + break; + } case OperationType.OP_FINISH_ROLLUP: { RollupJob job = (RollupJob) journal.getData(); catalog.getRollupHandler().replayFinish(job, catalog); @@ -283,6 +295,13 @@ public class EditLog { catalog.getSchemaChangeHandler().replayInitJob(job, catalog); break; } + case OperationType.OP_FINISHING_SCHEMA_CHANGE: { + SchemaChangeJob job = (SchemaChangeJob) journal.getData(); + LOG.info("Begin to unprotect replay finishing schema change job. 
db = " + job.getDbId() + + " table = " + job.getTableId()); + catalog.getSchemaChangeHandler().replayFinishing(job, catalog); + break; + } case OperationType.OP_FINISH_SCHEMA_CHANGE: { SchemaChangeJob job = (SchemaChangeJob) journal.getData(); catalog.getSchemaChangeHandler().replayFinish(job, catalog); @@ -556,6 +575,19 @@ public class EditLog { catalog.replayUpdateClusterAndBackends(info); break; } + case OperationType.OP_UPSERT_TRANSACTION_STATE: { + final TransactionState state = (TransactionState) journal.getData(); + catalog.getCurrentGlobalTransactionMgr().replayUpsertTransactionState(state); + LOG.debug("opcode: {}, tid: {}", opCode, state.getTransactionId()); + + break; + } + case OperationType.OP_DELETE_TRANSACTION_STATE: { + final TransactionState state = (TransactionState) journal.getData(); + catalog.getCurrentGlobalTransactionMgr().replayDeleteTransactionState(state); + LOG.debug("opcode: {}, tid: {}", opCode, state.getTransactionId()); + break; + } case OperationType.OP_CREATE_REPOSITORY: { Repository repository = (Repository) journal.getData(); catalog.getBackupHandler().getRepoMgr().addAndInitRepoIfNotExist(repository, true); @@ -657,6 +689,10 @@ public class EditLog { public void logSaveNextId(long nextId) { logEdit(OperationType.OP_SAVE_NEXTID, new Text(Long.toString(nextId))); } + + public void logSaveTransactionId(long transactionId) { + logEdit(OperationType.OP_SAVE_TRANSACTION_ID, new Text(Long.toString(transactionId))); + } public void logCreateDb(Database db) { logEdit(OperationType.OP_CREATE_DB, db); @@ -742,6 +778,10 @@ public class EditLog { logEdit(OperationType.OP_START_ROLLUP, rollupJob); } + public void logFinishingRollup(RollupJob rollupJob) { + logEdit(OperationType.OP_FINISHING_ROLLUP, rollupJob); + } + public void logFinishRollup(RollupJob rollupJob) { logEdit(OperationType.OP_FINISH_ROLLUP, rollupJob); } @@ -762,6 +802,10 @@ public class EditLog { logEdit(OperationType.OP_START_SCHEMA_CHANGE, schemaChangeJob); } + public void logFinishingSchemaChange(SchemaChangeJob schemaChangeJob) { + logEdit(OperationType.OP_FINISHING_SCHEMA_CHANGE, schemaChangeJob); + } + public void logFinishSchemaChange(SchemaChangeJob schemaChangeJob) { logEdit(OperationType.OP_FINISH_SCHEMA_CHANGE, schemaChangeJob); } @@ -972,6 +1016,15 @@ public class EditLog { public void logUpdateClusterAndBackendState(BackendIdsUpdateInfo info) { logEdit(OperationType.OP_UPDATE_CLUSTER_AND_BACKENDS, info); } + + // for TransactionState + public void logInsertTransactionState(TransactionState transactionState) { + logEdit(OperationType.OP_UPSERT_TRANSACTION_STATE, transactionState); + } + + public void logDeleteTransactionState(TransactionState transactionState) { + logEdit(OperationType.OP_DELETE_TRANSACTION_STATE, transactionState); + } public void logBackupJob(BackupJob job) { logEdit(OperationType.OP_BACKUP_JOB, job); diff --git a/fe/src/main/java/com/baidu/palo/persist/OperationType.java b/fe/src/main/java/com/baidu/palo/persist/OperationType.java index 29560a9852..63a884663d 100644 --- a/fe/src/main/java/com/baidu/palo/persist/OperationType.java +++ b/fe/src/main/java/com/baidu/palo/persist/OperationType.java @@ -16,7 +16,6 @@ package com.baidu.palo.persist; public class OperationType { - public static final short OP_INVALID = -1; public static final short OP_SAVE_NEXTID = 0; public static final short OP_CREATE_DB = 1; @@ -126,9 +125,15 @@ public class OperationType { public static final short OP_ADD_BROKER = 85; public static final short OP_DROP_BROKER = 86; public static final short 
OP_DROP_ALL_BROKER = 87; - public static final short OP_UPDATE_CLUSTER_AND_BACKENDS = 88; - public static final short OP_CREATE_REPOSITORY = 89; public static final short OP_DROP_REPOSITORY = 90; + + // real time load: 100 - 108 + public static final short OP_UPSERT_TRANSACTION_STATE = 100; + public static final short OP_DELETE_TRANSACTION_STATE = 101; + public static final short OP_FINISHING_ROLLUP = 102; + public static final short OP_FINISHING_SCHEMA_CHANGE = 103; + public static final short OP_SAVE_TRANSACTION_ID = 104; + } diff --git a/fe/src/main/java/com/baidu/palo/persist/ReplicaPersistInfo.java b/fe/src/main/java/com/baidu/palo/persist/ReplicaPersistInfo.java index a315465e99..c3ffcd9f31 100644 --- a/fe/src/main/java/com/baidu/palo/persist/ReplicaPersistInfo.java +++ b/fe/src/main/java/com/baidu/palo/persist/ReplicaPersistInfo.java @@ -15,6 +15,8 @@ package com.baidu.palo.persist; +import com.baidu.palo.catalog.Catalog; +import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.common.io.Writable; import java.io.DataInput; @@ -22,7 +24,60 @@ import java.io.DataOutput; import java.io.IOException; public class ReplicaPersistInfo implements Writable { + + public enum ReplicaOperationType { + ADD(0), + CROND_DELETE(1), + DELETE(2), + CLONE(3), + LOAD(4), + ROLLUP(5), + SCHEMA_CHANGE(6), + CLEAR_ROLLUPINFO(7), + // this default op is used for the upgrade to fe meta version 45; add a default op to handle this scenario + // the old image and old persist log do not have the op field, so the op field is null when upgrading to fe meta 45 + // then fe dumps the image and tries to write the op type to it; since the op type is null, a null pointer exception is thrown + // add the default op: when reading from the image and op type == null, set op type to the default op to avoid the exception + DEFAULT_OP(8); + + private final int value; + + private ReplicaOperationType(int value) { + this.value = value; + } + + public int getValue() { + return value; + } + + public static ReplicaOperationType findByValue(int value) { + switch (value) { + case 0: + return ADD; + case 1: + return CROND_DELETE; + case 2: + return DELETE; + case 3: + return CLONE; + case 4: + return LOAD; + case 5: + return ROLLUP; + case 6: + return SCHEMA_CHANGE; + case 7: + return CLEAR_ROLLUPINFO; + case 8: + return DEFAULT_OP; + default: + return null; + } + } + } + // required + private ReplicaOperationType opType; private long dbId; private long tableId; private long partitionId; @@ -36,21 +91,36 @@ public class ReplicaPersistInfo implements Writable { private long versionHash; private long dataSize; private long rowCount; + + private long lastFailedVersion = -1L; + private long lastFailedVersionHash = 0L; + private long lastSuccessVersion = -1L; + private long lastSuccessVersionHash = 0L; public static ReplicaPersistInfo createForAdd(long dbId, long tableId, long partitionId, long indexId, long tabletId, long backendId, long replicaId, long version, - long versionHash, long dataSize, long rowCount) { - return new ReplicaPersistInfo(dbId, tableId, partitionId, indexId, tabletId, backendId, - replicaId, version, versionHash, dataSize, rowCount); + long versionHash, long dataSize, long rowCount, + long lastFailedVersion, long lastFailedVersionHash, + long lastSuccessVersion, long lastSuccessVersionHash) { + return new ReplicaPersistInfo(ReplicaOperationType.ADD, + dbId, tableId, partitionId, indexId, tabletId, backendId, + replicaId, version, versionHash, dataSize, rowCount, + lastFailedVersion, lastFailedVersionHash, + lastSuccessVersion, 
lastSuccessVersionHash); } /* * this for delete stmt operation */ public static ReplicaPersistInfo createForCondDelete(long indexId, long tabletId, long replicaId, long version, - long versionHash, long dataSize, long rowCount) { - return new ReplicaPersistInfo(-1L, -1L, -1L, indexId, tabletId, -1L, - replicaId, version, versionHash, dataSize, rowCount); + long versionHash, long dataSize, long rowCount, + long lastFailedVersion, long lastFailedVersionHash, + long lastSuccessVersion, long lastSuccessVersionHash) { + return new ReplicaPersistInfo(ReplicaOperationType.CROND_DELETE, + -1L, -1L, -1L, indexId, tabletId, -1L, + replicaId, version, versionHash, dataSize, rowCount, + lastFailedVersion, lastFailedVersionHash, + lastSuccessVersion, lastSuccessVersionHash); } /* @@ -58,59 +128,89 @@ public class ReplicaPersistInfo implements Writable { */ public static ReplicaPersistInfo createForDelete(long dbId, long tableId, long partitionId, long indexId, long tabletId, long backendId) { - return new ReplicaPersistInfo(dbId, tableId, partitionId, indexId, tabletId, backendId, - -1L, -1L, -1L, -1L, -1L); + return new ReplicaPersistInfo(ReplicaOperationType.DELETE, + dbId, tableId, partitionId, indexId, tabletId, backendId, + -1L, -1L, -1L, -1L, -1L, -1L, 0L, -1L, 0L); } public static ReplicaPersistInfo createForClone(long dbId, long tableId, long partitionId, long indexId, long tabletId, long backendId, long replicaId, long version, - long versionHash, long dataSize, long rowCount) { - return new ReplicaPersistInfo(dbId, tableId, partitionId, indexId, tabletId, backendId, replicaId, - version, versionHash, dataSize, rowCount); + long versionHash, long dataSize, long rowCount, + long lastFailedVersion, long lastFailedVersionHash, + long lastSuccessVersion, long lastSuccessVersionHash) { + return new ReplicaPersistInfo(ReplicaOperationType.CLONE, + dbId, tableId, partitionId, indexId, tabletId, backendId, replicaId, + version, versionHash, dataSize, rowCount, + lastFailedVersion, + lastFailedVersionHash, + lastSuccessVersion, + lastSuccessVersionHash); } + // for original batch load, the last success version = version, last success version hash = version hash + // last failed version = -1 public static ReplicaPersistInfo createForLoad(long tableId, long partitionId, long indexId, long tabletId, long replicaId, long version, long versionHash, long dataSize, long rowCount) { - return new ReplicaPersistInfo(-1L, tableId, partitionId, indexId, tabletId, -1L, - replicaId, version, versionHash, dataSize, rowCount); + return new ReplicaPersistInfo(ReplicaOperationType.LOAD, + -1L, tableId, partitionId, indexId, tabletId, -1L, + replicaId, version, versionHash, dataSize, + rowCount, -1L, 0L, version, versionHash); } public static ReplicaPersistInfo createForRollup(long indexId, long tabletId, long backendId, long version, - long versionHash, long dataSize, long rowCount) { - return new ReplicaPersistInfo(-1L, -1L, -1L, indexId, tabletId, backendId, -1L, - version, versionHash, dataSize, rowCount); + long versionHash, long dataSize, long rowCount, + long lastFailedVersion, long lastFailedVersionHash, + long lastSuccessVersion, long lastSuccessVersionHash) { + return new ReplicaPersistInfo(ReplicaOperationType.ROLLUP, + -1L, -1L, -1L, indexId, tabletId, backendId, -1L, + version, versionHash, dataSize, rowCount, + lastFailedVersion, lastFailedVersionHash, + lastSuccessVersion, lastSuccessVersionHash); } public static ReplicaPersistInfo createForSchemaChange(long partitionId, long indexId, long tabletId, long 
backendId, long version, long versionHash, - long dataSize, long rowCount) { - return new ReplicaPersistInfo(-1L, -1L, partitionId, indexId, tabletId, backendId, -1L, version, - versionHash, dataSize, rowCount); + long dataSize, long rowCount, + long lastFailedVersion, long lastFailedVersionHash, + long lastSuccessVersion, long lastSuccessVersionHash) { + return new ReplicaPersistInfo(ReplicaOperationType.SCHEMA_CHANGE, + -1L, -1L, partitionId, indexId, tabletId, backendId, -1L, version, + versionHash, dataSize, rowCount, lastFailedVersion, lastFailedVersionHash, + lastSuccessVersion, lastSuccessVersionHash); } public static ReplicaPersistInfo createForClearRollupInfo(long dbId, long tableId, long partitionId, long indexId) { - return new ReplicaPersistInfo(dbId, tableId, partitionId, indexId, -1L, -1L, -1L, -1L, -1L, -1L, -1L); + return new ReplicaPersistInfo(ReplicaOperationType.CLEAR_ROLLUPINFO, + dbId, tableId, partitionId, indexId, + -1L, -1L, -1L, -1L, -1L, + -1L, -1L, -1L, 0L, -1L, 0L); } public ReplicaPersistInfo() { } - private ReplicaPersistInfo(long dbId, long tableId, long partitionId, long indexId, long tabletId, long backendId, - long replicaId, long version, long versionHash, long dataSize, long rowCount) { + private ReplicaPersistInfo(ReplicaOperationType opType, long dbId, long tableId, long partitionId, long indexId, long tabletId, + long backendId, long replicaId, long version, long versionHash, long dataSize, long rowCount, + long lastFailedVersion, long lastFailedVersionHash, + long lastSuccessVersion, long lastSuccessVersionHash) { + this.opType = opType; this.dbId = dbId; this.tableId = tableId; this.partitionId = partitionId; this.indexId = indexId; this.tabletId = tabletId; this.backendId = backendId; - this.replicaId = replicaId; - this.version = version; this.versionHash = versionHash; this.dataSize = dataSize; this.rowCount = rowCount; + + this.lastFailedVersion = lastFailedVersion; + this.lastFailedVersionHash = lastFailedVersionHash; + this.lastSuccessVersion = lastSuccessVersion; + this.lastSuccessVersionHash = lastSuccessVersionHash; } public void setReplicaId(long replicaId) { @@ -137,6 +237,10 @@ public class ReplicaPersistInfo implements Writable { this.rowCount = rowCount; } + public ReplicaOperationType getOpType() { + return opType; + } + public long getDbId() { return dbId; } @@ -180,6 +284,22 @@ public class ReplicaPersistInfo implements Writable { public long getRowCount() { return rowCount; } + + public long getLastFailedVersion() { + return lastFailedVersion; + } + + public long getLastFailedVersionHash() { + return lastFailedVersionHash; + } + + public long getLastSuccessVersion() { + return lastSuccessVersion; + } + + public long getLastSuccessVersionHash() { + return lastSuccessVersionHash; + } public static ReplicaPersistInfo read(DataInput in) throws IOException { ReplicaPersistInfo replicaInfo = new ReplicaPersistInfo(); @@ -200,10 +320,17 @@ public class ReplicaPersistInfo implements Writable { out.writeLong(versionHash); out.writeLong(dataSize); out.writeLong(rowCount); + + out.writeInt(opType.value); + out.writeLong(lastFailedVersion); + out.writeLong(lastFailedVersionHash); + out.writeLong(lastSuccessVersion); + out.writeLong(lastSuccessVersionHash); } @Override public void readFields(DataInput in) throws IOException { + dbId = in.readLong(); tableId = in.readLong(); partitionId = in.readLong(); @@ -215,6 +342,17 @@ public class ReplicaPersistInfo implements Writable { versionHash = in.readLong(); dataSize = in.readLong(); rowCount = 
in.readLong(); + opType = ReplicaOperationType.DEFAULT_OP; + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_45) { + opType = ReplicaOperationType.findByValue(in.readInt()); + if (opType == null) { + throw new IOException("could not parse operation type from replica info"); + } + lastFailedVersion = in.readLong(); + lastFailedVersionHash = in.readLong(); + lastSuccessVersion = in.readLong(); + lastSuccessVersionHash = in.readLong(); + } } @Override @@ -239,7 +377,11 @@ public class ReplicaPersistInfo implements Writable { && version == info.version && versionHash == info.versionHash && dataSize == info.dataSize - && rowCount == info.rowCount; + && rowCount == info.rowCount + && lastFailedVersion == info.lastFailedVersion + && lastFailedVersionHash == info.lastFailedVersionHash + && lastSuccessVersion == info.lastSuccessVersion + && lastSuccessVersionHash == info.lastSuccessVersionHash; } @Override @@ -256,6 +398,10 @@ public class ReplicaPersistInfo implements Writable { sb.append(" version hash: ").append(versionHash); sb.append(" data size: ").append(dataSize); sb.append(" row count: ").append(rowCount); + sb.append(" last failed version: ").append(lastFailedVersion); + sb.append(" last failed version hash: ").append(lastFailedVersionHash); + sb.append(" last success version: ").append(lastSuccessVersion); + sb.append(" last success version hash: ").append(lastSuccessVersionHash); return sb.toString(); } diff --git a/fe/src/main/java/com/baidu/palo/planner/AggregationNode.java b/fe/src/main/java/com/baidu/palo/planner/AggregationNode.java index 52c7b850a0..872c85a794 100644 --- a/fe/src/main/java/com/baidu/palo/planner/AggregationNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/AggregationNode.java @@ -31,7 +31,7 @@ import com.baidu.palo.thrift.TAggregationNode; import com.baidu.palo.thrift.TExplainLevel; import com.baidu.palo.thrift.TPlanNode; import com.baidu.palo.thrift.TPlanNodeType; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.base.Objects; import com.google.common.base.Preconditions; @@ -118,7 +118,7 @@ public class AggregationNode extends PlanNode { } @Override - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { // Assign predicates to the top-most agg in the single-node plan that can evaluate // them, as follows: For non-distinct aggs place them in the 1st phase agg node. For // distinct aggs place them in the 2nd phase agg node. 
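The ReplicaPersistInfo changes above always write the new opType and last failed/success version fields, but only read them back when the journal version is at least FeMetaVersion.VERSION_45, keeping DEFAULT_OP for older images so replay does not hit a null op type. A self-contained sketch of that version-gated read/write pattern follows; the class, enum, and constant names here are simplified stand-ins, not the real FE types.

// Sketch of the backward-compatible serialization pattern used by
// ReplicaPersistInfo.write/readFields above. Names are illustrative only.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class VersionGatedReadSketch {
    static final int VERSION_45 = 45;

    enum OperationKind { ADD, LOAD, DEFAULT_OP }

    long version;
    OperationKind op = OperationKind.DEFAULT_OP; // default kept for pre-45 images

    void write(DataOutputStream out) throws IOException {
        out.writeLong(version);
        out.writeInt(op.ordinal()); // new field: always written by new code
    }

    void read(DataInputStream in, int journalVersion) throws IOException {
        version = in.readLong();
        if (journalVersion >= VERSION_45) {
            op = OperationKind.values()[in.readInt()]; // only present in new images
        }
        // else: keep DEFAULT_OP so old images replay without a null op type
    }

    public static void main(String[] args) throws IOException {
        VersionGatedReadSketch writer = new VersionGatedReadSketch();
        writer.version = 7;
        writer.op = OperationKind.LOAD;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        writer.write(new DataOutputStream(bos));

        VersionGatedReadSketch reader = new VersionGatedReadSketch();
        reader.read(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())), 45);
        System.out.println(reader.op); // LOAD
    }
}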
The conjuncts are diff --git a/fe/src/main/java/com/baidu/palo/planner/AnalyticEvalNode.java b/fe/src/main/java/com/baidu/palo/planner/AnalyticEvalNode.java index cfa91ad76c..0c549fdd02 100644 --- a/fe/src/main/java/com/baidu/palo/planner/AnalyticEvalNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/AnalyticEvalNode.java @@ -31,7 +31,7 @@ import com.baidu.palo.analysis.Expr; import com.baidu.palo.analysis.ExprSubstitutionMap; import com.baidu.palo.analysis.OrderByElement; import com.baidu.palo.analysis.TupleDescriptor; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TAnalyticNode; import com.baidu.palo.thrift.TExplainLevel; import com.baidu.palo.thrift.TPlanNode; @@ -109,7 +109,7 @@ public class AnalyticEvalNode extends PlanNode { } @Override - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { analyzer.getDescTbl().computeMemLayout(); intermediateTupleDesc.computeMemLayout(); // we add the analyticInfo's smap to the combined smap of our child diff --git a/fe/src/main/java/com/baidu/palo/planner/AnalyticPlanner.java b/fe/src/main/java/com/baidu/palo/planner/AnalyticPlanner.java index 06ebd0ce6c..c6d1e1022d 100644 --- a/fe/src/main/java/com/baidu/palo/planner/AnalyticPlanner.java +++ b/fe/src/main/java/com/baidu/palo/planner/AnalyticPlanner.java @@ -39,7 +39,7 @@ import com.baidu.palo.analysis.TupleDescriptor; import com.baidu.palo.analysis.TupleId; import com.baidu.palo.analysis.TupleIsNullPredicate; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TPartitionType; import com.google.common.base.Preconditions; @@ -98,11 +98,11 @@ public class AnalyticPlanner { * TODO: when merging sort groups, recognize equivalent exprs * (using the equivalence classes) rather than looking for expr equality * @throws AnalysisException - * @throws InternalException + * @throws UserException */ public PlanNode createSingleNodePlan(PlanNode root, List groupingExprs, List inputPartitionExprs) throws AnalysisException, - InternalException { + UserException { List windowGroups = collectWindowGroups(); for (int i = 0; i < windowGroups.size(); ++i) { @@ -382,10 +382,10 @@ public class AnalyticPlanner { * is not null (partitionExprs represent the data partition of the entire partition * group of which this sort group is a part). 
* @throws AnalysisException - * @throws InternalException + * @throws UserException */ private PlanNode createSortGroupPlan(PlanNode root, SortGroup sortGroup, - List partitionExprs) throws AnalysisException, InternalException { + List partitionExprs) throws AnalysisException, UserException { List partitionByExprs = sortGroup.partitionByExprs; List orderByElements = sortGroup.orderByElements; ExprSubstitutionMap sortSmap = null; diff --git a/fe/src/main/java/com/baidu/palo/planner/BrokerScanNode.java b/fe/src/main/java/com/baidu/palo/planner/BrokerScanNode.java index ecadbe7a93..133954e5f6 100644 --- a/fe/src/main/java/com/baidu/palo/planner/BrokerScanNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/BrokerScanNode.java @@ -40,7 +40,7 @@ import com.baidu.palo.catalog.Table; import com.baidu.palo.catalog.Type; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.Config; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.BrokerUtil; import com.baidu.palo.load.BrokerFileGroup; import com.baidu.palo.system.Backend; @@ -138,7 +138,7 @@ public class BrokerScanNode extends ScanNode { } @Override - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { super.init(analyzer); this.analyzer = analyzer; @@ -147,7 +147,7 @@ public class BrokerScanNode extends ScanNode { try { fileGroups = Lists.newArrayList(new BrokerFileGroup(brokerTable)); } catch (AnalysisException e) { - throw new InternalException(e.getMessage()); + throw new UserException(e.getMessage()); } brokerDesc = new BrokerDesc(brokerTable.getBrokerName(), brokerTable.getBrokerProperties()); targetTable = brokerTable; @@ -164,7 +164,7 @@ public class BrokerScanNode extends ScanNode { try { initParams(context); } catch (AnalysisException e) { - throw new InternalException(e.getMessage()); + throw new UserException(e.getMessage()); } paramCreateContexts.add(context); } @@ -198,7 +198,7 @@ public class BrokerScanNode extends ScanNode { (OlapTable) targetTable, exprByName, null, partitionExprs); } - private void parseExprMap(Map exprMap) throws InternalException { + private void parseExprMap(Map exprMap) throws UserException { if (exprMap == null) { return; } @@ -208,7 +208,7 @@ public class BrokerScanNode extends ScanNode { Column column = targetTable.getColumn(colName); if (column == null) { - throw new InternalException("Unknown column(" + colName + ")"); + throw new UserException("Unknown column(" + colName + ")"); } // To compatible with older load version @@ -233,7 +233,7 @@ public class BrokerScanNode extends ScanNode { if (column.isAllowNull()) { exprs.add(NullLiteral.create(Type.VARCHAR)); } else { - throw new InternalException("Column(" + colName + ") has no default value."); + throw new UserException("Column(" + colName + ") has no default value."); } } } @@ -275,7 +275,7 @@ public class BrokerScanNode extends ScanNode { } else if (precision.getStringValue().equalsIgnoreCase("hour")) { format = new StringLiteral("%Y-%m-%d %H:00:00"); } else { - throw new InternalException("Unknown precision(" + precision.getStringValue() + ")"); + throw new UserException("Unknown precision(" + precision.getStringValue() + ")"); } FunctionName dateFormatName = new FunctionName("DATE_FORMAT"); List dateFormatArgs = Lists.newArrayList(fromUnixFunc, format); @@ -297,7 +297,7 @@ public class BrokerScanNode extends ScanNode { } // Called from init, construct source tuple information 
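Throughout the planner files in this patch, plan construction methods are widened from InternalException to UserException, and analysis errors raised during scan-node setup are rethrown with their message, as in BrokerScanNode here. A minimal sketch of that catch-and-rethrow shape, with stubbed exception types standing in for the real classes:

// Sketch of the exception widening applied across the planner changes.
// UserException/AnalysisException below are stubs for illustration.
public class ExceptionWideningSketch {
    static class UserException extends Exception {
        UserException(String msg) { super(msg); }
    }

    static class AnalysisException extends Exception {
        AnalysisException(String msg) { super(msg); }
    }

    static void analyze(String column) throws AnalysisException {
        if (column == null) {
            throw new AnalysisException("Unknown column(null)");
        }
    }

    // mirrors e.g. BrokerScanNode.init: catch AnalysisException, rethrow as UserException
    static void init(String column) throws UserException {
        try {
            analyze(column);
        } catch (AnalysisException e) {
            throw new UserException(e.getMessage());
        }
    }

    public static void main(String[] args) {
        try {
            init(null);
        } catch (UserException e) {
            System.out.println("reported to user: " + e.getMessage());
        }
    }
}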
- private void initParams(ParamCreateContext context) throws AnalysisException, InternalException { + private void initParams(ParamCreateContext context) throws AnalysisException, UserException { TBrokerScanRangeParams params = new TBrokerScanRangeParams(); context.params = params; @@ -344,7 +344,7 @@ public class BrokerScanNode extends ScanNode { params.setSrc_tuple_id(srcTupleDesc.getId().asInt()); } - private void finalizeParams(ParamCreateContext context) throws InternalException, AnalysisException { + private void finalizeParams(ParamCreateContext context) throws UserException, AnalysisException { Map slotDescByName = context.slotDescByName; Map exprMap = context.exprMap; // Analyze expr map @@ -358,7 +358,7 @@ public class BrokerScanNode extends ScanNode { for (SlotRef slot : slots) { SlotDescriptor slotDesc = slotDescByName.get(slot.getColumnName()); if (slotDesc == null) { - throw new InternalException("Unknown slot"); + throw new UserException("Unknown slot"); } smap.getLhs().add(slot); smap.getRhs().add(new SlotRef(slotDesc)); @@ -394,7 +394,7 @@ public class BrokerScanNode extends ScanNode { if (column.isAllowNull()) { expr = NullLiteral.create(column.getType()); } else { - throw new InternalException("Unknown slot ref(" + throw new UserException("Unknown slot ref(" + destSlotDesc.getColumn().getName() + ") in source file"); } } @@ -431,7 +431,7 @@ public class BrokerScanNode extends ScanNode { } private TScanRangeLocations newLocations(TBrokerScanRangeParams params, String brokerName) - throws InternalException { + throws UserException { List candidateBes = Lists.newArrayList(); // Get backend int numBe = Math.min(3, backends.size()); @@ -454,7 +454,7 @@ public class BrokerScanNode extends ScanNode { brokerAddress = Catalog.getInstance().getBrokerMgr().getBroker( brokerName, candidateBes.get(i).getHost()); } catch (AnalysisException e) { - throw new InternalException(e.getMessage()); + throw new UserException(e.getMessage()); } brokerScanRange.addToBroker_addresses(new TNetworkAddress(brokerAddress.ip, brokerAddress.port)); } @@ -480,7 +480,7 @@ public class BrokerScanNode extends ScanNode { return locations.scan_range.broker_scan_range; } - private void getFileStatusAndCalcInstance() throws InternalException { + private void getFileStatusAndCalcInstance() throws UserException { if (fileStatusesList == null || filesAdded == -1) { // FIXME(cmy): fileStatusesList and filesAdded can be set out of db lock when doing pull load, // but for now it is very difficult to set them out of db lock when doing broker query. 
@@ -503,7 +503,7 @@ public class BrokerScanNode extends ScanNode { Preconditions.checkState(fileStatusesList.size() == fileGroups.size()); if (isLoad() && filesAdded == 0) { - throw new InternalException("No source file in this table(" + targetTable.getName() + ")."); + throw new UserException("No source file in this table(" + targetTable.getName() + ")."); } totalBytes = 0; @@ -522,12 +522,12 @@ public class BrokerScanNode extends ScanNode { bytesPerInstance = totalBytes / numInstances + 1; if (bytesPerInstance > Config.max_bytes_per_broker_scanner) { - throw new InternalException( + throw new UserException( "Scan bytes per broker scanner exceed limit: " + Config.max_bytes_per_broker_scanner); } } - private void assignBackends() throws InternalException { + private void assignBackends() throws UserException { backends = Lists.newArrayList(); for (Backend be : Catalog.getCurrentSystemInfo().getIdToBackend().values()) { if (be.isAlive()) { @@ -535,7 +535,7 @@ public class BrokerScanNode extends ScanNode { } } if (backends.isEmpty()) { - throw new InternalException("No Alive backends"); + throw new UserException("No Alive backends"); } Collections.shuffle(backends, random); } @@ -558,7 +558,7 @@ public class BrokerScanNode extends ScanNode { private void processFileGroup( TBrokerScanRangeParams params, List fileStatuses) - throws InternalException { + throws UserException { if (fileStatuses == null || fileStatuses.isEmpty()) { return; } @@ -628,7 +628,7 @@ public class BrokerScanNode extends ScanNode { } @Override - public void finalize(Analyzer analyzer) throws InternalException { + public void finalize(Analyzer analyzer) throws UserException { locationsList = Lists.newArrayList(); for (int i = 0; i < fileGroups.size(); ++i) { @@ -640,7 +640,7 @@ public class BrokerScanNode extends ScanNode { try { finalizeParams(context); } catch (AnalysisException e) { - throw new InternalException(e.getMessage()); + throw new UserException(e.getMessage()); } processFileGroup(context.params, fileStatuses); } diff --git a/fe/src/main/java/com/baidu/palo/planner/CsvScanNode.java b/fe/src/main/java/com/baidu/palo/planner/CsvScanNode.java index 937b5685f1..efa1c5642e 100644 --- a/fe/src/main/java/com/baidu/palo/planner/CsvScanNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/CsvScanNode.java @@ -27,7 +27,7 @@ import com.baidu.palo.analysis.TupleDescriptor; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.catalog.OlapTable; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.Pair; import com.baidu.palo.load.LoadJob; import com.baidu.palo.load.PartitionLoadInfo; @@ -103,7 +103,7 @@ public class CsvScanNode extends ScanNode { } @Override - public void finalize(Analyzer analyzer) throws InternalException { + public void finalize(Analyzer analyzer) throws UserException { // get file paths // file paths in different partitions are same in mini load TableLoadInfo tableLoadInfo = job.getTableLoadInfo(table.getId()); @@ -146,7 +146,7 @@ public class CsvScanNode extends ScanNode { unspecifiedColumns.add(columnName); String defaultValue = column.getDefaultValue(); if (defaultValue == null && false == column.isAllowNull()) { - throw new InternalException( + throw new UserException( "Column [" + columnName + "] should be specified. 
" + "only columns have default values can be omitted"); } diff --git a/fe/src/main/java/com/baidu/palo/planner/DistributedPlanner.java b/fe/src/main/java/com/baidu/palo/planner/DistributedPlanner.java index 544c9d1c15..ecbe4d558e 100644 --- a/fe/src/main/java/com/baidu/palo/planner/DistributedPlanner.java +++ b/fe/src/main/java/com/baidu/palo/planner/DistributedPlanner.java @@ -27,7 +27,7 @@ import com.baidu.palo.analysis.JoinOperator; import com.baidu.palo.analysis.QueryStmt; import com.baidu.palo.catalog.Table; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.Pair; import com.baidu.palo.thrift.TPartitionType; @@ -65,7 +65,7 @@ public class DistributedPlanner { * the aggregation and analytic computation). */ public ArrayList createPlanFragments( - PlanNode singleNodePlan) throws InternalException, AnalysisException { + PlanNode singleNodePlan) throws UserException, AnalysisException { Preconditions.checkState(!ctx_.isSingleNodeExec()); // AnalysisContext.AnalysisResult analysisResult = ctx_.getAnalysisResult(); QueryStmt queryStmt = ctx_.getQueryStmt(); @@ -99,11 +99,12 @@ public class DistributedPlanner { PlanFragment createInsertFragment( PlanFragment inputFragment, InsertStmt stmt, ArrayList fragments) - throws InternalException { + throws UserException { Table targetTable = stmt.getTargetTable(); Boolean isRepart = stmt.isRepartition(); // When inputFragment is partitioned: // 1. If target table is partitioned, we need repartitioned. Or a merge node if hint has "NOSHUFFLE" + // 1.a: If target table is random partitioned, return inputFragment // 2. If target table is not partitioned, we must have a merge node // When inputFragment is not partitioned: // 1. If target table is partitioned, we can return inputFragment; or repartition if hints has "SHUFFLE" @@ -112,6 +113,9 @@ public class DistributedPlanner { boolean needMerge = false; if (isFragmentPartitioned(inputFragment)) { if (targetTable.isPartitioned()) { + if (stmt.getDataPartition().getType() == TPartitionType.RANDOM) { + return inputFragment; + } if (isRepart != null && !isRepart) { needMerge = true; } else { @@ -166,7 +170,7 @@ public class DistributedPlanner { */ private PlanFragment createPlanFragments( PlanNode root, boolean isPartitioned, - long perNodeMemLimit, ArrayList fragments) throws InternalException, AnalysisException { + long perNodeMemLimit, ArrayList fragments) throws UserException, AnalysisException { ArrayList childFragments = Lists.newArrayList(); for (PlanNode child : root.getChildren()) { // allow child fragments to be partitioned, unless they contain a limit clause @@ -213,7 +217,7 @@ public class DistributedPlanner { } else if (root instanceof EmptySetNode) { result = new PlanFragment(ctx_.getNextFragmentId(), root, DataPartition.UNPARTITIONED); } else { - throw new InternalException( + throw new UserException( "Cannot create plan fragment for this node type: " + root.getExplainString()); } // move 'result' to end, it depends on all of its children @@ -234,7 +238,7 @@ public class DistributedPlanner { * Requires that input fragment be partitioned. 
*/ private PlanFragment createMergeFragment(PlanFragment inputFragment) - throws InternalException { + throws UserException { Preconditions.checkState(inputFragment.isPartitioned()); // exchange node clones the behavior of its input, aside from the conjuncts @@ -260,6 +264,7 @@ public class DistributedPlanner { } else if (node instanceof SchemaScanNode) { return new PlanFragment(ctx_.getNextFragmentId(), node, DataPartition.UNPARTITIONED); } else { + // es scan node, olap scan node are random partitioned return new PlanFragment(ctx_.getNextFragmentId(), node, DataPartition.RANDOM); } } @@ -273,7 +278,7 @@ public class DistributedPlanner { */ private PlanFragment createHashJoinFragment(HashJoinNode node, PlanFragment rightChildFragment, PlanFragment leftChildFragment, long perNodeMemLimit) - throws InternalException { + throws UserException { // broadcast: send the rightChildFragment's output to each node executing // the leftChildFragment; the cost across all nodes is proportional to the // total amount of data sent @@ -399,7 +404,7 @@ public class DistributedPlanner { */ private PlanFragment createCrossJoinFragment( CrossJoinNode node, PlanFragment rightChildFragment, PlanFragment leftChildFragment) - throws InternalException { + throws UserException { // The rhs tree is going to send data through an exchange node which effectively // compacts the data. No reason to do it again at the rhs root node. rightChildFragment.getPlanRoot().setCompactData(false); @@ -422,7 +427,7 @@ public class DistributedPlanner { private PlanFragment createMergeNodeFragment(MergeNode mergeNode, ArrayList childFragments, ArrayList fragments) - throws InternalException { + throws UserException { Preconditions.checkState(mergeNode.getChildren().size() == childFragments.size()); // If the mergeNode only has constant exprs, return it in an unpartitioned fragment. @@ -484,7 +489,7 @@ public class DistributedPlanner { */ private PlanFragment createUnionNodeFragment( UnionNode unionNode, ArrayList childFragments, ArrayList fragments) - throws InternalException { + throws UserException { Preconditions.checkState(unionNode.getChildren().size() == childFragments.size()); // A UnionNode could have no children or constant selects if all of its operands @@ -582,7 +587,7 @@ public class DistributedPlanner { private void connectChildFragment( PlanNode node, int childIdx, PlanFragment parentFragment, PlanFragment childFragment) - throws InternalException { + throws UserException { ExchangeNode exchangeNode = new ExchangeNode(ctx_.getNextNodeId(), childFragment.getPlanRoot(), false); exchangeNode.setNumInstances(childFragment.getPlanRoot().getNumInstances()); exchangeNode.init(ctx_.getRootAnalyzer()); @@ -602,7 +607,7 @@ public class DistributedPlanner { * correct for the input). 
*/ private PlanFragment createParentFragment(PlanFragment childFragment, DataPartition parentPartition) - throws InternalException { + throws UserException { ExchangeNode exchangeNode = new ExchangeNode(ctx_.getNextNodeId(), childFragment.getPlanRoot(), false); exchangeNode.setNumInstances(childFragment.getPlanRoot().getNumInstances()); exchangeNode.init(ctx_.getRootAnalyzer()); @@ -623,7 +628,7 @@ public class DistributedPlanner { */ private PlanFragment createAggregationFragment( AggregationNode node, PlanFragment childFragment, ArrayList fragments) - throws InternalException { + throws UserException { if (!childFragment.isPartitioned()) { // nothing to distribute; do full aggregation directly within childFragment childFragment.addPlanRoot(node); @@ -660,7 +665,7 @@ public class DistributedPlanner { * aggregation. */ private PlanFragment createMergeAggregationFragment(AggregationNode node, PlanFragment childFragment) - throws InternalException { + throws UserException { Preconditions.checkArgument(childFragment.isPartitioned()); ArrayList groupingExprs = node.getAggInfo().getGroupingExprs(); boolean hasGrouping = !groupingExprs.isEmpty(); @@ -737,7 +742,7 @@ public class DistributedPlanner { */ private PlanFragment createPhase2DistinctAggregationFragment( AggregationNode node, - PlanFragment childFragment, ArrayList fragments) throws InternalException { + PlanFragment childFragment, ArrayList fragments) throws UserException { ArrayList groupingExprs = node.getAggInfo().getGroupingExprs(); boolean hasGrouping = !groupingExprs.isEmpty(); @@ -840,7 +845,7 @@ public class DistributedPlanner { */ private PlanFragment createAnalyticFragment( PlanNode node, PlanFragment childFragment, List fragments) - throws InternalException, AnalysisException { + throws UserException, AnalysisException { Preconditions.checkState( node instanceof SortNode || node instanceof AnalyticEvalNode); @@ -895,7 +900,7 @@ public class DistributedPlanner { */ private PlanFragment createOrderByFragment( SortNode node, PlanFragment childFragment) - throws InternalException { + throws UserException { node.setChild(0, childFragment.getPlanRoot()); childFragment.addPlanRoot(node); if (!childFragment.isPartitioned()) { diff --git a/fe/src/main/java/com/baidu/palo/planner/ExchangeNode.java b/fe/src/main/java/com/baidu/palo/planner/ExchangeNode.java index 2500996d33..b6808ab04a 100644 --- a/fe/src/main/java/com/baidu/palo/planner/ExchangeNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/ExchangeNode.java @@ -24,7 +24,7 @@ import com.baidu.palo.analysis.Analyzer; import com.baidu.palo.analysis.Expr; import com.baidu.palo.analysis.SortInfo; import com.baidu.palo.analysis.TupleId; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TExchangeNode; import com.baidu.palo.thrift.TPlanNode; import com.baidu.palo.thrift.TPlanNodeType; @@ -92,7 +92,7 @@ public class ExchangeNode extends PlanNode { } @Override - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { super.init(analyzer); Preconditions.checkState(conjuncts.isEmpty()); } diff --git a/fe/src/main/java/com/baidu/palo/planner/HashJoinNode.java b/fe/src/main/java/com/baidu/palo/planner/HashJoinNode.java index 451a5428a3..83440298d0 100644 --- a/fe/src/main/java/com/baidu/palo/planner/HashJoinNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/HashJoinNode.java @@ -30,7 +30,7 @@ import com.baidu.palo.analysis.SlotRef; import 
com.baidu.palo.analysis.TableRef; import com.baidu.palo.catalog.ColumnStats; import com.baidu.palo.common.Pair; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TEqJoinCondition; import com.baidu.palo.thrift.TExplainLevel; @@ -115,7 +115,7 @@ public class HashJoinNode extends PlanNode { } @Override - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { assignConjuncts(analyzer); // Set smap to the combined childrens' smaps and apply that to all conjuncts_. diff --git a/fe/src/main/java/com/baidu/palo/planner/MergeNode.java b/fe/src/main/java/com/baidu/palo/planner/MergeNode.java index 345a12dd85..8089ef9b25 100644 --- a/fe/src/main/java/com/baidu/palo/planner/MergeNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/MergeNode.java @@ -26,7 +26,7 @@ import com.baidu.palo.analysis.SlotDescriptor; import com.baidu.palo.analysis.SlotId; import com.baidu.palo.analysis.TupleDescriptor; import com.baidu.palo.analysis.TupleId; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TExplainLevel; import com.baidu.palo.thrift.TExpr; @@ -99,7 +99,7 @@ public class MergeNode extends PlanNode { * tuple id */ @Override - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { assignConjuncts(analyzer); //computeMemLayout(analyzer); computeStats(analyzer); diff --git a/fe/src/main/java/com/baidu/palo/planner/MysqlScanNode.java b/fe/src/main/java/com/baidu/palo/planner/MysqlScanNode.java index 196df0e568..ddcb41c5c6 100644 --- a/fe/src/main/java/com/baidu/palo/planner/MysqlScanNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/MysqlScanNode.java @@ -23,7 +23,7 @@ import com.baidu.palo.analysis.SlotRef; import com.baidu.palo.analysis.TupleDescriptor; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.MysqlTable; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TMySQLScanNode; import com.baidu.palo.thrift.TPlanNode; import com.baidu.palo.thrift.TPlanNodeType; @@ -63,7 +63,7 @@ public class MysqlScanNode extends ScanNode { } @Override - public void finalize(Analyzer analyzer) throws InternalException { + public void finalize(Analyzer analyzer) throws UserException { // Convert predicates to MySQL columns and filters. 
createMySQLColumns(analyzer); createMySQLFilters(analyzer); diff --git a/fe/src/main/java/com/baidu/palo/planner/OlapRewriteNode.java b/fe/src/main/java/com/baidu/palo/planner/OlapRewriteNode.java index 68113e44d2..865ef5717a 100644 --- a/fe/src/main/java/com/baidu/palo/planner/OlapRewriteNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/OlapRewriteNode.java @@ -26,7 +26,7 @@ import com.baidu.palo.thrift.TExplainLevel; import com.baidu.palo.thrift.TOlapRewriteNode; import com.baidu.palo.thrift.TPlanNode; import com.baidu.palo.thrift.TPlanNodeType; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.collect.Lists; import com.google.common.base.Preconditions; @@ -65,7 +65,7 @@ public class OlapRewriteNode extends PlanNode { } @Override - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { assignConjuncts(analyzer); // Set smap to the combined childrens' smaps and apply that to all conjuncts_. diff --git a/fe/src/main/java/com/baidu/palo/planner/OlapScanNode.java b/fe/src/main/java/com/baidu/palo/planner/OlapScanNode.java index 53c3b3d5ef..18e3f136c0 100644 --- a/fe/src/main/java/com/baidu/palo/planner/OlapScanNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/OlapScanNode.java @@ -38,7 +38,7 @@ import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.Config; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.service.FrontendOptions; import com.baidu.palo.system.Backend; import com.baidu.palo.thrift.TExplainLevel; @@ -129,7 +129,7 @@ public class OlapScanNode extends ScanNode { } @Override - public void finalize(Analyzer analyzer) throws InternalException { + public void finalize(Analyzer analyzer) throws UserException { if (isFinalized) { return; } @@ -138,7 +138,7 @@ public class OlapScanNode extends ScanNode { try { getScanRangeLocations(analyzer); } catch (AnalysisException e) { - throw new InternalException(e.getMessage()); + throw new UserException(e.getMessage()); } isFinalized = true; @@ -162,7 +162,7 @@ public class OlapScanNode extends ScanNode { // } // } - private List selectRollupIndex(Partition partition) throws InternalException { + private List selectRollupIndex(Partition partition) throws UserException { if (olapTable.getKeysType() == KeysType.DUP_KEYS) { isPreAggregation = true; } @@ -199,7 +199,7 @@ public class OlapScanNode extends ScanNode { } if (containTupleIndexes.isEmpty()) { - throw new InternalException("Failed to select index, no match index"); + throw new UserException("Failed to select index, no match index"); } // 2. find all indexes which match the prefix most based on predicate/sort/in predicate columns @@ -290,7 +290,7 @@ public class OlapScanNode extends ScanNode { return finalCandidateIndexes; } - private void normalizePredicate(Analyzer analyzer) throws InternalException { + private void normalizePredicate(Analyzer analyzer) throws UserException { // 1. 
Get Columns which has eqJoin on it List eqJoinPredicate = analyzer.getEqJoinConjuncts(desc.getId(), null); if (null != eqJoinPredicate) { @@ -391,7 +391,8 @@ public class OlapScanNode extends ScanNode { MaterializedIndex index, List tablets, long localBeId) - throws InternalException, AnalysisException { + throws UserException, AnalysisException { + int logNum = 0; String schemaHashStr = String.valueOf(olapTable.getSchemaHashByIndexId(index.getId())); long committedVersion = partition.getCommittedVersion(); @@ -420,7 +421,7 @@ public class OlapScanNode extends ScanNode { if (allQueryableReplicas.isEmpty()) { LOG.error("no queryable replica found in tablet[{}]. committed version[{}], committed version hash[{}]", tabletId, committedVersion, committedVersionHash); - throw new InternalException("Failed to get scan range, no replica!"); + throw new UserException("Failed to get scan range, no replica!"); } List replicas = null; @@ -448,7 +449,7 @@ public class OlapScanNode extends ScanNode { tabletIsNull = false; } if (tabletIsNull) { - throw new InternalException(tabletId + "have no alive replicas"); + throw new UserException(tabletId + "have no alive replicas"); } TScanRange scanRange = new TScanRange(); scanRange.setPalo_scan_range(paloRange); @@ -457,7 +458,7 @@ public class OlapScanNode extends ScanNode { } } - private void getScanRangeLocations(Analyzer analyzer) throws InternalException, AnalysisException { + private void getScanRangeLocations(Analyzer analyzer) throws UserException, AnalysisException { normalizePredicate(analyzer); long start = System.currentTimeMillis(); diff --git a/fe/src/main/java/com/baidu/palo/planner/PlanFragment.java b/fe/src/main/java/com/baidu/palo/planner/PlanFragment.java index 0ec301f902..cdbafa9d6f 100644 --- a/fe/src/main/java/com/baidu/palo/planner/PlanFragment.java +++ b/fe/src/main/java/com/baidu/palo/planner/PlanFragment.java @@ -22,15 +22,13 @@ package com.baidu.palo.planner; import com.baidu.palo.analysis.Analyzer; import com.baidu.palo.analysis.Expr; -import com.baidu.palo.analysis.TupleId; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.NotImplementedException; import com.baidu.palo.common.TreeNode; import com.baidu.palo.thrift.TExplainLevel; import com.baidu.palo.thrift.TPartitionType; import com.baidu.palo.thrift.TPlanFragment; import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager; @@ -136,7 +134,7 @@ public class PlanFragment extends TreeNode { * Finalize plan tree and create stream sink, if needed. 
*/ public void finalize(Analyzer analyzer, boolean validateFileFormats) - throws InternalException, NotImplementedException { + throws UserException, NotImplementedException { if (sink != null) { return; } diff --git a/fe/src/main/java/com/baidu/palo/planner/PlanNode.java b/fe/src/main/java/com/baidu/palo/planner/PlanNode.java index 6675755115..d40f9ae159 100644 --- a/fe/src/main/java/com/baidu/palo/planner/PlanNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/PlanNode.java @@ -27,7 +27,7 @@ import com.baidu.palo.analysis.SlotId; import com.baidu.palo.analysis.TupleDescriptor; import com.baidu.palo.analysis.TupleId; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.TreeNode; import com.baidu.palo.thrift.TExplainLevel; import com.baidu.palo.thrift.TPlan; @@ -398,7 +398,7 @@ abstract public class PlanNode extends TreeNode { * Call this once on the root of the plan tree before calling toThrift(). * Subclasses need to override this. */ - public void finalize(Analyzer analyzer) throws InternalException { + public void finalize(Analyzer analyzer) throws UserException { for (PlanNode child : children) { child.finalize(analyzer); } @@ -464,7 +464,7 @@ abstract public class PlanNode extends TreeNode { analyzer.materializeSlots(exprs); } - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { assignConjuncts(analyzer); computeStats(analyzer); createDefaultSmap(analyzer); @@ -525,7 +525,7 @@ abstract public class PlanNode extends TreeNode { * substitutes conjuncts_ using the combined child smap. * @throws AnalysisException */ - protected void createDefaultSmap(Analyzer analyzer) throws InternalException { + protected void createDefaultSmap(Analyzer analyzer) throws UserException { ExprSubstitutionMap combinedChildSmap = getCombinedChildSmap(); outputSmap = ExprSubstitutionMap.compose(outputSmap, combinedChildSmap, analyzer); diff --git a/fe/src/main/java/com/baidu/palo/planner/Planner.java b/fe/src/main/java/com/baidu/palo/planner/Planner.java index d5a0e0bf7b..9a5f0aa126 100644 --- a/fe/src/main/java/com/baidu/palo/planner/Planner.java +++ b/fe/src/main/java/com/baidu/palo/planner/Planner.java @@ -20,7 +20,6 @@ package com.baidu.palo.planner; -import com.baidu.palo.analysis.AnalyticInfo; import com.baidu.palo.analysis.Analyzer; import com.baidu.palo.analysis.Expr; import com.baidu.palo.analysis.InsertStmt; @@ -30,23 +29,21 @@ import com.baidu.palo.analysis.SlotDescriptor; import com.baidu.palo.analysis.SlotId; import com.baidu.palo.analysis.StatementBase; import com.baidu.palo.analysis.TupleDescriptor; -import com.baidu.palo.analysis.TupleId; import com.baidu.palo.catalog.PrimitiveType; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.NotImplementedException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TExplainLevel; import com.baidu.palo.thrift.TQueryOptions; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; -import com.google.common.collect.Sets; + import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.ArrayList; import java.util.Collections; -import java.util.HashSet; import java.util.List; /** @@ -73,11 +70,14 @@ public class Planner { } public List getScanNodes() { + if (singleNodePlanner == null) { + return 
Lists.newArrayList(); + } return singleNodePlanner.getScanNodes(); } public void plan(StatementBase queryStmt, Analyzer analyzer, TQueryOptions queryOptions) - throws NotImplementedException, InternalException, AnalysisException { + throws NotImplementedException, UserException, AnalysisException { createPlanFragments(queryStmt, analyzer, queryOptions); } @@ -129,7 +129,7 @@ public class Planner { * a list such that element i of that list can only consume output of the following fragments j > i. */ public void createPlanFragments(StatementBase statment, Analyzer analyzer, TQueryOptions queryOptions) - throws NotImplementedException, InternalException, AnalysisException { + throws NotImplementedException, UserException, AnalysisException { QueryStmt queryStmt; if (statment instanceof InsertStmt) { queryStmt = ((InsertStmt) statment).getQueryStmt(); @@ -151,7 +151,7 @@ public class Planner { } InsertStmt insertStmt = (InsertStmt) statment; - if (insertStmt.getOlapTuple() != null) { + if (insertStmt.getOlapTuple() != null && !insertStmt.isStreaming()) { singleNodePlan = new OlapRewriteNode(plannerContext.getNextNodeId(), singleNodePlan, insertStmt); singleNodePlan.init(analyzer); resultExprs = insertStmt.getResultExprs(); @@ -184,9 +184,10 @@ public class Planner { InsertStmt insertStmt = (InsertStmt) statment; rootFragment = distributedPlanner.createInsertFragment(rootFragment, insertStmt, fragments); rootFragment.setSink(insertStmt.createDataSink()); + insertStmt.finalize(); ArrayList exprs = ((InsertStmt) statment).getResultExprs(); List resExprs = Expr.substituteList( - exprs, rootFragment.getPlanRoot().getOutputSmap(), analyzer, false); + exprs, rootFragment.getPlanRoot().getOutputSmap(), analyzer, true); rootFragment.setOutputExprs(resExprs); } else { List resExprs = Expr.substituteList(queryStmt.getBaseTblResultExprs(), @@ -218,7 +219,7 @@ public class Planner { * returns root unchanged. 
*/ private PlanNode addUnassignedConjuncts(Analyzer analyzer, PlanNode root) - throws InternalException { + throws UserException { Preconditions.checkNotNull(root); // List conjuncts = analyzer.getUnassignedConjuncts(root.getTupleIds()); diff --git a/fe/src/main/java/com/baidu/palo/planner/RangePartitionPruner.java b/fe/src/main/java/com/baidu/palo/planner/RangePartitionPruner.java index 449186f718..6d74653980 100644 --- a/fe/src/main/java/com/baidu/palo/planner/RangePartitionPruner.java +++ b/fe/src/main/java/com/baidu/palo/planner/RangePartitionPruner.java @@ -64,6 +64,7 @@ public class RangePartitionPruner implements PartitionPruner { PartitionKey maxKey, int complex) throws AnalysisException { + LOG.debug("column id {}, column filters {}", columnId, partitionColumnFilters); // the last column in partition Key if (columnId == partitionColumns.size()) { try { diff --git a/fe/src/main/java/com/baidu/palo/planner/SchemaScanNode.java b/fe/src/main/java/com/baidu/palo/planner/SchemaScanNode.java index 9f5ac7c3e3..de2ae6b309 100644 --- a/fe/src/main/java/com/baidu/palo/planner/SchemaScanNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/SchemaScanNode.java @@ -24,7 +24,7 @@ import com.baidu.palo.analysis.Analyzer; import com.baidu.palo.analysis.TupleDescriptor; import com.baidu.palo.catalog.SchemaTable; import com.baidu.palo.common.Config; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.qe.ConnectContext; import com.baidu.palo.service.FrontendOptions; import com.baidu.palo.thrift.TPlanNode; @@ -70,7 +70,7 @@ public class SchemaScanNode extends ScanNode { } @Override - public void finalize(Analyzer analyzer) throws InternalException { + public void finalize(Analyzer analyzer) throws UserException { // Convert predicates to MySQL columns and filters. 
schemaDb = analyzer.getSchemaDb(); schemaTable = analyzer.getSchemaTable(); diff --git a/fe/src/main/java/com/baidu/palo/planner/SelectNode.java b/fe/src/main/java/com/baidu/palo/planner/SelectNode.java index d35618f3e0..c7a50d0620 100644 --- a/fe/src/main/java/com/baidu/palo/planner/SelectNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/SelectNode.java @@ -21,7 +21,7 @@ package com.baidu.palo.planner; import com.baidu.palo.analysis.Analyzer; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.analysis.Expr; import com.baidu.palo.thrift.TExplainLevel; @@ -57,7 +57,7 @@ public class SelectNode extends PlanNode { } @Override - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { analyzer.markConjunctsAssigned(conjuncts); computeStats(analyzer); createDefaultSmap(analyzer); diff --git a/fe/src/main/java/com/baidu/palo/planner/SingleNodePlanner.java b/fe/src/main/java/com/baidu/palo/planner/SingleNodePlanner.java index beb118a22d..424b072200 100644 --- a/fe/src/main/java/com/baidu/palo/planner/SingleNodePlanner.java +++ b/fe/src/main/java/com/baidu/palo/planner/SingleNodePlanner.java @@ -52,7 +52,7 @@ import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.MysqlTable; import com.baidu.palo.catalog.Table; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.NotImplementedException; import com.baidu.palo.common.Pair; import com.baidu.palo.common.Reference; @@ -63,6 +63,8 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import javassist.expr.NewArray; + import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -120,7 +122,7 @@ public class SingleNodePlanner { * - Apply combined expression substitution map of child plan nodes; if a plan node * re-maps its input, set a substitution map to be applied by parents. */ - public PlanNode createSingleNodePlan() throws InternalException, AnalysisException { + public PlanNode createSingleNodePlan() throws UserException, AnalysisException { QueryStmt queryStmt = ctx_.getQueryStmt(); // Use the stmt's analyzer which is not necessarily the root analyzer // to detect empty result sets. @@ -227,7 +229,7 @@ public class SingleNodePlanner { * Select/Project/Join/Union [All]/Group by/Having/Order by clauses of the query stmt. */ private PlanNode createQueryPlan(QueryStmt stmt, Analyzer analyzer, long defaultOrderByLimit) - throws InternalException, AnalysisException { + throws UserException, AnalysisException { if (analyzer.hasEmptyResultSet()) return createEmptyNode(stmt, analyzer); long newDefaultOrderByLimit = defaultOrderByLimit; @@ -306,7 +308,7 @@ public class SingleNodePlanner { * semantically correct */ private PlanNode addUnassignedConjuncts(Analyzer analyzer, PlanNode root) - throws InternalException { + throws UserException { Preconditions.checkNotNull(root); // List conjuncts = analyzer.getUnassignedConjuncts(root.getTupleIds()); @@ -322,7 +324,7 @@ public class SingleNodePlanner { } private PlanNode addUnassignedConjuncts( - Analyzer analyzer, List tupleIds, PlanNode root) throws InternalException { + Analyzer analyzer, List tupleIds, PlanNode root) throws UserException { // No point in adding SelectNode on top of an EmptyNode. 
if (root instanceof EmptySetNode) return root; Preconditions.checkNotNull(root); @@ -628,7 +630,7 @@ public class SingleNodePlanner { * of the selectStmt query block. */ private PlanNode createSelectPlan(SelectStmt selectStmt, Analyzer analyzer, long defaultOrderByLimit) - throws InternalException, AnalysisException { + throws UserException, AnalysisException { // no from clause -> nothing to plan if (selectStmt.getTableRefs().isEmpty()) { return createConstantSelectPlan(selectStmt, analyzer); @@ -713,7 +715,7 @@ public class SingleNodePlanner { * Assigns conjuncts from the Having clause to the returned node. */ private PlanNode createAggregationPlan(SelectStmt selectStmt, Analyzer analyzer, - PlanNode root) throws InternalException { + PlanNode root) throws UserException { // add Having clause root.assignConjuncts(analyzer); Preconditions.checkState(selectStmt.getAggInfo() != null); @@ -743,7 +745,7 @@ public class SingleNodePlanner { * selectStmt with SlotRefs into the materialized tuple. */ private PlanNode createConstantSelectPlan(SelectStmt selectStmt, Analyzer analyzer) - throws InternalException { + throws UserException { Preconditions.checkState(selectStmt.getTableRefs().isEmpty()); ArrayList resultExprs = selectStmt.getResultExprs(); // Create tuple descriptor for materialized tuple. @@ -927,7 +929,7 @@ public class SingleNodePlanner { * complete picture) */ private PlanNode createInlineViewPlan(Analyzer analyzer, InlineViewRef inlineViewRef) - throws InternalException, AnalysisException { + throws UserException, AnalysisException { // If possible, "push down" view predicates; this is needed in order to ensure // that predicates such as "x + y = 10" are evaluated in the view's plan tree // rather than a SelectNode grafted on top of that plan tree. @@ -1117,7 +1119,7 @@ public class SingleNodePlanner { * Create node for scanning all data files of a particular table. */ private PlanNode createScanNode(Analyzer analyzer, TableRef tblRef) - throws InternalException { + throws UserException { ScanNode scanNode = null; switch (tblRef.getTable().getType()) { @@ -1134,10 +1136,13 @@ public class SingleNodePlanner { scanNode = new BrokerScanNode(ctx_.getNextNodeId(), tblRef.getDesc(), "BrokerScanNode", null, -1); break; + case ELASTICSEARCH: + scanNode = new EsScanNode(ctx_.getNextNodeId(), tblRef.getDesc(), "EsScanNode"); + break; default: break; } - if (scanNode instanceof OlapScanNode) { + if (scanNode instanceof OlapScanNode || scanNode instanceof EsScanNode) { Map columnFilters = Maps.newHashMap(); List conjuncts = analyzer.getUnassignedConjuncts(scanNode); for (Column column : tblRef.getTable().getBaseSchema()) { @@ -1231,7 +1236,7 @@ public class SingleNodePlanner { * Throws if the JoinNode.init() fails. */ private PlanNode createJoinNode(Analyzer analyzer, PlanNode outer, TableRef outerRef, TableRef innerRef) - throws InternalException, AnalysisException { + throws UserException, AnalysisException { materializeTableResultForCrossJoinOrCountStar(innerRef, analyzer); // the rows coming from the build node only need to have space for the tuple // materialized by that node @@ -1294,14 +1299,14 @@ public class SingleNodePlanner { * table ref is not implemented. 
*/ private PlanNode createTableRefNode(Analyzer analyzer, TableRef tblRef) - throws InternalException, AnalysisException { + throws UserException, AnalysisException { if (tblRef instanceof BaseTableRef) { return createScanNode(analyzer, tblRef); } if (tblRef instanceof InlineViewRef) { return createInlineViewPlan(analyzer, (InlineViewRef) tblRef); } - throw new InternalException("unknown TableRef node"); + throw new UserException("unknown TableRef node"); } /** @@ -1314,7 +1319,7 @@ public class SingleNodePlanner { private UnionNode createUnionPlan( Analyzer analyzer, UnionStmt unionStmt, List unionOperands, PlanNode unionDistinctPlan, long defaultOrderByLimit) - throws InternalException, AnalysisException { + throws UserException, AnalysisException { UnionNode unionNode = new UnionNode(ctx_.getNextNodeId(), unionStmt.getTupleId(), unionStmt.getUnionResultExprs(), false); for (UnionStmt.UnionOperand op: unionOperands) { @@ -1365,7 +1370,7 @@ public class SingleNodePlanner { * use a union node (this is tricky because a union materializes a new tuple). */ private PlanNode createUnionPlan(UnionStmt unionStmt, Analyzer analyzer, long defaultOrderByLimit) - throws InternalException, AnalysisException { + throws UserException, AnalysisException { // TODO(zc): get unassigned conjuncts // List conjuncts = // analyzer.getUnassignedConjuncts(unionStmt.getTupleId().asList(), false); diff --git a/fe/src/main/java/com/baidu/palo/planner/SortNode.java b/fe/src/main/java/com/baidu/palo/planner/SortNode.java index d3cf8134da..a1b2417d2a 100644 --- a/fe/src/main/java/com/baidu/palo/planner/SortNode.java +++ b/fe/src/main/java/com/baidu/palo/planner/SortNode.java @@ -27,7 +27,7 @@ import com.baidu.palo.analysis.SlotDescriptor; import com.baidu.palo.analysis.SlotId; import com.baidu.palo.analysis.SlotRef; import com.baidu.palo.analysis.SortInfo; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TExplainLevel; import com.baidu.palo.thrift.TPlanNode; import com.baidu.palo.thrift.TPlanNodeType; @@ -198,7 +198,7 @@ public class SortNode extends PlanNode { return children.get(0).getNumInstances(); } - public void init(Analyzer analyzer) throws InternalException { + public void init(Analyzer analyzer) throws UserException { // Compute the memory layout for the generated tuple. 
computeStats(analyzer); // createDefaultSmap(analyzer); diff --git a/fe/src/main/java/com/baidu/palo/qe/Coordinator.java b/fe/src/main/java/com/baidu/palo/qe/Coordinator.java index cb29b1fb41..6e6b4da05b 100644 --- a/fe/src/main/java/com/baidu/palo/qe/Coordinator.java +++ b/fe/src/main/java/com/baidu/palo/qe/Coordinator.java @@ -19,10 +19,10 @@ import com.baidu.palo.analysis.Analyzer; import com.baidu.palo.analysis.DescriptorTable; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.Config; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.Pair; import com.baidu.palo.common.Reference; import com.baidu.palo.common.Status; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.DebugUtil; import com.baidu.palo.common.util.RuntimeProfile; import com.baidu.palo.planner.DataPartition; @@ -47,6 +47,7 @@ import com.baidu.palo.system.Backend; import com.baidu.palo.task.LoadEtlTask; import com.baidu.palo.thrift.PaloInternalServiceVersion; import com.baidu.palo.thrift.TDescriptorTable; +import com.baidu.palo.thrift.TEsScanRange; import com.baidu.palo.thrift.TExecPlanFragmentParams; import com.baidu.palo.thrift.TNetworkAddress; import com.baidu.palo.thrift.TPaloScanRange; @@ -64,6 +65,7 @@ import com.baidu.palo.thrift.TScanRangeLocation; import com.baidu.palo.thrift.TScanRangeLocations; import com.baidu.palo.thrift.TScanRangeParams; import com.baidu.palo.thrift.TStatusCode; +import com.baidu.palo.thrift.TTabletCommitInfo; import com.baidu.palo.thrift.TUniqueId; import com.google.common.base.Preconditions; @@ -163,6 +165,8 @@ public class Coordinator { // for export private List exportFiles; + private List commitInfos = Lists.newArrayList(); + // Input parameter private TUniqueId queryId; private TResourceInfo tResourceInfo; @@ -264,6 +268,10 @@ public class Coordinator { } } + public List getCommitInfos() { + return commitInfos; + } + // Initiate private void prepare() { @@ -437,12 +445,12 @@ public class Coordinator { cancelInternal(); switch (code) { case TIMEOUT: - throw new InternalException("query timeout. backend id: " + pair.first.systemBackendId); + throw new UserException("query timeout. backend id: " + pair.first.systemBackendId); case THRIFT_RPC_ERROR: SimpleScheduler.updateBlacklistBackends(pair.first.systemBackendId); throw new RpcException("rpc failed. 
backend id: " + pair.first.systemBackendId); default: - throw new InternalException(errMsg); + throw new UserException(errMsg); } } } @@ -509,6 +517,15 @@ public class Coordinator { } } + private void updateCommitInfos(List commitInfos) { + lock.lock(); + try { + this.commitInfos.addAll(commitInfos); + } finally { + lock.unlock(); + } + } + void updateStatus(Status status) { lock.lock(); try { @@ -537,7 +554,7 @@ public class Coordinator { TResultBatch getNext() throws Exception { if (receiver == null) { - throw new InternalException("There is no receiver."); + throw new UserException("There is no receiver."); } TResultBatch resultBatch; @@ -568,7 +585,7 @@ public class Coordinator { if (hostIndex != -1) { errMsg = errMsg.substring(0, hostIndex); } - throw new InternalException(errMsg); + throw new UserException(errMsg); } } @@ -716,7 +733,7 @@ public class Coordinator { Backend backend = Catalog.getCurrentSystemInfo().getBackendWithBePort( host.getHostname(), host.getPort()); if (backend == null) { - throw new InternalException("there is no scanNode Backend"); + throw new UserException("there is no scanNode Backend"); } TNetworkAddress dest = new TNetworkAddress(backend.getHost(), backend.getBeRpcPort()); return dest; @@ -726,7 +743,7 @@ public class Coordinator { Backend backend = Catalog.getCurrentSystemInfo().getBackendWithBePort( host.getHostname(), host.getPort()); if (backend == null) { - throw new InternalException("there is no scanNode Backend"); + throw new UserException("there is no scanNode Backend"); } if (backend.getBrpcPort() < 0) { return null; @@ -768,7 +785,7 @@ public class Coordinator { TNetworkAddress execHostport = SimpleScheduler.getHost(this.idToBackend, backendIdRef); if (execHostport == null) { LOG.warn("DataPartition UNPARTITIONED, no scanNode Backend"); - throw new InternalException("there is no scanNode Backend"); + throw new UserException("there is no scanNode Backend"); } this.addressToBackendID.put(execHostport, backendIdRef.getRef()); FInstanceExecParam instanceParam = new FInstanceExecParam(null, execHostport, @@ -821,7 +838,7 @@ public class Coordinator { Reference backendIdRef = new Reference(); TNetworkAddress execHostport = SimpleScheduler.getHost(this.idToBackend, backendIdRef); if (execHostport == null) { - throw new InternalException("there is no scanNode Backend"); + throw new UserException("there is no scanNode Backend"); } this.addressToBackendID.put(execHostport, backendIdRef.getRef()); FInstanceExecParam instanceParam = new FInstanceExecParam(null, execHostport, @@ -830,7 +847,7 @@ public class Coordinator { } } } - + private void computeFragmentExecParamsForParallelExec() throws Exception { // create exec params and set instance_id, host, per_node_scan_ranges computeFragmentInstances(fragmentExecParamsMap.get(fragments.get(0).getFragmentId())); @@ -890,7 +907,7 @@ public class Coordinator { TNetworkAddress execHostport = SimpleScheduler.getHost(this.idToBackend, backendIdRef); if (execHostport == null) { LOG.warn("DataPartition UNPARTITIONED, no scanNode Backend"); - throw new InternalException("there is no scanNode Backend"); + throw new UserException("there is no scanNode Backend"); } TUniqueId instanceId = getNextInstanceId(); FInstanceExecParam instanceParam = new FInstanceExecParam(instanceId, execHostport, @@ -958,7 +975,7 @@ public class Coordinator { } public void createScanInstance(PlanNodeId leftMostScanId, FragmentExecParams fragmentExecParams) - throws InternalException { + throws UserException { int maxNumInstance = 
queryOptions.mt_dop; if (maxNumInstance == 0) { maxNumInstance = 1; @@ -969,7 +986,7 @@ public class Coordinator { Reference backendIdRef = new Reference(); TNetworkAddress execHostport = SimpleScheduler.getHost(this.idToBackend, backendIdRef); if (execHostport == null) { - throw new InternalException("there is no scanNode Backend"); + throw new UserException("there is no scanNode Backend"); } FInstanceExecParam instanceParam = new FInstanceExecParam(getNextInstanceId(), execHostport, 0, fragmentExecParams); @@ -1200,7 +1217,7 @@ public class Coordinator { TNetworkAddress execHostPort = SimpleScheduler.getHost(minLocation.backend_id, scanRangeLocations.getLocations(), this.idToBackend, backendIdRef); if (execHostPort == null) { - throw new InternalException("there is no scanNode Backend"); + throw new UserException("there is no scanNode Backend"); } this.addressToBackendID.put(execHostPort, backendIdRef.getRef()); @@ -1271,6 +1288,9 @@ public class Coordinator { if (params.isSetExport_files()) { updateExportFiles(params.export_files); } + if (params.isSetCommitInfos()) { + updateCommitInfos(params.getCommitInfos()); + } profileDoneSignal.countDown(); } @@ -1456,6 +1476,7 @@ public class Coordinator { params.params.setDestinations(destinations); params.params.setSender_id(i); + params.params.setNum_senders(instanceExecParams.size()); params.setCoord(coordAddress); params.setBackend_num(backendNum++); params.setQuery_globals(queryGlobals); @@ -1480,6 +1501,12 @@ public class Coordinator { sb.append("{tid=").append(paloScanRange.getTablet_id()) .append(",ver=").append(paloScanRange.getVersion()).append("}"); } + TEsScanRange esScanRange = range.getScan_range().getEs_scan_range(); + if (esScanRange != null) { + sb.append("{ index=").append(esScanRange.getIndex()) + .append(", shardid=").append(esScanRange.getShard_id()) + .append("}"); + } } sb.append("]"); } diff --git a/fe/src/main/java/com/baidu/palo/qe/HelpModule.java b/fe/src/main/java/com/baidu/palo/qe/HelpModule.java index d81102babe..cc81d1a0be 100644 --- a/fe/src/main/java/com/baidu/palo/qe/HelpModule.java +++ b/fe/src/main/java/com/baidu/palo/qe/HelpModule.java @@ -15,7 +15,7 @@ package com.baidu.palo.qe; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; @@ -90,7 +90,7 @@ public class HelpModule { private static ReentrantLock lock = new ReentrantLock(); // Files in zip is not recursive, so we only need to traverse it - public void setUpByZip(String path) throws IOException, InternalException { + public void setUpByZip(String path) throws IOException, UserException { initBuild(); ZipFile zf = new ZipFile(path); Enumeration entries = zf.entries(); @@ -141,10 +141,10 @@ public class HelpModule { } // for test only - public void setUp(String path) throws InternalException, IOException { + public void setUp(String path) throws UserException, IOException { File root = new File(path); if (!root.isDirectory()) { - throw new InternalException("Need help directory."); + throw new UserException("Need help directory."); } initBuild(); for (File file : root.listFiles()) { @@ -157,7 +157,7 @@ public class HelpModule { } // for test only - private void setUpDir(String parent, File dir) throws IOException, InternalException { + private void setUpDir(String parent, File dir) throws IOException, UserException { updateCategory(parent, dir.getName()); for (File file : dir.listFiles()) { if (file.getName().startsWith(".")) { 
@@ -256,7 +256,7 @@ public class HelpModule { return EMPTY_LIST; } - public void setUpModule() throws IOException, InternalException { + public void setUpModule() throws IOException, UserException { URL helpResource = instance.getClass().getClassLoader() .getResource(HELP_ZIP_FILE_NAME); if (helpResource == null) { @@ -270,7 +270,7 @@ public class HelpModule { lastModifyTime = now; } - public boolean needReloadZipFile(String zipPath) throws InternalException { + public boolean needReloadZipFile(String zipPath) throws UserException { if (!isloaded) { return false; } @@ -284,7 +284,7 @@ public class HelpModule { // check zip file's last modify time File file = new File(zipPath); if (!file.exists()) { - throw new InternalException("zipfile of help module is not exist" + zipPath); + throw new UserException("zipfile of help module is not exist" + zipPath); } long lastModify = file.lastModified(); if (lastModifyTime >= lastModify) { @@ -311,14 +311,14 @@ public class HelpModule { HelpModule newInstance = new HelpModule(); newInstance.setUpByZip(zipFilePath); instance = newInstance; - } catch (InternalException | IOException e) { + } catch (UserException | IOException e) { LOG.warn("Failed to reload help zip file: " + zipFilePath, e); } finally { lock.unlock(); } } } - } catch (InternalException e) { + } catch (UserException e) { LOG.warn("Failed to reload help zip file: " + zipFilePath, e); } diff --git a/fe/src/main/java/com/baidu/palo/qe/HelpObjectLoader.java b/fe/src/main/java/com/baidu/palo/qe/HelpObjectLoader.java index a2630deb68..316078a7c5 100644 --- a/fe/src/main/java/com/baidu/palo/qe/HelpObjectLoader.java +++ b/fe/src/main/java/com/baidu/palo/qe/HelpObjectLoader.java @@ -15,7 +15,7 @@ package com.baidu.palo.qe; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.MarkDownParser; import com.google.common.base.Strings; @@ -36,7 +36,7 @@ import java.util.Map; public abstract class HelpObjectLoader { private static final Logger LOG = LogManager.getLogger(HelpObjectLoader.class); - public List loadAll(List lines) throws InternalException { + public List loadAll(List lines) throws UserException { if (lines == null) { LOG.error("Help object loader input lines is empty."); return null; @@ -55,7 +55,7 @@ public abstract class HelpObjectLoader { } // Load all Topics - public List loadAll(String path) throws IOException, InternalException { + public List loadAll(String path) throws IOException, UserException { if (Strings.isNullOrEmpty(path)) { LOG.error("Help object loader input file is empty."); return null; diff --git a/fe/src/main/java/com/baidu/palo/qe/MultiLoadMgr.java b/fe/src/main/java/com/baidu/palo/qe/MultiLoadMgr.java index 1b7851664c..2d8905ff37 100644 --- a/fe/src/main/java/com/baidu/palo/qe/MultiLoadMgr.java +++ b/fe/src/main/java/com/baidu/palo/qe/MultiLoadMgr.java @@ -29,15 +29,22 @@ import com.baidu.palo.thrift.TNetworkAddress; import com.google.common.base.Strings; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; // Class used to record state of multi-load operation public class MultiLoadMgr { + private static final Logger LOG = 
LogManager.getLogger(MultiLoadMgr.class); + private Map infoMap = Maps.newHashMap(); private ReadWriteLock lock = new ReentrantReadWriteLock(true); @@ -56,27 +63,45 @@ public class MultiLoadMgr { lock.writeLock().lock(); try { if (infoMap.containsKey(multiLabel)) { - throw new DdlException("Label(" + multiLabel + ") already exists."); + throw new DdlException("Label(" + label + ") already exists."); } infoMap.put(multiLabel, new MultiLoadDesc(multiLabel, properties)); } finally { lock.writeLock().unlock(); } // Register to Load after put into map. - Catalog.getInstance().getLoadInstance().registerMiniLabel(fullDbName, label, System.currentTimeMillis()); + if (!Catalog.getInstance().getLoadInstance().registerMiniLabel(fullDbName, label, System.currentTimeMillis())) { + throw new DdlException("Label(" + label + ") already exists."); + } + } + + public boolean isLabelUsed(String fullDbName, String label, String subLabel, long timestamp) { + LabelName multiLabel = new LabelName(fullDbName, label); + lock.readLock().lock(); + try { + if (infoMap.containsKey(multiLabel)) { + MultiLoadDesc multiLoadDesc = infoMap.get(multiLabel); + return multiLoadDesc.isSubLabelUsed(subLabel, timestamp); + } else { + return false; + } + } finally { + lock.readLock().unlock(); + } } public void load(TMiniLoadRequest request) throws DdlException { load(request.getDb(), request.getLabel(), request.getSubLabel(), request.getTbl(), - request.getFiles(), request.getBackend(), request.getProperties()); + request.getFiles(), request.getBackend(), request.getProperties(), request.getTimestamp()); } - // Add one load job, we have + // Add one load job private void load(String fullDbName, String label, String subLabel, String table, List files, TNetworkAddress fileAddr, - Map properties) throws DdlException { + Map properties, + long timestamp) throws DdlException { LabelName multiLabel = new LabelName(fullDbName, label); lock.writeLock().lock(); try { @@ -84,7 +109,7 @@ public class MultiLoadMgr { if (multiLoadDesc == null) { throw new DdlException("Unknown label(" + multiLabel + ")"); } - multiLoadDesc.addFile(subLabel, table, files, fileAddr, properties); + multiLoadDesc.addFile(subLabel, table, files, fileAddr, properties, timestamp); } finally { lock.writeLock().unlock(); } @@ -203,26 +228,28 @@ public class MultiLoadMgr { this.properties = properties; } - public void addFile(String label, String table, List files, - TNetworkAddress fileAddr, - Map properties) throws DdlException { - TableLoadDesc desc = loadDescByLabel.get(label); + public void addFile(String subLabel, String table, List files, + TNetworkAddress fileAddr, + Map properties, + long timestamp) throws DdlException { + TableLoadDesc desc = loadDescByLabel.get(subLabel); if (desc != null) { // Already exists - throw new DdlException("Sub label(" + label + ") already exists."); + throw new DdlException("Sub label(" + subLabel + ") already exists."); } desc = loadDescByTable.get(table); if (desc == null) { - desc = new TableLoadDesc(table, label, files, fileAddr, properties); + desc = new TableLoadDesc(table, subLabel, files, fileAddr, properties, timestamp); loadDescByTable.put(table, desc); } else { if (!desc.canMerge(properties)) { throw new DdlException("Same table have different properties in one multi-load." 
+ "new=" + properties + ",old=" + desc.properties); } - desc.addFiles(label, files); + desc.addFiles(subLabel, files); + desc.addTimestamp(timestamp); } - loadDescByLabel.put(label, desc); + loadDescByLabel.put(subLabel, desc); } public void delFile(String label) throws DdlException { @@ -243,6 +270,22 @@ public class MultiLoadMgr { } } + public boolean isSubLabelUsed(String subLabel, long timestamp) { + if (loadDescByLabel.containsKey(subLabel)) { + if (timestamp == -1) { + return true; + } else { + TableLoadDesc tblLoadDesc = loadDescByLabel.get(subLabel); + if (tblLoadDesc.containsTimestamp(timestamp)) { + return true; + } else { + return false; + } + } + } + return false; + } + public TNetworkAddress getHost(String table, TNetworkAddress defaultAddr) { TNetworkAddress address = addressByTable.get(table); if (address != null) { @@ -270,14 +313,19 @@ public class MultiLoadMgr { private Map> filesByLabel; private TNetworkAddress address; private Map properties; + // 2 or more files may be loaded to same table with different sub labels. + // So we use Set to save all timestamp of all diffrent sub labels + private Set timestamps = Sets.newHashSet(); public TableLoadDesc(String tbl, String label, List files, - TNetworkAddress address, Map properties) { + TNetworkAddress address, Map properties, + long timestamp) { this.tbl = tbl; this.filesByLabel = Maps.newHashMap(); this.address = address; this.properties = properties; filesByLabel.put(label, files); + this.timestamps.add(timestamp); } public boolean canMerge(Map properties) { @@ -296,6 +344,14 @@ public class MultiLoadMgr { filesByLabel.remove(label); } + public boolean containsTimestamp(long timestamp) { + return timestamps.contains(timestamp); + } + + public void addTimestamp(long timestamp) { + timestamps.add(timestamp); + } + // TODO(zc): public DataDescription toDataDesc() throws DdlException { List files = Lists.newArrayList(); @@ -328,3 +384,4 @@ public class MultiLoadMgr { } } } + diff --git a/fe/src/main/java/com/baidu/palo/qe/QeProcessor.java b/fe/src/main/java/com/baidu/palo/qe/QeProcessor.java index a3b7001e92..e626be9db3 100644 --- a/fe/src/main/java/com/baidu/palo/qe/QeProcessor.java +++ b/fe/src/main/java/com/baidu/palo/qe/QeProcessor.java @@ -15,7 +15,7 @@ package com.baidu.palo.qe; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.thrift.TReportExecStatusParams; import com.baidu.palo.thrift.TReportExecStatusResult; import com.baidu.palo.thrift.TUniqueId; @@ -26,9 +26,9 @@ public interface QeProcessor { TReportExecStatusResult reportExecStatus(TReportExecStatusParams params); - void registerQuery(TUniqueId queryId, Coordinator coord) throws InternalException; + void registerQuery(TUniqueId queryId, Coordinator coord) throws UserException; - void registerQuery(TUniqueId queryId, QeProcessorImpl.QueryInfo info) throws InternalException; + void registerQuery(TUniqueId queryId, QeProcessorImpl.QueryInfo info) throws UserException; void unregisterQuery(TUniqueId queryId); diff --git a/fe/src/main/java/com/baidu/palo/qe/QeProcessorImpl.java b/fe/src/main/java/com/baidu/palo/qe/QeProcessorImpl.java index a3eda5ca4c..f92202b1fc 100644 --- a/fe/src/main/java/com/baidu/palo/qe/QeProcessorImpl.java +++ b/fe/src/main/java/com/baidu/palo/qe/QeProcessorImpl.java @@ -16,7 +16,7 @@ package com.baidu.palo.qe; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.DebugUtil; import 
com.baidu.palo.thrift.*; import com.google.common.collect.Maps; @@ -41,16 +41,16 @@ public final class QeProcessorImpl implements QeProcessor { } @Override - public void registerQuery(TUniqueId queryId, Coordinator coord) throws InternalException { + public void registerQuery(TUniqueId queryId, Coordinator coord) throws UserException { registerQuery(queryId, new QueryInfo(coord)); } @Override - public void registerQuery(TUniqueId queryId, QueryInfo info) throws InternalException { + public void registerQuery(TUniqueId queryId, QueryInfo info) throws UserException { LOG.info("register query id = " + queryId.toString()); final QueryInfo result = coordinatorMap.putIfAbsent(queryId, info); if (result != null) { - throw new InternalException("queryId " + queryId + " already exists"); + throw new UserException("queryId " + queryId + " already exists"); } } diff --git a/fe/src/main/java/com/baidu/palo/qe/QeService.java b/fe/src/main/java/com/baidu/palo/qe/QeService.java index 36fd0d8f62..009e0b6763 100644 --- a/fe/src/main/java/com/baidu/palo/qe/QeService.java +++ b/fe/src/main/java/com/baidu/palo/qe/QeService.java @@ -17,7 +17,7 @@ package com.baidu.palo.qe; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.Config; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.MysqlServer; import org.apache.logging.log4j.LogManager; @@ -50,7 +50,7 @@ public class QeService { } @Deprecated - public void setup() throws IOException, InternalException { + public void setup() throws IOException, UserException { // Set up help module try { HelpModule.getInstance().setUpModule(); diff --git a/fe/src/main/java/com/baidu/palo/qe/ResultReceiver.java b/fe/src/main/java/com/baidu/palo/qe/ResultReceiver.java index e2a148f3e9..ce2db9f917 100644 --- a/fe/src/main/java/com/baidu/palo/qe/ResultReceiver.java +++ b/fe/src/main/java/com/baidu/palo/qe/ResultReceiver.java @@ -15,6 +15,8 @@ package com.baidu.palo.qe; +import com.baidu.palo.common.ClientPool; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.Status; import com.baidu.palo.rpc.BackendServiceProxy; import com.baidu.palo.rpc.PFetchDataRequest; @@ -58,7 +60,7 @@ public class ResultReceiver { if (isDone) { return null; } - + try { while (!isDone && !isCancel) { PFetchDataRequest request = new PFetchDataRequest(finstId); @@ -93,7 +95,7 @@ public class ResultReceiver { status.setRpcStatus("receive error packet"); return null; } - + packetIdx++; isDone = pResult.eos; @@ -126,7 +128,7 @@ public class ResultReceiver { currentThread = null; } } - + if (isCancel) { status.setStatus(Status.CANCELLED); } diff --git a/fe/src/main/java/com/baidu/palo/qe/ShowExecutor.java b/fe/src/main/java/com/baidu/palo/qe/ShowExecutor.java index fbf9440aad..aadb07aa9e 100644 --- a/fe/src/main/java/com/baidu/palo/qe/ShowExecutor.java +++ b/fe/src/main/java/com/baidu/palo/qe/ShowExecutor.java @@ -15,6 +15,8 @@ package com.baidu.palo.qe; +import com.baidu.palo.analysis.AdminShowReplicaDistributionStmt; +import com.baidu.palo.analysis.AdminShowReplicaStatusStmt; import com.baidu.palo.analysis.DescribeStmt; import com.baidu.palo.analysis.HelpStmt; import com.baidu.palo.analysis.ShowAlterStmt; @@ -59,6 +61,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.Database; import com.baidu.palo.catalog.MaterializedIndex; +import com.baidu.palo.catalog.MetadataViewer; import com.baidu.palo.catalog.OlapTable; import 
com.baidu.palo.catalog.Partition; import com.baidu.palo.catalog.Table; @@ -69,6 +72,7 @@ import com.baidu.palo.cluster.BaseParam; import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.CaseSensibility; +import com.baidu.palo.common.DdlException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; import com.baidu.palo.common.PatternMatcher; @@ -95,6 +99,7 @@ import com.google.common.collect.Sets; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.lang.annotation.AnnotationFormatError; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -187,6 +192,10 @@ public class ShowExecutor { handleShowGrants(); } else if (stmt instanceof ShowRolesStmt) { handleShowRoles(); + } else if (stmt instanceof AdminShowReplicaStatusStmt) { + handleAdminShowTabletStatus(); + } else if (stmt instanceof AdminShowReplicaDistributionStmt) { + handleAdminShowTabletDistribution(); } else { handleEmtpy(); } @@ -658,9 +667,13 @@ public class ShowExecutor { String label = null; if (showWarningsStmt.isFindByLabel()) { label = showWarningsStmt.getLabel(); - job = load.getLatestJobIdByLabel(dbId, showWarningsStmt.getLabel()); + jobId = load.getLatestJobIdByLabel(dbId, showWarningsStmt.getLabel()); + job = load.getLoadJob(jobId); + if (job == null) { + throw new AnalysisException("job is not exist."); + } } else { - LOG.info("load_job_id={}", jobId); + LOG.debug("load_job_id={}", jobId); jobId = showWarningsStmt.getJobId(); job = load.getLoadJob(jobId); if (job == null) { @@ -1041,6 +1054,28 @@ public class ShowExecutor { resultSet = new ShowResultSet(showStmt.getMetaData(), infos); } + private void handleAdminShowTabletStatus() { + AdminShowReplicaStatusStmt showStmt = (AdminShowReplicaStatusStmt) stmt; + List> results; + try { + results = MetadataViewer.getTabletStatus(showStmt); + } catch (DdlException e) { + throw new AnnotationFormatError(e.getMessage()); + } + resultSet = new ShowResultSet(showStmt.getMetaData(), results); + } + + private void handleAdminShowTabletDistribution() { + AdminShowReplicaDistributionStmt showStmt = (AdminShowReplicaDistributionStmt) stmt; + List> results; + try { + results = MetadataViewer.getTabletDistribution(showStmt); + } catch (DdlException e) { + throw new AnnotationFormatError(e.getMessage()); + } + resultSet = new ShowResultSet(showStmt.getMetaData(), results); + } + } diff --git a/fe/src/main/java/com/baidu/palo/qe/StmtExecutor.java b/fe/src/main/java/com/baidu/palo/qe/StmtExecutor.java index 15994bc9b1..b9450a1c50 100644 --- a/fe/src/main/java/com/baidu/palo/qe/StmtExecutor.java +++ b/fe/src/main/java/com/baidu/palo/qe/StmtExecutor.java @@ -45,8 +45,8 @@ import com.baidu.palo.common.Config; import com.baidu.palo.common.DdlException; import com.baidu.palo.common.ErrorCode; import com.baidu.palo.common.ErrorReport; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.NotImplementedException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.DebugUtil; import com.baidu.palo.common.util.ProfileManager; import com.baidu.palo.common.util.RuntimeProfile; @@ -62,6 +62,7 @@ import com.baidu.palo.thrift.TExplainLevel; import com.baidu.palo.thrift.TQueryOptions; import com.baidu.palo.thrift.TResultBatch; import com.baidu.palo.thrift.TUniqueId; +import com.baidu.palo.transaction.TabletCommitInfo; import com.google.common.base.Strings; import 
com.google.common.collect.Lists; @@ -237,9 +238,22 @@ public class StmtExecutor { } else if (parsedStmt instanceof CreateTableAsSelectStmt) { handleInsertStmt(); } else if (parsedStmt instanceof InsertStmt) { // Must ahead of DdlStmt because InserStmt is its subclass - handleInsertStmt(); - if (context.getSessionVariable().isReportSucc()) { - writeProfile(beginTimeInNanoSecond); + try { + handleInsertStmt(); + if (context.getSessionVariable().isReportSucc()) { + writeProfile(beginTimeInNanoSecond); + } + } catch (Throwable t) { + LOG.warn("handle insert stmt fail", t); + InsertStmt insertStmt = (InsertStmt) parsedStmt; + try { + Catalog.getCurrentGlobalTransactionMgr().abortTransaction( + insertStmt.getTransactionId(), + t.getMessage() == null ? "unknown reason" : t.getMessage()); + } catch (Exception abortTxnException) { + LOG.warn("errors when abort txn", abortTxnException); + } + throw t; } } else if (parsedStmt instanceof DdlStmt) { handleDdlStmt(); @@ -314,7 +328,7 @@ public class StmtExecutor { } // Analyze one statement to structure in memory. - private void analyze() throws AnalysisException, InternalException, + private void analyze() throws AnalysisException, UserException, NotImplementedException { LOG.info("begin to analyze stmt: {}", context.getStmtId()); // Parse statement with parser generated by CUP&FLEX @@ -343,6 +357,12 @@ public class StmtExecutor { throw new AnalysisException("Unexpected exception: " + e.getMessage()); } + // yiguolei: InsertStmt's grammar analysis will write editlog, so we check here whether the stmt should be forwarded to master + // if the stmt should be forwarded to master, just return here and the master will do the analysis again + if (isForwardToMaster()) { + return; + } + analyzer = new Analyzer(context.getCatalog(), context); // Convert show statement to select statement here if (parsedStmt instanceof ShowStmt) { @@ -442,9 +462,7 @@ public class StmtExecutor { // Preconditions.checkState(!analyzer.hasUnassignedConjuncts()); } catch (AnalysisException e) { throw e; - } catch (InternalException e) { - throw e; - } catch (NotImplementedException e) { + } catch (UserException e) { throw e; } catch (Exception e) { LOG.warn("Analyze failed because ", e); @@ -612,15 +630,24 @@ public class StmtExecutor { return; } - context.getCatalog().getLoadInstance().addLoadJob( - uuid.toString(), - insertStmt.getDb(), - insertStmt.getTargetTable().getId(), - coord.getDeltaUrls(), - System.currentTimeMillis() - ); - - context.getState().setOk("{'label':'" + uuid.toString() + "'}"); + if (insertStmt.isStreaming()) { + Catalog.getCurrentGlobalTransactionMgr().commitAndPublishTransaction( + insertStmt.getDbObj(), insertStmt.getTransactionId(), + TabletCommitInfo.fromThrift(coord.getCommitInfos()), + 5000); + context.getState().setOk(); + } else { + context.getCatalog().getLoadInstance().addLoadJob( + uuid.toString(), + insertStmt.getDb(), + insertStmt.getTargetTable().getId(), + insertStmt.getIndexIdToSchemaHash(), + insertStmt.getTransactionId(), + coord.getDeltaUrls(), + System.currentTimeMillis() + ); + context.getState().setOk("{'label':'" + uuid.toString() + "'}"); + } } private void handleUnsupportedStmt() { diff --git a/fe/src/main/java/com/baidu/palo/service/FrontendOptions.java b/fe/src/main/java/com/baidu/palo/service/FrontendOptions.java index 2df70f34d2..79f20e9ec3 100644 --- a/fe/src/main/java/com/baidu/palo/service/FrontendOptions.java +++ b/fe/src/main/java/com/baidu/palo/service/FrontendOptions.java @@ -38,9 +38,10 @@ public class FrontendOptions {
private static String PRIORITY_CIDR_SEPARATOR = ";"; private static List priorityCidrs = Lists.newArrayList(); - private static InetAddress localAddr; + private static InetAddress localAddr = InetAddress.getLoopbackAddress(); public static void init() throws UnknownHostException { + localAddr = null; if (!Config.frontend_address.equals("0.0.0.0")) { if (!InetAddressValidator.getInstance().isValidInet4Address(Config.frontend_address)) { throw new UnknownHostException("invalid frontend_address: " + Config.frontend_address); diff --git a/fe/src/main/java/com/baidu/palo/service/FrontendServiceImpl.java b/fe/src/main/java/com/baidu/palo/service/FrontendServiceImpl.java index 16b1ae239f..079ec31a4c 100644 --- a/fe/src/main/java/com/baidu/palo/service/FrontendServiceImpl.java +++ b/fe/src/main/java/com/baidu/palo/service/FrontendServiceImpl.java @@ -19,22 +19,24 @@ import com.baidu.palo.analysis.SetType; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.Database; +import com.baidu.palo.catalog.OlapTable; import com.baidu.palo.catalog.Table; import com.baidu.palo.cluster.ClusterNamespace; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.AuditLog; -import com.baidu.palo.common.AuthorizationException; +import com.baidu.palo.common.AuthenticationException; import com.baidu.palo.common.CaseSensibility; import com.baidu.palo.common.Config; -import com.baidu.palo.common.DdlException; import com.baidu.palo.common.PatternMatcher; import com.baidu.palo.common.ThriftServerContext; import com.baidu.palo.common.ThriftServerEventProcessor; +import com.baidu.palo.common.UserException; import com.baidu.palo.load.EtlStatus; import com.baidu.palo.load.LoadJob; import com.baidu.palo.load.MiniEtlTaskInfo; import com.baidu.palo.master.MasterImpl; import com.baidu.palo.mysql.privilege.PrivPredicate; +import com.baidu.palo.planner.StreamLoadPlanner; import com.baidu.palo.qe.AuditBuilder; import com.baidu.palo.qe.ConnectContext; import com.baidu.palo.qe.ConnectProcessor; @@ -48,6 +50,7 @@ import com.baidu.palo.thrift.TColumnDef; import com.baidu.palo.thrift.TColumnDesc; import com.baidu.palo.thrift.TDescribeTableParams; import com.baidu.palo.thrift.TDescribeTableResult; +import com.baidu.palo.thrift.TExecPlanFragmentParams; import com.baidu.palo.thrift.TFeResult; import com.baidu.palo.thrift.TFetchResourceResult; import com.baidu.palo.thrift.TFinishTaskRequest; @@ -57,6 +60,12 @@ import com.baidu.palo.thrift.TGetTablesParams; import com.baidu.palo.thrift.TGetTablesResult; import com.baidu.palo.thrift.TListTableStatusResult; import com.baidu.palo.thrift.TLoadCheckRequest; +import com.baidu.palo.thrift.TLoadTxnBeginRequest; +import com.baidu.palo.thrift.TLoadTxnBeginResult; +import com.baidu.palo.thrift.TLoadTxnCommitRequest; +import com.baidu.palo.thrift.TLoadTxnCommitResult; +import com.baidu.palo.thrift.TLoadTxnRollbackRequest; +import com.baidu.palo.thrift.TLoadTxnRollbackResult; import com.baidu.palo.thrift.TMasterOpRequest; import com.baidu.palo.thrift.TMasterOpResult; import com.baidu.palo.thrift.TMasterResult; @@ -70,12 +79,18 @@ import com.baidu.palo.thrift.TShowVariableRequest; import com.baidu.palo.thrift.TShowVariableResult; import com.baidu.palo.thrift.TStatus; import com.baidu.palo.thrift.TStatusCode; +import com.baidu.palo.thrift.TStreamLoadPutRequest; +import com.baidu.palo.thrift.TStreamLoadPutResult; import com.baidu.palo.thrift.TTableStatus; import com.baidu.palo.thrift.TUniqueId; import 
com.baidu.palo.thrift.TUpdateExportTaskStatusRequest; import com.baidu.palo.thrift.TUpdateMiniEtlTaskStatusRequest; +import com.baidu.palo.transaction.LabelAlreadyExistsException; +import com.baidu.palo.transaction.TabletCommitInfo; +import com.baidu.palo.transaction.TransactionState; import com.google.common.base.Joiner; +import com.google.common.base.Strings; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -314,16 +329,13 @@ public class FrontendServiceImpl implements FrontendService.Iface { LOG.info("mini load request is {}", request); ConnectContext context = new ConnectContext(null); - String cluster; + String cluster = SystemInfoService.DEFAULT_CLUSTER; if (request.isSetCluster()) { cluster = request.cluster; - } else { - cluster = SystemInfoService.DEFAULT_CLUSTER; } - final String dbFullName = ClusterNamespace.getFullName(cluster, request.db); - request.setUser(request.user); - request.setDb(dbFullName); + final String fullDbName = ClusterNamespace.getFullName(cluster, request.db); + request.setDb(fullDbName); context.setCluster(cluster); context.setDatabase(ClusterNamespace.getFullName(cluster, request.db)); context.setQualifiedUser(ClusterNamespace.getFullName(cluster, request.user)); @@ -334,12 +346,43 @@ public class FrontendServiceImpl implements FrontendService.Iface { TStatus status = new TStatus(TStatusCode.OK); TFeResult result = new TFeResult(FrontendServiceVersion.V1, status); try { + if (request.isSetIs_retry() && request.isIs_retry()) { + // this may be a retry request from Backends, + // so we first check if the load job has already been submitted. + // TODO(cmy): + // The Backend will retry the mini load request if it encounters a timeout exception. + // So this code here is to avoid returning a 'label already used' message to the user + // because of the timeout retry. + // But this may still cause a 'label already used' error if the timeout is set too short, + // because there is no lock to guarantee the atomic operation between the 'isLabelUsed' and 'addLabel' + // methods. + // But the default timeout is set to 3 seconds, so in the common case, it will not be a problem. + if (request.isSetSubLabel()) { + if (ExecuteEnv.getInstance().getMultiLoadMgr().isLabelUsed(fullDbName, + request.getLabel(), + request.getSubLabel(), + request.getTimestamp())) { + LOG.info("multi mini load job has already been submitted. label: {}, sub label: {}, " + + "timestamp: {}", + request.getLabel(), request.getSubLabel(), request.getTimestamp()); + return result; + } + } else { + if (Catalog.getCurrentCatalog().getLoadInstance().isLabelUsed(fullDbName, + request.getLabel(), + request.getTimestamp())) { + LOG.info("mini load job has already been submitted. label: {}, timestamp: {}", + request.getLabel(), request.getTimestamp()); + return result; + } + } + } + if (request.isSetSubLabel()) { ExecuteEnv.getInstance().getMultiLoadMgr().load(request); } else { - if (!Catalog.getInstance().getLoadInstance().addLoadJob(request)) { - return result; - } + // try to add load job, label will be checked here.
+ Catalog.getInstance().getLoadInstance().addLoadJob(request); try { // gen mini load audit log @@ -348,13 +391,18 @@ public class FrontendServiceImpl implements FrontendService.Iface { LOG.warn("failed log mini load stmt", e); } } - } catch (DdlException e) { - LOG.error("add mini load error", e); + } catch (UserException e) { + LOG.warn("add mini load error", e); status.setStatus_code(TStatusCode.ANALYSIS_ERROR); status.setError_msgs(Lists.newArrayList(e.getMessage())); + } catch (Throwable e) { + LOG.warn("unexpected exception when adding mini load", e); + status.setStatus_code(TStatusCode.ANALYSIS_ERROR); + status.setError_msgs(Lists.newArrayList(e.getMessage())); + } finally { + ConnectContext.remove(); } - ConnectContext.remove(); return result; } @@ -467,53 +515,8 @@ public class FrontendServiceImpl implements FrontendService.Iface { return result; } - @Override - public TFeResult loadCheck(TLoadCheckRequest request) throws TException { - LOG.info("Load check request is {}", request); - - - TStatus status = new TStatus(TStatusCode.OK); - TFeResult result = new TFeResult(FrontendServiceVersion.V1, status); - String cluster; - if (request.isSetCluster()) { - cluster = request.cluster; - } else { - cluster = SystemInfoService.DEFAULT_CLUSTER; - } - - final String dbFullName = ClusterNamespace.getFullName(cluster, request.db); - - try { - checkPasswordAndPrivs(cluster, request.user, request.passwd, request.db, request.tbl, request.user_ip, - PrivPredicate.LOAD); - } catch (AuthorizationException e) { - status.setStatus_code(TStatusCode.ANALYSIS_ERROR); - status.setError_msgs(Lists.newArrayList(e.getMessage())); - return result; - } - - if (request.isSetLabel()) { - // Only single table will be set label - try { - if (request.isSetTimestamp()) { - Catalog.getInstance().getLoadInstance().checkLabelUsed( - dbFullName, request.getLabel(), request.getTimestamp()); - } else { - Catalog.getInstance().getLoadInstance().checkLabelUsed( - dbFullName, request.getLabel(), 0); - } - } catch (DdlException e) { - status.setStatus_code(TStatusCode.INTERNAL_ERROR); - status.setError_msgs(Lists.newArrayList(e.getMessage())); - return result; - } - } - - return result; - } - private void checkPasswordAndPrivs(String cluster, String user, String passwd, String db, String tbl, - String clientIp, PrivPredicate predicate) throws AuthorizationException { + String clientIp, PrivPredicate predicate) throws AuthenticationException { final String fullUserName = ClusterNamespace.getFullName(cluster, user); final String fullDbName = ClusterNamespace.getFullName(cluster, db); @@ -521,15 +524,218 @@ public class FrontendServiceImpl implements FrontendService.Iface { if (!Catalog.getCurrentCatalog().getAuth().checkPlainPassword(fullUserName, clientIp, passwd)) { - throw new AuthorizationException("Access denied for " + throw new AuthenticationException("Access denied for " + fullUserName + "@" + clientIp); } if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(clientIp, fullDbName, fullUserName, tbl, predicate)) { - throw new AuthorizationException( + throw new AuthenticationException( "Access denied; you need (at least one of) the LOAD privilege(s) for this operation"); } } + + @Override + public TFeResult loadCheck(TLoadCheckRequest request) throws TException { + LOG.info("load check request. 
label: {}, user: {}, ip: {}", + request.getLabel(), request.getUser(), request.getUser_ip()); + + TStatus status = new TStatus(TStatusCode.OK); + TFeResult result = new TFeResult(FrontendServiceVersion.V1, status); + try { + String cluster = SystemInfoService.DEFAULT_CLUSTER; + if (request.isSetCluster()) { + cluster = request.cluster; + } + + checkPasswordAndPrivs(cluster, request.getUser(), request.getPasswd(), request.getDb(), + request.getTbl(), request.getUser_ip(), PrivPredicate.LOAD); + } catch (UserException e) { + status.setStatus_code(TStatusCode.ANALYSIS_ERROR); + status.setError_msgs(Lists.newArrayList(e.getMessage())); + return result; + } catch (Throwable e) { + LOG.warn("catch unknown result.", e); + status.setStatus_code(TStatusCode.INTERNAL_ERROR); + status.setError_msgs(Lists.newArrayList(e.getMessage())); + return result; + } + + return result; + } + + @Override + public TLoadTxnBeginResult loadTxnBegin(TLoadTxnBeginRequest request) throws TException { + LOG.info("receive loadTxnBegin request, request={}", request); + TLoadTxnBeginResult result = new TLoadTxnBeginResult(); + TStatus status = new TStatus(TStatusCode.OK); + result.setStatus(status); + try { + result.setTxnId(loadTxnBeginImpl(request)); + } catch (LabelAlreadyExistsException e) { + status.setStatus_code(TStatusCode.LABEL_ALREADY_EXISTS); + status.setError_msgs(Lists.newArrayList(e.getMessage())); + } catch (UserException e) { + status.setStatus_code(TStatusCode.ANALYSIS_ERROR); + status.setError_msgs(Lists.newArrayList(e.getMessage())); + } + return result; + } + + private long loadTxnBeginImpl(TLoadTxnBeginRequest request) throws UserException { + String cluster = request.getCluster(); + if (Strings.isNullOrEmpty(cluster)) { + cluster = SystemInfoService.DEFAULT_CLUSTER; + } + + checkPasswordAndPrivs(cluster, request.getUser(), request.getPasswd(), request.getDb(), + request.getTbl(), request.getUser_ip(), PrivPredicate.LOAD); + + // check label + if (Strings.isNullOrEmpty(request.getLabel())) { + throw new UserException("empty label in begin request"); + } + // check database + Catalog catalog = Catalog.getInstance(); + String fullDbName = ClusterNamespace.getFullName(cluster, request.getDb()); + Database db = catalog.getDb(fullDbName); + if (db == null) { + String dbName = fullDbName; + if (Strings.isNullOrEmpty(request.getCluster())) { + dbName = request.getDb(); + } + throw new UserException("unknown database, database=" + dbName); + } + // begin + return Catalog.getCurrentGlobalTransactionMgr().beginTransaction( + db.getId(), request.getLabel(), "streamLoad", + TransactionState.LoadJobSourceType.BACKEND_STREAMING); + } + + @Override + public TLoadTxnCommitResult loadTxnCommit(TLoadTxnCommitRequest request) throws TException { + LOG.info("receive loadTxnCommit request, request={}", request); + TLoadTxnCommitResult result = new TLoadTxnCommitResult(); + TStatus status = new TStatus(TStatusCode.OK); + result.setStatus(status); + try { + if (!loadTxnCommitImpl(request)) { + // committed successfully but not yet visible + status.setStatus_code(TStatusCode.PUBLISH_TIMEOUT); + status.setError_msgs( + Lists.newArrayList("transaction commit successfully, BUT data will be visible later")); + } + } catch (UserException e) { + status.setStatus_code(TStatusCode.ANALYSIS_ERROR); + status.addToError_msgs(e.getMessage()); + } + return result; + } + + // return true if both commit and publish succeed, return false if publish times out + private boolean loadTxnCommitImpl(TLoadTxnCommitRequest request) throws UserException {
String cluster = request.getCluster(); + if (Strings.isNullOrEmpty(cluster)) { + cluster = SystemInfoService.DEFAULT_CLUSTER; + } + + checkPasswordAndPrivs(cluster, request.getUser(), request.getPasswd(), request.getDb(), + request.getTbl(), request.getUser_ip(), PrivPredicate.LOAD); + + // get database + Catalog catalog = Catalog.getInstance(); + String fullDbName = ClusterNamespace.getFullName(cluster, request.getDb()); + Database db = catalog.getDb(fullDbName); + if (db == null) { + String dbName = fullDbName; + if (Strings.isNullOrEmpty(request.getCluster())) { + dbName = request.getDb(); + } + throw new UserException("unknown database, database=" + dbName); + } + return Catalog.getCurrentGlobalTransactionMgr().commitAndPublishTransaction( + db, request.getTxnId(), + TabletCommitInfo.fromThrift(request.getCommitInfos()), + 5000); + } + + @Override + public TLoadTxnRollbackResult loadTxnRollback(TLoadTxnRollbackRequest request) throws TException { + LOG.info("receive loadTxnRollback request, request={}", request); + + TLoadTxnRollbackResult result = new TLoadTxnRollbackResult(); + TStatus status = new TStatus(TStatusCode.OK); + result.setStatus(status); + try { + loadTxnRollbackImpl(request); + } catch (UserException e) { + status.setStatus_code(TStatusCode.ANALYSIS_ERROR); + status.addToError_msgs(e.getMessage()); + } + + return result; + } + + private void loadTxnRollbackImpl(TLoadTxnRollbackRequest request) throws UserException { + String cluster = request.getCluster(); + if (Strings.isNullOrEmpty(cluster)) { + cluster = SystemInfoService.DEFAULT_CLUSTER; + } + + checkPasswordAndPrivs(cluster, request.getUser(), request.getPasswd(), request.getDb(), + request.getTbl(), request.getUser_ip(), PrivPredicate.LOAD); + + Catalog.getCurrentGlobalTransactionMgr().abortTransaction(request.getTxnId(), + request.isSetReason() ? 
request.getReason() : "system cancel"); + } + + @Override + public TStreamLoadPutResult streamLoadPut(TStreamLoadPutRequest request) throws TException { + LOG.info("receive streamLoadPut request, request={}", request); + + TStreamLoadPutResult result = new TStreamLoadPutResult(); + TStatus status = new TStatus(TStatusCode.OK); + result.setStatus(status); + try { + result.setParams(streamLoadPutImpl(request)); + } catch (UserException e) { + status.setStatus_code(TStatusCode.ANALYSIS_ERROR); + status.addToError_msgs(e.getMessage()); + } + return result; + } + + private TExecPlanFragmentParams streamLoadPutImpl(TStreamLoadPutRequest request) throws UserException { + String cluster = request.getCluster(); + if (Strings.isNullOrEmpty(cluster)) { + cluster = SystemInfoService.DEFAULT_CLUSTER; + } + + Catalog catalog = Catalog.getInstance(); + String fullDbName = ClusterNamespace.getFullName(cluster, request.getDb()); + Database db = catalog.getDb(fullDbName); + if (db == null) { + String dbName = fullDbName; + if (Strings.isNullOrEmpty(request.getCluster())) { + dbName = request.getDb(); + } + throw new UserException("unknown database, database=" + dbName); + } + db.readLock(); + try { + Table table = db.getTable(request.getTbl()); + if (table == null) { + throw new UserException("unknown table, table=" + request.getTbl()); + } + if (!(table instanceof OlapTable)) { + throw new UserException("load table type is not OlapTable, type=" + table.getClass()); + } + StreamLoadPlanner planner = new StreamLoadPlanner(db, (OlapTable) table, request); + return planner.plan(); + } finally { + db.readUnlock(); + } + } + } diff --git a/fe/src/main/java/com/baidu/palo/task/AgentBatchTask.java b/fe/src/main/java/com/baidu/palo/task/AgentBatchTask.java index de005ef219..b0171faeac 100644 --- a/fe/src/main/java/com/baidu/palo/task/AgentBatchTask.java +++ b/fe/src/main/java/com/baidu/palo/task/AgentBatchTask.java @@ -24,14 +24,18 @@ import com.baidu.palo.thrift.TAgentTaskRequest; import com.baidu.palo.thrift.TAlterTabletReq; import com.baidu.palo.thrift.TCancelDeleteDataReq; import com.baidu.palo.thrift.TCheckConsistencyReq; +import com.baidu.palo.thrift.TClearAlterTaskRequest; +import com.baidu.palo.thrift.TClearTransactionTaskRequest; import com.baidu.palo.thrift.TCloneReq; import com.baidu.palo.thrift.TCreateTabletReq; import com.baidu.palo.thrift.TDownloadReq; import com.baidu.palo.thrift.TDropTabletReq; import com.baidu.palo.thrift.TMoveDirReq; import com.baidu.palo.thrift.TNetworkAddress; +import com.baidu.palo.thrift.TPublishVersionRequest; import com.baidu.palo.thrift.TPushReq; import com.baidu.palo.thrift.TPushType; +import com.baidu.palo.thrift.TRecoverTabletReq; import com.baidu.palo.thrift.TReleaseSnapshotRequest; import com.baidu.palo.thrift.TSnapshotRequest; import com.baidu.palo.thrift.TStorageMediumMigrateReq; @@ -116,14 +120,12 @@ public class AgentBatchTask implements Runnable { agentTaskRequests.add(toAgentTaskRequest(task)); } client.submit_tasks(agentTaskRequests); - if (LOG.isDebugEnabled()) { for (AgentTask task : tasks) { LOG.debug("send task: type[{}], backend[{}], signature[{}]", task.getTaskType(), backendId, task.getSignature()); } } - ok = true; } catch (Exception e) { LOG.warn("task exec error. 
backend[{}]", backendId, e); @@ -148,21 +150,28 @@ public class AgentBatchTask implements Runnable { case CREATE: { CreateReplicaTask createReplicaTask = (CreateReplicaTask) task; TCreateTabletReq request = createReplicaTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setCreate_tablet_req(request); return tAgentTaskRequest; } case DROP: { DropReplicaTask dropReplicaTask = (DropReplicaTask) task; TDropTabletReq request = dropReplicaTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setDrop_tablet_req(request); return tAgentTaskRequest; } + case REALTIME_PUSH: case PUSH: { PushTask pushTask = (PushTask) task; TPushReq request = pushTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setPush_req(request); if (pushTask.getPushType() == TPushType.LOAD || pushTask.getPushType() == TPushType.LOAD_DELETE) { tAgentTaskRequest.setResource_info(pushTask.getResourceInfo()); @@ -173,14 +182,18 @@ public class AgentBatchTask implements Runnable { case CLONE: { CloneTask cloneTask = (CloneTask) task; TCloneReq request = cloneTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setClone_req(request); return tAgentTaskRequest; } case ROLLUP: { CreateRollupTask rollupTask = (CreateRollupTask) task; TAlterTabletReq request = rollupTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setAlter_tablet_req(request); tAgentTaskRequest.setResource_info(rollupTask.getResourceInfo()); return tAgentTaskRequest; @@ -188,7 +201,9 @@ public class AgentBatchTask implements Runnable { case SCHEMA_CHANGE: { SchemaChangeTask schemaChangeTask = (SchemaChangeTask) task; TAlterTabletReq request = schemaChangeTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setAlter_tablet_req(request); tAgentTaskRequest.setResource_info(schemaChangeTask.getResourceInfo()); return tAgentTaskRequest; @@ -196,60 +211,113 @@ public class AgentBatchTask implements Runnable { case CANCEL_DELETE: { CancelDeleteTask cancelDeleteTask = (CancelDeleteTask) task; TCancelDeleteDataReq request = cancelDeleteTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setCancel_delete_data_req(request); return tAgentTaskRequest; } case STORAGE_MEDIUM_MIGRATE: { StorageMediaMigrationTask migrationTask = (StorageMediaMigrationTask) task; TStorageMediumMigrateReq request = migrationTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setStorage_medium_migrate_req(request); return tAgentTaskRequest; } case CHECK_CONSISTENCY: { CheckConsistencyTask checkConsistencyTask = (CheckConsistencyTask) task; TCheckConsistencyReq request = checkConsistencyTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setCheck_consistency_req(request); return tAgentTaskRequest; } case MAKE_SNAPSHOT: { SnapshotTask snapshotTask = (SnapshotTask) task; TSnapshotRequest request = snapshotTask.toThrift(); - LOG.debug(request.toString()); + if 
(LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setSnapshot_req(request); return tAgentTaskRequest; } case RELEASE_SNAPSHOT: { ReleaseSnapshotTask releaseSnapshotTask = (ReleaseSnapshotTask) task; TReleaseSnapshotRequest request = releaseSnapshotTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setRelease_snapshot_req(request); return tAgentTaskRequest; } case UPLOAD: { UploadTask uploadTask = (UploadTask) task; TUploadReq request = uploadTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setUpload_req(request); return tAgentTaskRequest; } case DOWNLOAD: { DownloadTask downloadTask = (DownloadTask) task; TDownloadReq request = downloadTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setDownload_req(request); return tAgentTaskRequest; } + case PUBLISH_VERSION: { + PublishVersionTask publishVersionTask = (PublishVersionTask) task; + TPublishVersionRequest request = publishVersionTask.toThrift(); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } + tAgentTaskRequest.setPublish_version_req(request); + return tAgentTaskRequest; + } + case CLEAR_ALTER_TASK: { + ClearAlterTask clearAlterTask = (ClearAlterTask) task; + TClearAlterTaskRequest request = clearAlterTask.toThrift(); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } + tAgentTaskRequest.setClear_alter_task_req(request); + return tAgentTaskRequest; + } + case CLEAR_TRANSACTION_TASK: { + ClearTransactionTask clearTransactionTask = (ClearTransactionTask) task; + TClearTransactionTaskRequest request = clearTransactionTask.toThrift(); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } + tAgentTaskRequest.setClear_transaction_task_req(request); + return tAgentTaskRequest; + } case MOVE: { DirMoveTask dirMoveTask = (DirMoveTask) task; TMoveDirReq request = dirMoveTask.toThrift(); - LOG.debug(request.toString()); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } tAgentTaskRequest.setMove_dir_req(request); return tAgentTaskRequest; } + case RECOVER_TABLET: { + RecoverTabletTask recoverTabletTask = (RecoverTabletTask) task; + TRecoverTabletReq request = recoverTabletTask.toThrift(); + if (LOG.isDebugEnabled()) { + LOG.debug(request.toString()); + } + tAgentTaskRequest.setRecover_tablet_req(request); + return tAgentTaskRequest; + } default: + LOG.debug("could not find task type for task [{}]", task); return null; } } diff --git a/fe/src/main/java/com/baidu/palo/task/AgentTask.java b/fe/src/main/java/com/baidu/palo/task/AgentTask.java index 159076c937..d59aebca6a 100644 --- a/fe/src/main/java/com/baidu/palo/task/AgentTask.java +++ b/fe/src/main/java/com/baidu/palo/task/AgentTask.java @@ -21,7 +21,7 @@ import com.baidu.palo.thrift.TTaskType; public abstract class AgentTask { private long signature; private long backendId; - private TTaskType taskType; + protected TTaskType taskType; protected long dbId; protected long tableId; @@ -33,8 +33,8 @@ public abstract class AgentTask { protected int failedTimes; - public AgentTask(TResourceInfo resourceInfo, long backendId, long signature, TTaskType taskType, - long dbId, long tableId, long partitionId, long indexId, long tabletId) { + public AgentTask(TResourceInfo resourceInfo, long backendId, TTaskType taskType, + long dbId, long tableId, long partitionId, 
long indexId, long tabletId, long signature) { this.backendId = backendId; this.signature = signature; this.taskType = taskType; @@ -49,6 +49,11 @@ public abstract class AgentTask { this.failedTimes = 0; } + + public AgentTask(TResourceInfo resourceInfo, long backendId, TTaskType taskType, + long dbId, long tableId, long partitionId, long indexId, long tabletId) { + this(resourceInfo, backendId, taskType, dbId, tableId, partitionId, indexId, tabletId, tabletId); + } public long getSignature() { return this.signature; diff --git a/fe/src/main/java/com/baidu/palo/task/AgentTaskExecutor.java b/fe/src/main/java/com/baidu/palo/task/AgentTaskExecutor.java index 9f290527ca..450e28eda1 100644 --- a/fe/src/main/java/com/baidu/palo/task/AgentTaskExecutor.java +++ b/fe/src/main/java/com/baidu/palo/task/AgentTaskExecutor.java @@ -25,6 +25,9 @@ public class AgentTaskExecutor { } public static void submit(AgentBatchTask task) { + if (task == null) { + return; + } EXECUTOR.submit(task); } diff --git a/fe/src/main/java/com/baidu/palo/task/AgentTaskQueue.java b/fe/src/main/java/com/baidu/palo/task/AgentTaskQueue.java index 96cf2fbd4a..fc6a1cf333 100644 --- a/fe/src/main/java/com/baidu/palo/task/AgentTaskQueue.java +++ b/fe/src/main/java/com/baidu/palo/task/AgentTaskQueue.java @@ -28,6 +28,7 @@ import org.apache.logging.log4j.Logger; import java.util.ArrayList; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; @@ -86,12 +87,12 @@ public class AgentTaskQueue { * add version, versionHash and TPushType to help */ public static synchronized void removePushTask(long backendId, long signature, long version, long versionHash, - TPushType pushType) { - if (!tasks.contains(backendId, TTaskType.PUSH)) { + TPushType pushType, TTaskType taskType) { + if (!tasks.contains(backendId, taskType)) { return; } - Map signatureMap = tasks.get(backendId, TTaskType.PUSH); + Map signatureMap = tasks.get(backendId, taskType); AgentTask task = signatureMap.get(signature); if (task == null) { return; @@ -104,7 +105,7 @@ public class AgentTaskQueue { } signatureMap.remove(signature); - LOG.debug("remove task: type[{}], backend[{}], signature[{}]", TTaskType.PUSH, backendId, signature); + LOG.debug("remove task: type[{}], backend[{}], signature[{}]", taskType, backendId, signature); --taskNum; } @@ -152,7 +153,7 @@ public class AgentTaskQueue { return diffTasks; } - public static synchronized void removeReplicaRelatedTasks(long backendId, long signature) { + public static synchronized void removeReplicaRelatedTasks(long backendId, long tabletId) { if (!tasks.containsRow(backendId)) { return; } @@ -161,10 +162,20 @@ public class AgentTaskQueue { for (TTaskType type : TTaskType.values()) { if (backendTasks.containsKey(type)) { Map typeTasks = backendTasks.get(type); - if (typeTasks.containsKey(signature)) { - typeTasks.remove(signature); - LOG.debug("remove task: type[{}], backend[{}], signature[{}]", type, backendId, signature); - --taskNum; + if (type == TTaskType.REALTIME_PUSH) { + Iterator taskIterator = typeTasks.values().iterator(); + while (taskIterator.hasNext()) { + PushTask realTimePushTask = (PushTask) taskIterator.next(); + if (tabletId == realTimePushTask.getTabletId()) { + taskIterator.remove(); + } + } + } else { + if (typeTasks.containsKey(tabletId)) { + typeTasks.remove(tabletId); + LOG.debug("remove task: type[{}], backend[{}], signature[{}]", type, backendId, tabletId); + --taskNum; + } } } } // end for types diff --git 
a/fe/src/main/java/com/baidu/palo/task/CancelDeleteTask.java b/fe/src/main/java/com/baidu/palo/task/CancelDeleteTask.java index 6c948c0521..8e7686a0ea 100644 --- a/fe/src/main/java/com/baidu/palo/task/CancelDeleteTask.java +++ b/fe/src/main/java/com/baidu/palo/task/CancelDeleteTask.java @@ -25,7 +25,7 @@ public class CancelDeleteTask extends AgentTask { public CancelDeleteTask(long backendId, long dbId, long tableId, long partitionId, long indexId, long tabletId, int schemaHash, long version, long versionHash) { - super(null, backendId, tabletId, TTaskType.CANCEL_DELETE, dbId, tableId, partitionId, indexId, tabletId); + super(null, backendId, TTaskType.CANCEL_DELETE, dbId, tableId, partitionId, indexId, tabletId); this.schemaHash = schemaHash; this.version = version; diff --git a/fe/src/main/java/com/baidu/palo/task/CheckConsistencyTask.java b/fe/src/main/java/com/baidu/palo/task/CheckConsistencyTask.java index 320d506145..ea9a17ef43 100644 --- a/fe/src/main/java/com/baidu/palo/task/CheckConsistencyTask.java +++ b/fe/src/main/java/com/baidu/palo/task/CheckConsistencyTask.java @@ -28,8 +28,7 @@ public class CheckConsistencyTask extends AgentTask { public CheckConsistencyTask(TResourceInfo resourceInfo, long backendId, long dbId, long tableId, long partitionId, long indexId, long tabletId, int schemaHash, long version, long versionHash) { - super(resourceInfo, backendId, tabletId, TTaskType.CHECK_CONSISTENCY, dbId, tableId, partitionId, indexId, - tabletId); + super(resourceInfo, backendId, TTaskType.CHECK_CONSISTENCY, dbId, tableId, partitionId, indexId, tabletId); this.schemaHash = schemaHash; this.version = version; diff --git a/fe/src/main/java/com/baidu/palo/task/CloneTask.java b/fe/src/main/java/com/baidu/palo/task/CloneTask.java index e5e57fea27..bee1a03f92 100644 --- a/fe/src/main/java/com/baidu/palo/task/CloneTask.java +++ b/fe/src/main/java/com/baidu/palo/task/CloneTask.java @@ -34,7 +34,7 @@ public class CloneTask extends AgentTask { public CloneTask(long backendId, long dbId, long tableId, long partitionId, long indexId, long tabletId, int schemaHash, List srcBackends, TStorageMedium storageMedium, long committedVersion, long committedVersionHash) { - super(null, backendId, tabletId, TTaskType.CLONE, dbId, tableId, partitionId, indexId, tabletId); + super(null, backendId, TTaskType.CLONE, dbId, tableId, partitionId, indexId, tabletId); this.schemaHash = schemaHash; this.srcBackends = srcBackends; this.storageMedium = storageMedium; diff --git a/fe/src/main/java/com/baidu/palo/task/CreateReplicaTask.java b/fe/src/main/java/com/baidu/palo/task/CreateReplicaTask.java index 1f1a2b632f..a5bd0e8bae 100644 --- a/fe/src/main/java/com/baidu/palo/task/CreateReplicaTask.java +++ b/fe/src/main/java/com/baidu/palo/task/CreateReplicaTask.java @@ -61,7 +61,7 @@ public class CreateReplicaTask extends AgentTask { KeysType keysType, TStorageType storageType, TStorageMedium storageMedium, List columns, Set bfColumns, double bfFpp, MarkedCountDownLatch latch) { - super(null, backendId, tabletId, TTaskType.CREATE, dbId, tableId, partitionId, indexId, tabletId); + super(null, backendId, TTaskType.CREATE, dbId, tableId, partitionId, indexId, tabletId); this.shortKeyColumnCount = shortKeyColumnCount; this.schemaHash = schemaHash; diff --git a/fe/src/main/java/com/baidu/palo/task/CreateRollupTask.java b/fe/src/main/java/com/baidu/palo/task/CreateRollupTask.java index 8b1746e7bd..1167730b2c 100644 --- a/fe/src/main/java/com/baidu/palo/task/CreateRollupTask.java +++ 
b/fe/src/main/java/com/baidu/palo/task/CreateRollupTask.java @@ -54,8 +54,7 @@ public class CreateRollupTask extends AgentTask { long baseTabletId, long rollupReplicaId, short shortKeyColumnCount, int rollupSchemaHash, int baseSchemaHash, TStorageType storageType, List rollupColumns, Set bfColumns, double bfFpp, TKeysType keysType) { - super(resourceInfo, backendId, rollupTabletId, TTaskType.ROLLUP, dbId, tableId, partitionId, rollupIndexId, - rollupTabletId); + super(resourceInfo, backendId, TTaskType.ROLLUP, dbId, tableId, partitionId, rollupIndexId, rollupTabletId); this.baseTableId = baseIndexId; this.baseTabletId = baseTabletId; diff --git a/fe/src/main/java/com/baidu/palo/task/DirMoveTask.java b/fe/src/main/java/com/baidu/palo/task/DirMoveTask.java index 3f0c262621..fc69f67d9b 100644 --- a/fe/src/main/java/com/baidu/palo/task/DirMoveTask.java +++ b/fe/src/main/java/com/baidu/palo/task/DirMoveTask.java @@ -34,7 +34,7 @@ public class DirMoveTask extends AgentTask { public DirMoveTask(TResourceInfo resourceInfo, long backendId, long signature, long jobId, long dbId, long tableId, long partitionId, long indexId, long tabletId, String src, int schemaHash, boolean overwrite) { - super(resourceInfo, backendId, signature, TTaskType.MOVE, dbId, tableId, partitionId, indexId, tabletId); + super(resourceInfo, backendId, TTaskType.MOVE, dbId, tableId, partitionId, indexId, tabletId, signature); this.jobId = jobId; this.src = src; this.schemaHash = schemaHash; diff --git a/fe/src/main/java/com/baidu/palo/task/DownloadTask.java b/fe/src/main/java/com/baidu/palo/task/DownloadTask.java index 053c05894a..f9bbec34f9 100644 --- a/fe/src/main/java/com/baidu/palo/task/DownloadTask.java +++ b/fe/src/main/java/com/baidu/palo/task/DownloadTask.java @@ -37,7 +37,7 @@ public class DownloadTask extends AgentTask { public DownloadTask(TResourceInfo resourceInfo, long backendId, long signature, long jobId, long dbId, Map srcToDestPath, BrokerAddress brokerAddr, Map brokerProperties) { - super(resourceInfo, backendId, signature, TTaskType.DOWNLOAD, dbId, -1, -1, -1, -1); + super(resourceInfo, backendId, TTaskType.DOWNLOAD, dbId, -1, -1, -1, -1, signature); this.jobId = jobId; this.srcToDestPath = srcToDestPath; this.brokerAddr = brokerAddr; diff --git a/fe/src/main/java/com/baidu/palo/task/DropReplicaTask.java b/fe/src/main/java/com/baidu/palo/task/DropReplicaTask.java index fadcd962e9..ef8e806103 100644 --- a/fe/src/main/java/com/baidu/palo/task/DropReplicaTask.java +++ b/fe/src/main/java/com/baidu/palo/task/DropReplicaTask.java @@ -22,7 +22,7 @@ public class DropReplicaTask extends AgentTask { private int schemaHash; // set -1L as unknown public DropReplicaTask(long backendId, long tabletId, int schemaHash) { - super(null, backendId, tabletId, TTaskType.DROP, -1L, -1L, -1L, -1L, tabletId); + super(null, backendId, TTaskType.DROP, -1L, -1L, -1L, -1L, tabletId); this.schemaHash = schemaHash; } diff --git a/fe/src/main/java/com/baidu/palo/task/ExportExportingTask.java b/fe/src/main/java/com/baidu/palo/task/ExportExportingTask.java index f6a42454b6..5bb9164dda 100644 --- a/fe/src/main/java/com/baidu/palo/task/ExportExportingTask.java +++ b/fe/src/main/java/com/baidu/palo/task/ExportExportingTask.java @@ -20,7 +20,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.ClientPool; import com.baidu.palo.common.Config; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.Status; import 
com.baidu.palo.common.util.DebugUtil; import com.baidu.palo.common.util.ProfileManager; @@ -179,7 +179,7 @@ public class ExportExportingTask extends MasterTask { .registerQuery(queryId, coord); needUnregister = true; actualExecCoord(queryId, coord); - } catch (InternalException e) { + } catch (UserException e) { LOG.warn("export exporting internal error. {}", e.getMessage()); } finally { if (needUnregister) { diff --git a/fe/src/main/java/com/baidu/palo/task/HadoopLoadPendingTask.java b/fe/src/main/java/com/baidu/palo/task/HadoopLoadPendingTask.java index e6839072aa..4d7ea7dfa0 100644 --- a/fe/src/main/java/com/baidu/palo/task/HadoopLoadPendingTask.java +++ b/fe/src/main/java/com/baidu/palo/task/HadoopLoadPendingTask.java @@ -63,23 +63,29 @@ public class HadoopLoadPendingTask extends LoadPendingTask { @Override protected void createEtlRequest() throws Exception { - EtlTaskConf taskConf = new EtlTaskConf(); - // output path - taskConf.setOutputPath(getOutputPath()); - // output file pattern - taskConf.setOutputFilePattern(job.getLabel() + ".%(table)s.%(view)s.%(bucket)s"); - // tables (partitions) - Map etlPartitions = createEtlPartitions(); - Preconditions.checkNotNull(etlPartitions); - taskConf.setEtlPartitions(etlPartitions); - - LoadErrorHub.Param info = load.getLoadErrorHubInfo(); - if (info != null) { - taskConf.setHubInfo(new EtlErrorHubInfo(this.job.getId(), info)); + // yiguolei: add a db read lock here? because the schema maybe changed during create etl task + db.readLock(); + try { + EtlTaskConf taskConf = new EtlTaskConf(); + // output path + taskConf.setOutputPath(getOutputPath()); + // output file pattern + taskConf.setOutputFilePattern(job.getLabel() + ".%(table)s.%(view)s.%(bucket)s"); + // tables (partitions) + Map etlPartitions = createEtlPartitions(); + Preconditions.checkNotNull(etlPartitions); + taskConf.setEtlPartitions(etlPartitions); + + LoadErrorHub.Param info = load.getLoadErrorHubInfo(); + if (info != null) { + taskConf.setHubInfo(new EtlErrorHubInfo(this.job.getId(), info)); + } + + etlTaskConf = taskConf.toDppTaskConf(); + Preconditions.checkNotNull(etlTaskConf); + } finally { + db.readUnlock(); } - - etlTaskConf = taskConf.toDppTaskConf(); - Preconditions.checkNotNull(etlTaskConf); } @Override diff --git a/fe/src/main/java/com/baidu/palo/task/LoadEtlTask.java b/fe/src/main/java/com/baidu/palo/task/LoadEtlTask.java index 2c422d306b..adcd8cceda 100644 --- a/fe/src/main/java/com/baidu/palo/task/LoadEtlTask.java +++ b/fe/src/main/java/com/baidu/palo/task/LoadEtlTask.java @@ -37,15 +37,12 @@ import com.baidu.palo.load.TabletLoadInfo; import com.baidu.palo.thrift.TEtlState; import com.google.common.collect.Maps; -import com.google.common.collect.Sets; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.Map; import java.util.Map.Entry; -import java.util.Random; -import java.util.Set; public abstract class LoadEtlTask extends MasterTask { private static final Logger LOG = LogManager.getLogger(LoadEtlTask.class); @@ -155,20 +152,6 @@ public abstract class LoadEtlTask extends MasterTask { private void tryUpdateLoading() { // check job has loading partitions Map idToTableLoadInfo = job.getIdToTableLoadInfo(); - Set partitionIds = Sets.newHashSet(); - for (TableLoadInfo tableLoadInfo : idToTableLoadInfo.values()) { - Map idToPartitionLoadInfo = tableLoadInfo.getIdToPartitionLoadInfo(); - for (Entry entry : idToPartitionLoadInfo.entrySet()) { - PartitionLoadInfo partitionLoadInfo = entry.getValue(); - if 
(partitionLoadInfo.isNeedLoad()) { - partitionIds.add(entry.getKey()); - } - } - } - if (!load.addLoadingPartitions(partitionIds)) { - LOG.info("load job has unfinished loading partitions. job: {}, job partitions: {}", job, partitionIds); - return; - } // new version and version hash try { @@ -200,9 +183,7 @@ public abstract class LoadEtlTask extends MasterTask { if (partition == null) { throw new MetaNotFoundException("partition does not exist. id: " + partitionId); } - - partitionLoadInfo.setVersion(partition.getCommittedVersion() + 1); - partitionLoadInfo.setVersionHash(Math.abs(new Random().nextLong())); + // yiguolei: real time load do not need get version here } finally { db.readUnlock(); } @@ -213,7 +194,9 @@ public abstract class LoadEtlTask extends MasterTask { } } catch (MetaNotFoundException e) { // remove loading partitions - load.removeLoadingPartitions(partitionIds); + // yiguolei: partitionids is only used to check if there is a loading job running on a partition + // it is useless in real time load since it could run concurrently + // load.removeLoadingPartitions(partitionIds); load.cancelLoadJob(job, CancelType.ETL_RUN_FAIL, e.getMessage()); return; } @@ -223,7 +206,12 @@ public abstract class LoadEtlTask extends MasterTask { LOG.info("update job state to loading success. job: {}", job); } else { // remove loading partitions - load.removeLoadingPartitions(partitionIds); + // yiguolei: do not need remove any more, since we have not add it into + // load.removeLoadingPartitions(partitionIds); + LOG.warn("update job state to loading failed. job: {}", job); + if (job.getTransactionId() > 0) { + LOG.warn("there maybe remaining transactionid {} in transaction table", job.getTransactionId()); + } } } @@ -277,6 +265,7 @@ public abstract class LoadEtlTask extends MasterTask { throw new LoadException("unknown distribution type. type: " + distributionType.name()); } + // yiguolei: how to deal with filesize == -1? for (MaterializedIndex materializedIndex : partition.getMaterializedIndices()) { long indexId = materializedIndex.getId(); int tabletIndex = 0; diff --git a/fe/src/main/java/com/baidu/palo/task/LoadPendingTask.java b/fe/src/main/java/com/baidu/palo/task/LoadPendingTask.java index 6d979c0493..212a93c408 100644 --- a/fe/src/main/java/com/baidu/palo/task/LoadPendingTask.java +++ b/fe/src/main/java/com/baidu/palo/task/LoadPendingTask.java @@ -24,11 +24,12 @@ import com.baidu.palo.load.LoadChecker; import com.baidu.palo.load.LoadJob; import com.baidu.palo.load.LoadJob.JobState; import com.baidu.palo.thrift.TStatusCode; +import com.baidu.palo.transaction.TransactionState.LoadJobSourceType; import com.google.common.base.Joiner; -import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.util.List; @@ -66,9 +67,17 @@ public abstract class LoadPendingTask extends MasterTask { load.cancelLoadJob(job, CancelType.ETL_SUBMIT_FAIL, "db does not exist. 
id: " + dbId); return; } - - // create etl request + try { + // yiguolei: get transactionid here, because create etl request will get schema and partition info + // create etl request and make some guarantee for schema change and rollup + if (job.getTransactionId() < 0) { + long transactionId = Catalog.getCurrentGlobalTransactionMgr().beginTransaction(dbId, + job.getLabel(), + "fe", + LoadJobSourceType.FRONTEND); + job.setTransactionId(transactionId); + } createEtlRequest(); } catch (Exception e) { LOG.info("create etl request failed.{}", e); diff --git a/fe/src/main/java/com/baidu/palo/task/MiniLoadPendingTask.java b/fe/src/main/java/com/baidu/palo/task/MiniLoadPendingTask.java index 335563aa99..b633296b09 100644 --- a/fe/src/main/java/com/baidu/palo/task/MiniLoadPendingTask.java +++ b/fe/src/main/java/com/baidu/palo/task/MiniLoadPendingTask.java @@ -21,7 +21,7 @@ import com.baidu.palo.analysis.TupleDescriptor; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.OlapTable; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.LoadException; import com.baidu.palo.common.Pair; import com.baidu.palo.load.MiniEtlTaskInfo; @@ -160,7 +160,7 @@ public class MiniLoadPendingTask extends LoadPendingTask { desc.computeMemLayout(); try { csvScanNode.finalize(null); - } catch (InternalException e) { + } catch (UserException e) { LOG.warn("csvScanNode finalize failed[err={}]", e); throw new LoadException("CSV scan finalize failed.", e); } diff --git a/fe/src/main/java/com/baidu/palo/task/PullLoadEtlTask.java b/fe/src/main/java/com/baidu/palo/task/PullLoadEtlTask.java index 7948c3b3a0..012ff0d809 100644 --- a/fe/src/main/java/com/baidu/palo/task/PullLoadEtlTask.java +++ b/fe/src/main/java/com/baidu/palo/task/PullLoadEtlTask.java @@ -25,9 +25,12 @@ import com.baidu.palo.thrift.TEtlState; import com.google.common.collect.Maps; import java.util.Map; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; // Used to process pull load etl task public class PullLoadEtlTask extends LoadEtlTask { + private static final Logger LOG = LogManager.getLogger(PullLoadEtlTask.class); private PullLoadJobMgr mgr; public PullLoadEtlTask(LoadJob job) { @@ -54,6 +57,10 @@ public class PullLoadEtlTask extends LoadEtlTask { protected boolean updateJobEtlStatus() { PullLoadJob pullLoadJob = mgr.getJob(job.getId()); EtlStatus etlStatus = job.getEtlJobStatus(); + if (pullLoadJob == null) { + LOG.warn("pullLoadJob is null. 
JobId is {}", job.getId()); + return false; + } switch (pullLoadJob.getState()) { case CANCELED: case FAILED: diff --git a/fe/src/main/java/com/baidu/palo/task/PullLoadJobMgr.java b/fe/src/main/java/com/baidu/palo/task/PullLoadJobMgr.java index dee1bd26bf..f7d237de24 100644 --- a/fe/src/main/java/com/baidu/palo/task/PullLoadJobMgr.java +++ b/fe/src/main/java/com/baidu/palo/task/PullLoadJobMgr.java @@ -15,7 +15,7 @@ package com.baidu.palo.task; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.Status; import com.baidu.palo.thrift.TStatusCode; @@ -135,11 +135,11 @@ public class PullLoadJobMgr { public class TaskExecutor implements Runnable { - private void processOneTask(PullLoadTask task, PullLoadJob job) throws InternalException { + private void processOneTask(PullLoadTask task, PullLoadJob job) throws UserException { int retryTime = 3; for (int i = 0; i < retryTime; ++i) { if (!job.isRunning()) { - throw new InternalException("Job has been cancelled."); + throw new UserException("Job has been cancelled."); } task.executeOnce(); if (task.isFinished()) { diff --git a/fe/src/main/java/com/baidu/palo/task/PullLoadPendingTask.java b/fe/src/main/java/com/baidu/palo/task/PullLoadPendingTask.java index ede3ca8088..2e00bc36f0 100644 --- a/fe/src/main/java/com/baidu/palo/task/PullLoadPendingTask.java +++ b/fe/src/main/java/com/baidu/palo/task/PullLoadPendingTask.java @@ -18,7 +18,7 @@ package com.baidu.palo.task; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.OlapTable; import com.baidu.palo.common.DdlException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.util.BrokerUtil; import com.baidu.palo.load.BrokerFileGroup; import com.baidu.palo.load.EtlSubmitResult; @@ -103,7 +103,7 @@ public class PullLoadPendingTask extends LoadPendingTask { private void getAllFileStatus(Map>> fileStatusMap, Map fileNumMap) - throws InternalException { + throws UserException { for (Map.Entry> entry : job.getPullLoadSourceInfo().getIdToFileGroups().entrySet()) { long tableId = entry.getKey(); diff --git a/fe/src/main/java/com/baidu/palo/task/PullLoadTask.java b/fe/src/main/java/com/baidu/palo/task/PullLoadTask.java index cb114f8a57..c2cc929673 100644 --- a/fe/src/main/java/com/baidu/palo/task/PullLoadTask.java +++ b/fe/src/main/java/com/baidu/palo/task/PullLoadTask.java @@ -19,7 +19,7 @@ import com.baidu.palo.analysis.BrokerDesc; import com.baidu.palo.catalog.Database; import com.baidu.palo.catalog.OlapTable; import com.baidu.palo.common.Config; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.Status; import com.baidu.palo.load.BrokerFileGroup; import com.baidu.palo.qe.Coordinator; @@ -88,7 +88,7 @@ public class PullLoadTask { this.execMemLimit = execMemLimit; } - public void init(List> fileStatusList, int fileNum) throws InternalException { + public void init(List> fileStatusList, int fileNum) throws UserException { planner = new PullLoadTaskPlanner(this); planner.plan(fileStatusList, fileNum); } @@ -192,10 +192,10 @@ public class PullLoadTask { } } - public void executeOnce() throws InternalException { + public void executeOnce() throws UserException { synchronized (this) { if (curThread != null) { - throw new InternalException("Task already executing."); + throw new UserException("Task already executing."); } curThread = Thread.currentThread(); executeState = State.RUNNING; 
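
For readers following the load path, the LoadPendingTask change above amounts to a transaction-first flow: a transaction is opened once per job before the ETL request is built, so the schema and partition information read while building the request is bound to a transaction id (the "guarantee for schema change and rollup" mentioned in the added comment). A minimal illustrative sketch of that flow follows; Catalog, GlobalTransactionMgr, LoadJobSourceType, getTransactionId()/setTransactionId(), getLabel(), and createEtlRequest() are the identifiers used in the patch, while the wrapper method name prepareEtl and its signature are assumed scaffolding for illustration only.

    // Illustrative sketch only, not the full LoadPendingTask implementation.
    void prepareEtl(LoadJob job, long dbId) throws Exception {
        // Begin a transaction only once per job; a negative id means "not started yet".
        if (job.getTransactionId() < 0) {
            long txnId = Catalog.getCurrentGlobalTransactionMgr().beginTransaction(
                    dbId,
                    job.getLabel(),              // load label doubles as the transaction label
                    "fe",                        // coordinator tag, as in the patch
                    LoadJobSourceType.FRONTEND);
            job.setTransactionId(txnId);
        }
        // Build the ETL request after the transaction exists, so the schema/partition
        // snapshot it captures is associated with this transaction.
        createEtlRequest();
    }
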
@@ -217,7 +217,7 @@ public class PullLoadTask { .registerQuery(executeId, curCoordinator); actualExecute(); needUnregister = true; - } catch (InternalException e) { + } catch (UserException e) { onFailed(executeId, new Status(TStatusCode.INTERNAL_ERROR, e.getMessage())); } finally { if (needUnregister) { diff --git a/fe/src/main/java/com/baidu/palo/task/PullLoadTaskPlanner.java b/fe/src/main/java/com/baidu/palo/task/PullLoadTaskPlanner.java index 23c2c0d457..151e9ed441 100644 --- a/fe/src/main/java/com/baidu/palo/task/PullLoadTaskPlanner.java +++ b/fe/src/main/java/com/baidu/palo/task/PullLoadTaskPlanner.java @@ -25,7 +25,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.OlapTable; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.common.NotImplementedException; import com.baidu.palo.planner.BrokerScanNode; import com.baidu.palo.planner.DataPartition; @@ -72,7 +72,7 @@ public class PullLoadTaskPlanner { } // NOTE: DB lock need hold when call this function. - public void plan(List> fileStatusesList, int filesAdded) throws InternalException { + public void plan(List> fileStatusesList, int filesAdded) throws UserException { // Tuple descriptor used for all nodes in plan. OlapTable table = task.table; @@ -122,7 +122,7 @@ public class PullLoadTaskPlanner { splitSink = new DataSplitSink(table, tupleDesc); } catch (AnalysisException e) { LOG.info("New DataSplitSink failed.{}", e); - throw new InternalException(e.getMessage()); + throw new UserException(e.getMessage()); } PlanFragment sinkFragment = new PlanFragment(new PlanFragmentId(1), exchangeNode, splitSink.getOutputPartition()); scanFragment.setDestination(exchangeNode); @@ -137,7 +137,7 @@ public class PullLoadTaskPlanner { fragment.finalize(analyzer, false); } catch (NotImplementedException e) { LOG.info("Fragment finalize failed.{}", e); - throw new InternalException("Fragment finalize failed."); + throw new UserException("Fragment finalize failed."); } } Collections.reverse(fragments); diff --git a/fe/src/main/java/com/baidu/palo/task/PushTask.java b/fe/src/main/java/com/baidu/palo/task/PushTask.java index 0ca078c223..05b02d65ea 100644 --- a/fe/src/main/java/com/baidu/palo/task/PushTask.java +++ b/fe/src/main/java/com/baidu/palo/task/PushTask.java @@ -57,12 +57,16 @@ public class PushTask extends AgentTask { private TPriority priority; private boolean isSyncDelete; private long asyncDeleteJobId; - + + private long transactionId; + private boolean isSchemaChanging; + public PushTask(TResourceInfo resourceInfo, long backendId, long dbId, long tableId, long partitionId, long indexId, long tabletId, long replicaId, int schemaHash, long version, long versionHash, String filePath, long fileSize, int timeoutSecond, long loadJobId, TPushType pushType, - List conditions, boolean needDecompress, TPriority priority) { - super(resourceInfo, backendId, tabletId, TTaskType.PUSH, dbId, tableId, partitionId, indexId, tabletId); + List conditions, boolean needDecompress, TPriority priority, TTaskType taskType, + long transactionId, long signature) { + super(resourceInfo, backendId, taskType, dbId, tableId, partitionId, indexId, tabletId, signature); this.replicaId = replicaId; this.schemaHash = schemaHash; this.version = version; @@ -78,10 +82,26 @@ public class PushTask extends AgentTask { this.priority = priority; this.isSyncDelete = true; this.asyncDeleteJobId = -1; + 
this.transactionId = transactionId; + } + + public PushTask(TResourceInfo resourceInfo, long backendId, long dbId, long tableId, long partitionId, + long indexId, long tabletId, long replicaId, int schemaHash, long version, long versionHash, + String filePath, long fileSize, int timeoutSecond, long loadJobId, TPushType pushType, + List conditions, boolean needDecompress, TPriority priority) { + this(resourceInfo, backendId, dbId, tableId, partitionId, indexId, + tabletId, replicaId, schemaHash, version, versionHash, filePath, + fileSize, timeoutSecond, loadJobId, pushType, conditions, needDecompress, + priority, TTaskType.PUSH, -1, tableId); } public TPushReq toThrift() { TPushReq request = new TPushReq(tabletId, schemaHash, version, versionHash, timeoutSecond, pushType); + if (taskType == TTaskType.REALTIME_PUSH) { + request.setPartition_id(partitionId); + request.setTransaction_id(transactionId); + } + request.setIs_schema_changing(isSchemaChanging); switch (pushType) { case LOAD: case LOAD_DELETE: @@ -187,4 +207,8 @@ public class PushTask extends AgentTask { public long getAsyncDeleteJobId() { return asyncDeleteJobId; } + + public void setIsSchemaChanging(boolean isSchemaChanging) { + this.isSchemaChanging = isSchemaChanging; + } } diff --git a/fe/src/main/java/com/baidu/palo/task/ReleaseSnapshotTask.java b/fe/src/main/java/com/baidu/palo/task/ReleaseSnapshotTask.java index 46ded8f1fe..b57da95645 100644 --- a/fe/src/main/java/com/baidu/palo/task/ReleaseSnapshotTask.java +++ b/fe/src/main/java/com/baidu/palo/task/ReleaseSnapshotTask.java @@ -25,7 +25,7 @@ public class ReleaseSnapshotTask extends AgentTask { public ReleaseSnapshotTask(TResourceInfo resourceInfo, long backendId, long dbId, long tabletId, String snapshotPath) { - super(resourceInfo, backendId, tabletId, TTaskType.RELEASE_SNAPSHOT, dbId, -1, -1, -1, tabletId); + super(resourceInfo, backendId, TTaskType.RELEASE_SNAPSHOT, dbId, -1, -1, -1, tabletId); this.snapshotPath = snapshotPath; } diff --git a/fe/src/main/java/com/baidu/palo/task/SchemaChangeTask.java b/fe/src/main/java/com/baidu/palo/task/SchemaChangeTask.java index fb1bed3519..9a2d21adb0 100644 --- a/fe/src/main/java/com/baidu/palo/task/SchemaChangeTask.java +++ b/fe/src/main/java/com/baidu/palo/task/SchemaChangeTask.java @@ -49,8 +49,7 @@ public class SchemaChangeTask extends AgentTask { List newColumns, int newSchemaHash, int baseSchemaHash, short newShortKeyColumnCount, TStorageType storageType, Set bfColumns, double bfFpp, TKeysType keysType) { - super(resourceInfo, backendId, baseTabletId, TTaskType.SCHEMA_CHANGE, dbId, tableId, partitionId, indexId, - baseTabletId); + super(resourceInfo, backendId, TTaskType.SCHEMA_CHANGE, dbId, tableId, partitionId, indexId, baseTabletId); this.baseReplicaId = baseReplicaId; this.baseSchemaHash = baseSchemaHash; diff --git a/fe/src/main/java/com/baidu/palo/task/SnapshotTask.java b/fe/src/main/java/com/baidu/palo/task/SnapshotTask.java index 22f3cef8e7..227f6aa8a5 100644 --- a/fe/src/main/java/com/baidu/palo/task/SnapshotTask.java +++ b/fe/src/main/java/com/baidu/palo/task/SnapshotTask.java @@ -34,8 +34,8 @@ public class SnapshotTask extends AgentTask { public SnapshotTask(TResourceInfo resourceInfo, long backendId, long signature, long jobId, long dbId, long tableId, long partitionId, long indexId, long tabletId, long version, long versionHash, int schemaHash, long timeout, boolean isRestoreTask) { - super(resourceInfo, backendId, signature, TTaskType.MAKE_SNAPSHOT, dbId, tableId, partitionId, indexId, - tabletId); + 
super(resourceInfo, backendId, TTaskType.MAKE_SNAPSHOT, dbId, tableId, partitionId, indexId, tabletId, + signature); this.jobId = jobId; @@ -79,4 +79,4 @@ public class SnapshotTask extends AgentTask { request.setList_files(true); return request; } -} \ No newline at end of file +} diff --git a/fe/src/main/java/com/baidu/palo/task/StorageMediaMigrationTask.java b/fe/src/main/java/com/baidu/palo/task/StorageMediaMigrationTask.java index 95110c39d2..cb9f82d378 100644 --- a/fe/src/main/java/com/baidu/palo/task/StorageMediaMigrationTask.java +++ b/fe/src/main/java/com/baidu/palo/task/StorageMediaMigrationTask.java @@ -26,7 +26,7 @@ public class StorageMediaMigrationTask extends AgentTask { public StorageMediaMigrationTask(long backendId, long tabletId, int schemaHash, TStorageMedium toStorageMedium) { - super(null, backendId, tabletId, TTaskType.STORAGE_MEDIUM_MIGRATE, -1L, -1L, -1L, -1L, tabletId); + super(null, backendId, TTaskType.STORAGE_MEDIUM_MIGRATE, -1L, -1L, -1L, -1L, tabletId); this.schemaHash = schemaHash; this.toStorageMedium = toStorageMedium; diff --git a/fe/src/main/java/com/baidu/palo/task/UploadTask.java b/fe/src/main/java/com/baidu/palo/task/UploadTask.java index 3e1a3eaca9..b3b799f10f 100644 --- a/fe/src/main/java/com/baidu/palo/task/UploadTask.java +++ b/fe/src/main/java/com/baidu/palo/task/UploadTask.java @@ -33,7 +33,7 @@ public class UploadTask extends AgentTask { public UploadTask(TResourceInfo resourceInfo, long backendId, long signature, long jobId, Long dbId, Map srcToDestPath, BrokerAddress brokerAddr, Map brokerProperties) { - super(resourceInfo, backendId, signature, TTaskType.UPLOAD, dbId, -1, -1, -1, -1); + super(resourceInfo, backendId, TTaskType.UPLOAD, dbId, -1, -1, -1, -1, signature); this.jobId = jobId; this.srcToDestPath = srcToDestPath; this.brokerAddress = brokerAddr; diff --git a/fe/src/main/jflex/sql_scanner.flex b/fe/src/main/jflex/sql_scanner.flex index 9a616ca99a..4d50cd3701 100644 --- a/fe/src/main/jflex/sql_scanner.flex +++ b/fe/src/main/jflex/sql_scanner.flex @@ -63,6 +63,7 @@ import com.baidu.palo.common.util.SqlUtils; static { keywordMap.put("&&", new Integer(SqlParserSymbols.KW_AND)); keywordMap.put("add", new Integer(SqlParserSymbols.KW_ADD)); + keywordMap.put("admin", new Integer(SqlParserSymbols.KW_ADMIN)); keywordMap.put("after", new Integer(SqlParserSymbols.KW_AFTER)); keywordMap.put("aggregate", new Integer(SqlParserSymbols.KW_AGGREGATE)); keywordMap.put("all", new Integer(SqlParserSymbols.KW_ALL)); @@ -130,6 +131,7 @@ import com.baidu.palo.common.util.SqlUtils; keywordMap.put("distinctpcsa", new Integer(SqlParserSymbols.KW_DISTINCTPCSA)); keywordMap.put("distinctpcsa", new Integer(SqlParserSymbols.KW_DISTINCTPCSA)); keywordMap.put("distributed", new Integer(SqlParserSymbols.KW_DISTRIBUTED)); + keywordMap.put("distribution", new Integer(SqlParserSymbols.KW_DISTRIBUTION)); keywordMap.put("buckets", new Integer(SqlParserSymbols.KW_BUCKETS)); keywordMap.put("div", new Integer(SqlParserSymbols.KW_DIV)); keywordMap.put("double", new Integer(SqlParserSymbols.KW_DOUBLE)); @@ -238,6 +240,7 @@ import com.baidu.palo.common.util.SqlUtils; keywordMap.put("rename", new Integer(SqlParserSymbols.KW_RENAME)); keywordMap.put("repeatable", new Integer(SqlParserSymbols.KW_REPEATABLE)); keywordMap.put("replace", new Integer(SqlParserSymbols.KW_REPLACE)); + keywordMap.put("replica", new Integer(SqlParserSymbols.KW_REPLICA)); keywordMap.put("repository", new Integer(SqlParserSymbols.KW_REPOSITORY)); keywordMap.put("repositories", new 
Integer(SqlParserSymbols.KW_REPOSITORIES)); keywordMap.put("resource", new Integer(SqlParserSymbols.KW_RESOURCE)); diff --git a/fe/src/test/java/com/baidu/palo/analysis/AlterClusterStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/AlterClusterStmtTest.java index 31b7534639..38c775fe67 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/AlterClusterStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/AlterClusterStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -57,7 +57,7 @@ public class AlterClusterStmtTest { } @Test - public void testAnalyzeNormal() throws InternalException, AnalysisException { + public void testAnalyzeNormal() throws UserException, AnalysisException { final Map properties = new HashMap(); properties.put("instance_num", "2"); final AlterClusterStmt stmt = new AlterClusterStmt("testCluster", properties); @@ -68,14 +68,14 @@ public class AlterClusterStmtTest { } @Test(expected = AnalysisException.class) - public void testNoPropertiesFail() throws InternalException, AnalysisException { + public void testNoPropertiesFail() throws UserException, AnalysisException { final AlterClusterStmt stmt = new AlterClusterStmt("testCluster", null); stmt.analyze(analyzer); Assert.fail("no exception"); } @Test(expected = AnalysisException.class) - public void testParamNumberFormatError() throws InternalException, AnalysisException { + public void testParamNumberFormatError() throws UserException, AnalysisException { final Map properties = new HashMap(); properties.put("instance_num", "0xfffffff"); final AlterClusterStmt stmt = new AlterClusterStmt("testCluster", properties); diff --git a/fe/src/test/java/com/baidu/palo/analysis/AlterTableStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/AlterTableStmtTest.java index 1bdd894b4e..8f0542f70a 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/AlterTableStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/AlterTableStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -67,7 +67,7 @@ public class AlterTableStmtTest { } @Test - public void testNormal() throws AnalysisException, InternalException { + public void testNormal() throws AnalysisException, UserException { List ops = Lists.newArrayList(); ops.add(new DropColumnClause("col1", "", null)); ops.add(new DropColumnClause("col2", "", null)); @@ -79,16 +79,8 @@ public class AlterTableStmtTest { Assert.assertEquals(2, stmt.getOps().size()); } - @Test - public void testNoPriv() throws AnalysisException, InternalException { - List ops = Lists.newArrayList(); - ops.add(new DropColumnClause("col1", "", null)); - AlterTableStmt stmt = new AlterTableStmt(new TableName("testDb", "testTbl"), ops); - stmt.analyze(AccessTestUtil.fetchBlockAnalyzer()); - } - @Test(expected = AnalysisException.class) - public void testNoTable() throws AnalysisException, InternalException { + public void testNoTable() throws AnalysisException, UserException { List ops = Lists.newArrayList(); ops.add(new 
DropColumnClause("col1", "", null)); AlterTableStmt stmt = new AlterTableStmt(null, ops); @@ -98,7 +90,7 @@ public class AlterTableStmtTest { } @Test(expected = AnalysisException.class) - public void testNoClause() throws AnalysisException, InternalException { + public void testNoClause() throws AnalysisException, UserException { List ops = Lists.newArrayList(); AlterTableStmt stmt = new AlterTableStmt(new TableName("testDb", "testTbl"), ops); stmt.analyze(analyzer); diff --git a/fe/src/test/java/com/baidu/palo/analysis/BackendStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/BackendStmtTest.java index b67fb63cb2..830d359862 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/BackendStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/BackendStmtTest.java @@ -72,12 +72,6 @@ public class BackendStmtTest { stmt.analyze(analyzer); } - @Test(expected = AnalysisException.class) - public void initBackendsTest2() throws Exception { - BackendClause stmt = createStmt(2); - stmt.analyze(analyzer); - } - @Test(expected = AnalysisException.class) public void initBackendsTest3() throws Exception { BackendClause stmt = createStmt(3); diff --git a/fe/src/test/java/com/baidu/palo/analysis/CancelAlterStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/CancelAlterStmtTest.java index 7643e46dc5..482323aa1b 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/CancelAlterStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/CancelAlterStmtTest.java @@ -20,10 +20,11 @@ package com.baidu.palo.analysis; + import com.baidu.palo.analysis.ShowAlterStmt.AlterType; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -76,7 +77,7 @@ public class CancelAlterStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { // cancel alter column CancelAlterTableStmt stmt = new CancelAlterTableStmt(AlterType.COLUMN, new TableName(null, "testTbl")); stmt.analyze(analyzer); diff --git a/fe/src/test/java/com/baidu/palo/analysis/CreateClusterStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/CreateClusterStmtTest.java index ccfee40873..e0184864d7 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/CreateClusterStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/CreateClusterStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -34,7 +34,6 @@ import java.util.HashMap; import java.util.Map; import mockit.Mocked; -import mockit.internal.startup.Startup; public class CreateClusterStmtTest { @@ -46,7 +45,7 @@ public class CreateClusterStmtTest { private ConnectContext ctx; static { - Startup.initializeIfPossible(); + // Startup.initializeIfPossible(); } @Before() @@ -57,7 +56,7 @@ public class CreateClusterStmtTest { } @Test - public void testAnalyzeNormal() throws InternalException, AnalysisException { + public void testAnalyzeNormal() throws UserException, AnalysisException { final Map properties = new HashMap(); properties.put("instance_num", "2"); final CreateClusterStmt stmt = new 
CreateClusterStmt("testCluster", properties, "password"); @@ -69,7 +68,7 @@ public class CreateClusterStmtTest { } @Test(expected = AnalysisException.class) - public void testAnnlyzeWithException() throws InternalException, AnalysisException { + public void testAnnlyzeWithException() throws UserException, AnalysisException { final CreateClusterStmt stmt = new CreateClusterStmt("testCluster", null, "password"); stmt.analyze(analyzer); Assert.fail("no exception"); diff --git a/fe/src/test/java/com/baidu/palo/analysis/CreateDbStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/CreateDbStmtTest.java index 90b8468838..b5cd1b5536 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/CreateDbStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/CreateDbStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -53,7 +53,7 @@ public class CreateDbStmtTest { } @Test - public void testAnalyzeNormal() throws InternalException, AnalysisException { + public void testAnalyzeNormal() throws UserException, AnalysisException { CreateDbStmt dbStmt = new CreateDbStmt(false, "test"); dbStmt.analyze(analyzer); Assert.assertEquals("testCluster:test", dbStmt.getFullDbName()); @@ -61,7 +61,7 @@ public class CreateDbStmtTest { } @Test(expected = AnalysisException.class) - public void testAnnlyzeWithException() throws InternalException, AnalysisException { + public void testAnnlyzeWithException() throws UserException, AnalysisException { CreateDbStmt stmt = new CreateDbStmt(false, ""); stmt.analyze(analyzer); Assert.fail("no exception"); diff --git a/fe/src/test/java/com/baidu/palo/analysis/CreateTableStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/CreateTableStmtTest.java index a8415d261d..c0afa17ada 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/CreateTableStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/CreateTableStmtTest.java @@ -26,7 +26,7 @@ import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.catalog.KeysType; import com.baidu.palo.catalog.PrimitiveType; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -98,7 +98,7 @@ public class CreateTableStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { CreateTableStmt stmt = new CreateTableStmt(false, false, tblName, cols, "olap", new KeysDesc(KeysType.AGG_KEYS, colsName), null, new HashDistributionDesc(10, Lists.newArrayList("col1")), null, null); @@ -109,7 +109,7 @@ public class CreateTableStmtTest { } @Test - public void testDefaultDbNormal() throws InternalException, AnalysisException { + public void testDefaultDbNormal() throws UserException, AnalysisException { CreateTableStmt stmt = new CreateTableStmt(false, false, tblNameNoDb, cols, "olap", new KeysDesc(KeysType.AGG_KEYS, colsName), null, new HashDistributionDesc(10, Lists.newArrayList("col1")), null, null); @@ -121,7 +121,7 @@ public class CreateTableStmtTest { } @Test(expected = AnalysisException.class) - public void testNoDb() throws 
InternalException, AnalysisException { + public void testNoDb() throws UserException, AnalysisException { // make defalut db return empty; analyzer = EasyMock.createMock(Analyzer.class); EasyMock.expect(analyzer.getDefaultDb()).andReturn("").anyTimes(); @@ -134,7 +134,7 @@ public class CreateTableStmtTest { } @Test(expected = AnalysisException.class) - public void testEmptyCol() throws InternalException, AnalysisException { + public void testEmptyCol() throws UserException, AnalysisException { // make defalut db return empty; List emptyCols = Lists.newArrayList(); CreateTableStmt stmt = new CreateTableStmt(false, false, tblNameNoDb, emptyCols, "olap", @@ -144,12 +144,11 @@ public class CreateTableStmtTest { } @Test(expected = AnalysisException.class) - public void testDupCol() throws InternalException, AnalysisException { + public void testDupCol() throws UserException, AnalysisException { // make defalut db return empty; CreateTableStmt stmt = new CreateTableStmt(false, false, tblNameNoDb, invalidCols, "olap", new KeysDesc(KeysType.AGG_KEYS, invalidColsName), null, new RandomDistributionDesc(10), null, null); stmt.analyze(analyzer); } - } \ No newline at end of file diff --git a/fe/src/test/java/com/baidu/palo/analysis/CreateUserStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/CreateUserStmtTest.java index 4d90fa9a11..aa171a4158 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/CreateUserStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/CreateUserStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -53,11 +53,11 @@ public class CreateUserStmtTest { } @Test - public void testToString() throws InternalException, AnalysisException { + public void testToString() throws UserException, AnalysisException { CreateUserStmt stmt = new CreateUserStmt(new UserDesc(new UserIdentity("user", "%"), "passwd", true)); stmt.analyze(analyzer); - Assert.assertEquals("CREATE USER 'testCluster:user'@'%' IDENTIFIED BY 'passwd'", stmt.toString()); + Assert.assertEquals("CREATE USER 'testCluster:user'@'%' IDENTIFIED BY '*XXX'", stmt.toString()); Assert.assertEquals(new String(stmt.getPassword()), "*59C70DA2F3E3A5BDF46B68F5C8B8F25762BCCEF0"); stmt = new CreateUserStmt( @@ -65,8 +65,7 @@ public class CreateUserStmtTest { stmt.analyze(analyzer); Assert.assertEquals("testCluster:user", stmt.getUserIdent().getQualifiedUser()); - Assert.assertEquals( - "CREATE USER 'testCluster:user'@'%' IDENTIFIED BY PASSWORD '*59c70da2f3e3a5bdf46b68f5c8b8f25762bccef0'", + Assert.assertEquals("CREATE USER 'testCluster:user'@'%' IDENTIFIED BY PASSWORD '*59c70da2f3e3a5bdf46b68f5c8b8f25762bccef0'", stmt.toString()); Assert.assertEquals(new String(stmt.getPassword()), "*59C70DA2F3E3A5BDF46B68F5C8B8F25762BCCEF0"); @@ -78,14 +77,14 @@ public class CreateUserStmtTest { } @Test(expected = AnalysisException.class) - public void testEmptyUser() throws InternalException, AnalysisException { + public void testEmptyUser() throws UserException, AnalysisException { CreateUserStmt stmt = new CreateUserStmt(new UserDesc(new UserIdentity("", "%"), "passwd", true)); stmt.analyze(analyzer); Assert.fail("No exception throws."); } @Test(expected = AnalysisException.class) - public void testBadPass() throws InternalException, AnalysisException { 
+ public void testBadPass() throws UserException, AnalysisException { CreateUserStmt stmt = new CreateUserStmt(new UserDesc(new UserIdentity("", "%"), "passwd", false)); stmt.analyze(analyzer); Assert.fail("No exception throws."); diff --git a/fe/src/test/java/com/baidu/palo/analysis/DeleteStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/DeleteStmtTest.java index 8643223c9e..9ca9fd3645 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/DeleteStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/DeleteStmtTest.java @@ -20,9 +20,9 @@ package com.baidu.palo.analysis; + import com.baidu.palo.analysis.BinaryPredicate.Operator; -import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -76,7 +76,7 @@ public class DeleteStmtTest { DeleteStmt deleteStmt = new DeleteStmt(new TableName("testDb", "testTbl"), "partition", likePredicate, null); try { deleteStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { Assert.assertTrue(e.getMessage().contains("should be compound or binary predicate")); } @@ -91,7 +91,7 @@ public class DeleteStmtTest { try { deleteStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { Assert.assertTrue(e.getMessage().contains("should be AND")); } @@ -103,7 +103,7 @@ public class DeleteStmtTest { deleteStmt = new DeleteStmt(new TableName("testDb", "testTbl"), "partition", compoundPredicate, null); try { deleteStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { Assert.assertTrue(e.getMessage().contains("should be compound or binary predicate")); } @@ -117,7 +117,7 @@ public class DeleteStmtTest { deleteStmt = new DeleteStmt(new TableName("testDb", "testTbl"), "partition", compoundPredicate, null); try { deleteStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { Assert.assertTrue(e.getMessage().contains("Right expr should be value")); } @@ -131,7 +131,7 @@ public class DeleteStmtTest { deleteStmt = new DeleteStmt(new TableName("testDb", "testTbl"), "partition", compoundPredicate, null); try { deleteStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { Assert.assertTrue(e.getMessage().contains("Left expr should be column name")); } @@ -145,7 +145,7 @@ public class DeleteStmtTest { deleteStmt = new DeleteStmt(new TableName("testDb", "testTbl"), null, compoundPredicate, null); try { deleteStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { Assert.assertTrue(e.getMessage().contains("Partition is not set")); } @@ -163,7 +163,7 @@ public class DeleteStmtTest { deleteStmt = new DeleteStmt(new TableName("testDb", "testTbl"), "partition", compoundPredicate, null); try { deleteStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { Assert.fail(); } } diff --git a/fe/src/test/java/com/baidu/palo/analysis/DescribeStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/DescribeStmtTest.java index 4e89e16d3b..69cddf71ce 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/DescribeStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/DescribeStmtTest.java @@ -22,7 +22,7 @@ 
package com.baidu.palo.analysis; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.qe.ConnectContext; import org.easymock.EasyMock; @@ -64,7 +64,7 @@ public class DescribeStmtTest { } @Test - public void testNormal() throws AnalysisException, InternalException { + public void testNormal() throws AnalysisException, UserException { DescribeStmt stmt = new DescribeStmt(new TableName("", "testTbl"), false); stmt.analyze(analyzer); Assert.assertEquals("DESCRIBE `testCluster:testDb.testTbl`", stmt.toString()); @@ -74,7 +74,7 @@ public class DescribeStmtTest { } @Test - public void testAllNormal() throws AnalysisException, InternalException { + public void testAllNormal() throws AnalysisException, UserException { DescribeStmt stmt = new DescribeStmt(new TableName("", "testTbl"), true); stmt.analyze(analyzer); Assert.assertEquals("DESCRIBE `testCluster:testDb.testTbl` ALL", stmt.toString()); diff --git a/fe/src/test/java/com/baidu/palo/analysis/DropClusterStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/DropClusterStmtTest.java index a99b50c9df..0cca09bf2c 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/DropClusterStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/DropClusterStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -64,7 +64,7 @@ public class DropClusterStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { final DropClusterStmt stmt = new DropClusterStmt(true, "testCluster"); stmt.analyze(analyzer); @@ -73,7 +73,7 @@ public class DropClusterStmtTest { } @Test(expected = AnalysisException.class) - public void testFailed() throws InternalException, AnalysisException { + public void testFailed() throws UserException, AnalysisException { DropClusterStmt stmt = new DropClusterStmt(false, ""); stmt.analyze(analyzer); diff --git a/fe/src/test/java/com/baidu/palo/analysis/DropDbStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/DropDbStmtTest.java index 7715463c7f..bda064f691 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/DropDbStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/DropDbStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -53,7 +53,7 @@ public class DropDbStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { DropDbStmt stmt = new DropDbStmt(false, "test"); stmt.analyze(analyzer); @@ -62,7 +62,7 @@ public class DropDbStmtTest { } @Test(expected = AnalysisException.class) - public void testFailed() throws InternalException, AnalysisException { + public void testFailed() throws UserException, AnalysisException { DropDbStmt stmt = new DropDbStmt(false, ""); stmt.analyze(analyzer); @@ -70,7 
+70,7 @@ public class DropDbStmtTest { } @Test(expected = AnalysisException.class) - public void testNoPriv() throws InternalException, AnalysisException { + public void testNoPriv() throws UserException, AnalysisException { DropDbStmt stmt = new DropDbStmt(false, ""); stmt.analyze(AccessTestUtil.fetchBlockAnalyzer()); diff --git a/fe/src/test/java/com/baidu/palo/analysis/DropTableStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/DropTableStmtTest.java index 6e77af01df..27b6c4b498 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/DropTableStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/DropTableStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -65,7 +65,7 @@ public class DropTableStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { DropTableStmt stmt = new DropTableStmt(false, tbl); stmt.analyze(analyzer); Assert.assertEquals("testCluster:db1", stmt.getDbName()); @@ -74,7 +74,7 @@ public class DropTableStmtTest { } @Test - public void testDefaultNormal() throws InternalException, AnalysisException { + public void testDefaultNormal() throws UserException, AnalysisException { DropTableStmt stmt = new DropTableStmt(false, noDbTbl); stmt.analyze(analyzer); Assert.assertEquals("testCluster:testDb", stmt.getDbName()); @@ -83,14 +83,14 @@ public class DropTableStmtTest { } @Test(expected = AnalysisException.class) - public void testNoDbFail() throws InternalException, AnalysisException { + public void testNoDbFail() throws UserException, AnalysisException { DropTableStmt stmt = new DropTableStmt(false, noDbTbl); stmt.analyze(noDbAnalyzer); Assert.fail("No Exception throws."); } @Test(expected = AnalysisException.class) - public void testNoTableFail() throws InternalException, AnalysisException { + public void testNoTableFail() throws UserException, AnalysisException { DropTableStmt stmt = new DropTableStmt(false, new TableName("db1", "")); stmt.analyze(noDbAnalyzer); Assert.fail("No Exception throws."); diff --git a/fe/src/test/java/com/baidu/palo/analysis/DropUserStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/DropUserStmtTest.java index e90534bece..ee7cc4182e 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/DropUserStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/DropUserStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -53,7 +53,7 @@ public class DropUserStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { DropUserStmt stmt = new DropUserStmt(new UserIdentity("user", "%")); stmt.analyze(analyzer); Assert.assertEquals("DROP USER 'testCluster:user'@'%'", stmt.toString()); @@ -61,7 +61,7 @@ public class DropUserStmtTest { } @Test(expected = AnalysisException.class) - public void testNoUser() throws InternalException, AnalysisException { + public 
void testNoUser() throws UserException, AnalysisException { DropUserStmt stmt = new DropUserStmt(new UserIdentity("", "%")); stmt.analyze(analyzer); Assert.fail("No Exception throws."); diff --git a/fe/src/test/java/com/baidu/palo/analysis/GrantStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/GrantStmtTest.java index a7a45ed647..a8898492cf 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/GrantStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/GrantStmtTest.java @@ -23,7 +23,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.catalog.AccessPrivilege; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -79,7 +79,7 @@ public class GrantStmtTest { } @Test - public void testNormal() throws AnalysisException, InternalException { + public void testNormal() throws AnalysisException, UserException { GrantStmt stmt; List privileges = Lists.newArrayList(AccessPrivilege.ALL); @@ -94,7 +94,7 @@ public class GrantStmtTest { } @Test(expected = AnalysisException.class) - public void testUserFail() throws AnalysisException, InternalException { + public void testUserFail() throws AnalysisException, UserException { GrantStmt stmt; List privileges = Lists.newArrayList(AccessPrivilege.ALL); diff --git a/fe/src/test/java/com/baidu/palo/analysis/LinkDbStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/LinkDbStmtTest.java index 66762c2b3b..c8b9db59a9 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/LinkDbStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/LinkDbStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -54,7 +54,7 @@ public class LinkDbStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { final ClusterName cn1 = new ClusterName("testCluster1", "testDb1"); final ClusterName cn2 = new ClusterName("testCluster2", "testDb2"); final LinkDbStmt stmt = new LinkDbStmt(cn1, cn2); @@ -65,7 +65,7 @@ public class LinkDbStmtTest { } @Test(expected = AnalysisException.class) - public void testParamError() throws InternalException, AnalysisException { + public void testParamError() throws UserException, AnalysisException { final ClusterName cn1 = new ClusterName("testCluster1", ""); final ClusterName cn2 = new ClusterName("testCluster2", "testDb2"); final LinkDbStmt stmt = new LinkDbStmt(cn1, cn2); diff --git a/fe/src/test/java/com/baidu/palo/analysis/LoadStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/LoadStmtTest.java index ffba8c0893..e46a1ba822 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/LoadStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/LoadStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -72,7 +72,7 @@ public class LoadStmtTest { } @Test - public void testNormal() throws 
InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { desc.analyze(EasyMock.anyString()); EasyMock.expectLastCall().anyTimes(); EasyMock.replay(desc); @@ -88,7 +88,7 @@ public class LoadStmtTest { } @Test(expected = AnalysisException.class) - public void testNoData() throws InternalException, AnalysisException { + public void testNoData() throws UserException, AnalysisException { desc.analyze(EasyMock.anyString()); EasyMock.expectLastCall().anyTimes(); EasyMock.replay(desc); diff --git a/fe/src/test/java/com/baidu/palo/analysis/MigrateDbStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/MigrateDbStmtTest.java index 9d92d47d38..7b175bd907 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/MigrateDbStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/MigrateDbStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -54,7 +54,7 @@ public class MigrateDbStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { final ClusterName cn1 = new ClusterName("testCluster1", "testDb1"); final ClusterName cn2 = new ClusterName("testCluster2", "testDb2"); final MigrateDbStmt stmt = new MigrateDbStmt(cn1, cn2); @@ -65,7 +65,7 @@ public class MigrateDbStmtTest { } @Test(expected = AnalysisException.class) - public void testParamError() throws InternalException, AnalysisException { + public void testParamError() throws UserException, AnalysisException { final ClusterName cn1 = new ClusterName("testCluster1", ""); final ClusterName cn2 = new ClusterName("testCluster2", "testDb2"); final MigrateDbStmt stmt = new MigrateDbStmt(cn1, cn2); diff --git a/fe/src/test/java/com/baidu/palo/analysis/SetPassVarTest.java b/fe/src/test/java/com/baidu/palo/analysis/SetPassVarTest.java index 9b75d62fb8..9239cd7636 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/SetPassVarTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/SetPassVarTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -53,7 +53,7 @@ public class SetPassVarTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { SetPassVar stmt; // mode: SET PASSWORD FOR 'testUser' = 'testPass'; @@ -61,23 +61,23 @@ public class SetPassVarTest { stmt.analyze(analyzer); Assert.assertEquals("testCluster:testUser", stmt.getUserIdent().getQualifiedUser()); Assert.assertEquals("*88EEBA7D913688E7278E2AD071FDB5E76D76D34B", new String(stmt.getPassword())); - Assert.assertEquals("SET PASSWORD FOR 'testCluster:testUser'@'%' = '*88EEBA7D913688E7278E2AD071FDB5E76D76D34B'", + Assert.assertEquals("SET PASSWORD FOR 'testCluster:testUser'@'%' = '*XXX'", stmt.toString()); // empty password stmt = new SetPassVar(new UserIdentity("testUser", "%"), null); stmt.analyze(analyzer); - Assert.assertEquals("SET PASSWORD FOR 'testCluster:testUser'@'%' 
= ''", stmt.toString()); + Assert.assertEquals("SET PASSWORD FOR 'testCluster:testUser'@'%' = '*XXX'", stmt.toString()); // empty user // empty password stmt = new SetPassVar(null, null); stmt.analyze(analyzer); - Assert.assertEquals("SET PASSWORD FOR 'testCluster:testUser'@'192.168.1.1' = ''", stmt.toString()); + Assert.assertEquals("SET PASSWORD FOR 'testCluster:testUser'@'192.168.1.1' = '*XXX'", stmt.toString()); } @Test(expected = AnalysisException.class) - public void testBadPassword() throws InternalException, AnalysisException { + public void testBadPassword() throws UserException, AnalysisException { SetPassVar stmt; // mode: SET PASSWORD FOR 'testUser' = 'testPass'; stmt = new SetPassVar(new UserIdentity("testUser", "%"), "*88EEBAHD913688E7278E2AD071FDB5E76D76D34B"); diff --git a/fe/src/test/java/com/baidu/palo/analysis/SetStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/SetStmtTest.java index e522f6ac83..c7fc6f50f3 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/SetStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/SetStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -57,7 +57,7 @@ public class SetStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { List vars = Lists.newArrayList(new SetVar("times", new IntLiteral(100L)), new SetVar(SetType.GLOBAL, "names", new StringLiteral("utf-8"))); SetStmt stmt = new SetStmt(vars); @@ -69,7 +69,7 @@ public class SetStmtTest { } @Test(expected = AnalysisException.class) - public void testNoVar() throws InternalException, AnalysisException { + public void testNoVar() throws UserException, AnalysisException { SetStmt stmt = new SetStmt(Lists.newArrayList()); stmt.analyze(analyzer); @@ -77,7 +77,7 @@ public class SetStmtTest { } @Test(expected = AnalysisException.class) - public void testNullVar() throws InternalException, AnalysisException { + public void testNullVar() throws UserException, AnalysisException { SetStmt stmt = new SetStmt(null); stmt.analyze(analyzer); diff --git a/fe/src/test/java/com/baidu/palo/analysis/SetUserPropertyStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/SetUserPropertyStmtTest.java index c8933e0da6..05fc1bf146 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/SetUserPropertyStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/SetUserPropertyStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -57,9 +57,9 @@ public class SetUserPropertyStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { List propertyVarList = Lists.newArrayList(); - propertyVarList.add(new SetUserPropertyVar("load_cluster.palo-dpp", null)); + propertyVarList.add(new SetUserPropertyVar("load_cluster.palo-dpp", "")); propertyVarList.add(new SetUserPropertyVar("quota.normal", "100")); SetUserPropertyStmt stmt = 
new SetUserPropertyStmt("testUser", propertyVarList); @@ -68,7 +68,7 @@ public class SetUserPropertyStmtTest { } @Test(expected = AnalysisException.class) - public void testNoProperty() throws InternalException, AnalysisException { + public void testNoProperty() throws UserException, AnalysisException { SetUserPropertyStmt stmt = new SetUserPropertyStmt("testUser", null); stmt.analyze(analyzer); Assert.fail("No exception throws"); diff --git a/fe/src/test/java/com/baidu/palo/analysis/SetUserPropertyVarTest.java b/fe/src/test/java/com/baidu/palo/analysis/SetUserPropertyVarTest.java index 4984c343f8..29ea0e52c8 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/SetUserPropertyVarTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/SetUserPropertyVarTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import org.junit.Assert; import org.junit.Before; @@ -36,20 +36,20 @@ public class SetUserPropertyVarTest { } @Test - public void testNormal() throws AnalysisException, InternalException { + public void testNormal() throws AnalysisException, UserException { SetUserPropertyVar var = new SetUserPropertyVar("quota.normal", "1000"); var.analyze(analyzer, true); Assert.assertEquals("quota.normal", var.getPropertyKey()); Assert.assertEquals("1000", var.getPropertyValue()); Assert.assertEquals("'quota.normal' = '1000'", var.toString()); - var = new SetUserPropertyVar("load_cluster.palo-dpp", null); + var = new SetUserPropertyVar("load_cluster.palo-dpp", ""); var.analyze(analyzer, true); - Assert.assertEquals("'load_cluster.palo-dpp' = NULL", var.toString()); + Assert.assertEquals("'load_cluster.palo-dpp' = ''", var.toString()); } @Test(expected = AnalysisException.class) - public void testUnknownProperty() throws InternalException, AnalysisException { + public void testUnknownProperty() throws UserException, AnalysisException { SetUserPropertyVar var = new SetUserPropertyVar("unknown_property", "1000"); var.analyze(analyzer, true); Assert.fail("No exception throws."); diff --git a/fe/src/test/java/com/baidu/palo/analysis/SetVarTest.java b/fe/src/test/java/com/baidu/palo/analysis/SetVarTest.java index af8fdf0b6a..232efb92e0 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/SetVarTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/SetVarTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -53,7 +53,7 @@ public class SetVarTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { SetVar var = new SetVar(SetType.DEFAULT, "names", new StringLiteral("utf-8")); var.analyze(analyzer); @@ -71,7 +71,7 @@ public class SetVarTest { } @Test(expected = AnalysisException.class) - public void testNoVariable() throws InternalException, AnalysisException { + public void testNoVariable() throws UserException, AnalysisException { SetVar var = new SetVar(SetType.DEFAULT, "", new StringLiteral("utf-8")); var.analyze(analyzer); Assert.fail("No exception throws."); diff --git a/fe/src/test/java/com/baidu/palo/analysis/ShowAlterStmtTest.java 
b/fe/src/test/java/com/baidu/palo/analysis/ShowAlterStmtTest.java index be5bc914f9..c743f986ff 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/ShowAlterStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/ShowAlterStmtTest.java @@ -23,7 +23,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.analysis.BinaryPredicate.Operator; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import org.easymock.EasyMock; import org.junit.Assert; @@ -59,7 +59,7 @@ public class ShowAlterStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { ShowLoadStmt stmt = new ShowLoadStmt(null, null, null, null); stmt.analyze(analyzer); Assert.assertEquals("SHOW LOAD FROM `testDb`", stmt.toString()); @@ -79,7 +79,7 @@ public class ShowAlterStmtTest { } @Test(expected = AnalysisException.class) - public void testNoDb() throws InternalException, AnalysisException { + public void testNoDb() throws UserException, AnalysisException { analyzer = EasyMock.createMock(Analyzer.class); EasyMock.expect(analyzer.getDefaultDb()).andReturn("").anyTimes(); EasyMock.expect(analyzer.getClusterName()).andReturn("testCluster").anyTimes(); diff --git a/fe/src/test/java/com/baidu/palo/analysis/ShowCreateDbStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/ShowCreateDbStmtTest.java index 85dbe7701f..a04a5892a9 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/ShowCreateDbStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/ShowCreateDbStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -51,7 +51,7 @@ public class ShowCreateDbStmtTest { } @Test - public void testNormal() throws AnalysisException, InternalException { + public void testNormal() throws AnalysisException, UserException { ShowCreateDbStmt stmt = new ShowCreateDbStmt("testDb"); stmt.analyze(AccessTestUtil.fetchAdminAnalyzer(true)); Assert.assertEquals("testCluster:testDb", stmt.getDb()); @@ -60,7 +60,7 @@ public class ShowCreateDbStmtTest { } @Test(expected = AnalysisException.class) - public void testEmptyDb() throws AnalysisException, InternalException { + public void testEmptyDb() throws AnalysisException, UserException { ShowCreateDbStmt stmt = new ShowCreateDbStmt(""); stmt.analyze(AccessTestUtil.fetchAdminAnalyzer(false)); Assert.fail("No exception throws."); diff --git a/fe/src/test/java/com/baidu/palo/analysis/ShowDataStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/ShowDataStmtTest.java index 1e25297d5d..e142c51a6d 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/ShowDataStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/ShowDataStmtTest.java @@ -25,7 +25,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Database; import com.baidu.palo.catalog.TabletInvertedIndex; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.mysql.privilege.PrivPredicate; import com.baidu.palo.qe.ConnectContext; @@ -125,7 +125,7 @@ 
public class ShowDataStmtTest { } @Test - public void testNormal() throws AnalysisException, InternalException { + public void testNormal() throws AnalysisException, UserException { ShowDataStmt stmt = new ShowDataStmt(null, null); stmt.analyze(analyzer); Assert.assertEquals("SHOW DATA FROM `testCluster:testDb`", stmt.toString()); diff --git a/fe/src/test/java/com/baidu/palo/analysis/ShowDbStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/ShowDbStmtTest.java index c90654d05f..2263fcec73 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/ShowDbStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/ShowDbStmtTest.java @@ -24,11 +24,11 @@ import org.junit.Assert; import org.junit.Test; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; public class ShowDbStmtTest { @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { final Analyzer analyzer = AccessTestUtil.fetchBlockAnalyzer(); ShowDbStmt stmt = new ShowDbStmt(null); stmt.analyze(analyzer); diff --git a/fe/src/test/java/com/baidu/palo/analysis/ShowLoadStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/ShowLoadStmtTest.java index 745b88fe84..2773230222 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/ShowLoadStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/ShowLoadStmtTest.java @@ -23,7 +23,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.analysis.ShowAlterStmt.AlterType; import com.baidu.palo.catalog.Catalog; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.system.SystemInfoService; import org.easymock.EasyMock; @@ -68,7 +68,7 @@ public class ShowLoadStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { ShowAlterStmt stmt = new ShowAlterStmt(AlterType.COLUMN, null); stmt.analyze(analyzer); Assert.assertEquals("SHOW ALTER COLUMN FROM `testCluster:testDb`", stmt.toString()); @@ -79,7 +79,7 @@ public class ShowLoadStmtTest { } @Test(expected = AnalysisException.class) - public void testNoDb() throws InternalException, AnalysisException { + public void testNoDb() throws UserException, AnalysisException { analyzer = EasyMock.createMock(Analyzer.class); EasyMock.expect(analyzer.getDefaultDb()).andReturn("").anyTimes(); EasyMock.expect(analyzer.getClusterName()).andReturn("testCluster").anyTimes(); diff --git a/fe/src/test/java/com/baidu/palo/analysis/ShowUserPropertyStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/ShowUserPropertyStmtTest.java index 24f55c4a60..45822fefad 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/ShowUserPropertyStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/ShowUserPropertyStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -53,7 +53,7 @@ public class ShowUserPropertyStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { ShowUserPropertyStmt stmt = new 
ShowUserPropertyStmt("testUser", "%load_cluster%"); stmt.analyze(analyzer); Assert.assertEquals("SHOW PROPERTY FOR 'testCluster:testUser' LIKE '%load_cluster%'", stmt.toString()); diff --git a/fe/src/test/java/com/baidu/palo/analysis/UseStmtTest.java b/fe/src/test/java/com/baidu/palo/analysis/UseStmtTest.java index bbf244e37f..aca4365086 100644 --- a/fe/src/test/java/com/baidu/palo/analysis/UseStmtTest.java +++ b/fe/src/test/java/com/baidu/palo/analysis/UseStmtTest.java @@ -21,7 +21,7 @@ package com.baidu.palo.analysis; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.MockedAuth; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.qe.ConnectContext; @@ -53,7 +53,7 @@ public class UseStmtTest { } @Test - public void testNormal() throws InternalException, AnalysisException { + public void testNormal() throws UserException, AnalysisException { UseStmt stmt = new UseStmt("testDb"); stmt.analyze(analyzer); @@ -62,7 +62,7 @@ public class UseStmtTest { } @Test(expected = AnalysisException.class) - public void testNoDb() throws InternalException, AnalysisException { + public void testNoDb() throws UserException, AnalysisException { UseStmt stmt = new UseStmt(""); stmt.analyze(analyzer); diff --git a/fe/src/test/java/com/baidu/palo/backup/BackupJobTest.java b/fe/src/test/java/com/baidu/palo/backup/BackupJobTest.java index 9a5f27d4ef..b31a92ce45 100644 --- a/fe/src/test/java/com/baidu/palo/backup/BackupJobTest.java +++ b/fe/src/test/java/com/baidu/palo/backup/BackupJobTest.java @@ -26,7 +26,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Database; import com.baidu.palo.catalog.OlapTable; import com.baidu.palo.common.Config; -import com.baidu.palo.common.FeMetaVersion; +import com.baidu.palo.common.FeConstants; import com.baidu.palo.common.util.UnitTestUtil; import com.baidu.palo.persist.EditLog; import com.baidu.palo.task.AgentBatchTask; @@ -60,10 +60,11 @@ import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicLong; -import mockit.Expectations; +import mockit.Delegate; import mockit.Mock; import mockit.MockUp; import mockit.Mocked; +import mockit.NonStrictExpectations; import mockit.internal.startup.Startup; public class BackupJobTest { @@ -118,62 +119,62 @@ public class BackupJobTest { @Before public void setUp() { - new MockUp() { - @Mock - public BackupHandler getBackupHandler() { - return backupHandler; - } - @Mock - public Database getDb(long dbId) { - return db; - } + db = UnitTestUtil.createDb(dbId, tblId, partId, idxId, tabletId, backendId, version, versionHash); - @Mock - public int getCurrentCatalogJournalVersion() { - return FeMetaVersion.VERSION_42; - } + new NonStrictExpectations() { + { + catalog.getBackupHandler(); + result = backupHandler; - @Mock - public long getNextId() { - return id.getAndIncrement(); - } + catalog.getDb(anyLong); + result = db; - @Mock - public EditLog getEditLog() { - return editLog; + Catalog.getCurrentCatalogJournalVersion(); + result = FeConstants.meta_version; + + catalog.getNextId(); + result = id.getAndIncrement(); + + catalog.getEditLog(); + result = editLog; } }; - new MockUp() { - @Mock - public RepositoryMgr getRepoMgr() { - return repoMgr; + new NonStrictExpectations() { + { + backupHandler.getRepoMgr(); + result = repoMgr; } }; - new MockUp() { - @Mock - public Repository getRepo(long repoId) { - return repo; + new NonStrictExpectations() { + { + 
repoMgr.getRepo(anyInt); + result = repo; + minTimes = 0; } }; - new MockUp() { - @Mock - public void logBackupJob(BackupJob job) { - System.out.println("log backup job: " + job); + new NonStrictExpectations() { + { + editLog.logBackupJob((BackupJob) any); + result = new Delegate() { + public void logBackupJob(BackupJob job) { + System.out.println("log backup job: " + job); + } + }; } }; new MockUp() { @Mock public void submit(AgentBatchTask task) { - return; + } }; - new Expectations(Repository.class) { + new NonStrictExpectations(Repository.class) { { repo.upload(anyString, anyString); minTimes = 0; @@ -181,7 +182,6 @@ public class BackupJobTest { } }; - db = UnitTestUtil.createDb(dbId, tblId, partId, idxId, tabletId, backendId, version, versionHash); List tableRefs = Lists.newArrayList(); tableRefs.add(new TableRef(new TableName(UnitTestUtil.DB_NAME, UnitTestUtil.TABLE_NAME), null)); job = new BackupJob("label", dbId, UnitTestUtil.DB_NAME, tableRefs, 13600 * 1000, catalog, repo.getId()); diff --git a/fe/src/test/java/com/baidu/palo/backup/RestoreJobTest.java b/fe/src/test/java/com/baidu/palo/backup/RestoreJobTest.java index 1611ab8a44..0aeab2ec19 100644 --- a/fe/src/test/java/com/baidu/palo/backup/RestoreJobTest.java +++ b/fe/src/test/java/com/baidu/palo/backup/RestoreJobTest.java @@ -13,13 +13,11 @@ import com.baidu.palo.catalog.Partition; import com.baidu.palo.catalog.Table; import com.baidu.palo.catalog.Tablet; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.FeMetaVersion; +import com.baidu.palo.common.FeConstants; import com.baidu.palo.common.MarkedCountDownLatch; import com.baidu.palo.persist.EditLog; import com.baidu.palo.system.SystemInfoService; -import com.baidu.palo.task.AgentBatchTask; import com.baidu.palo.task.AgentTask; -import com.baidu.palo.task.AgentTaskExecutor; import com.baidu.palo.task.AgentTaskQueue; import com.baidu.palo.task.DirMoveTask; import com.baidu.palo.task.DownloadTask; @@ -35,6 +33,7 @@ import com.google.common.collect.Maps; import org.junit.Assert; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import java.util.List; @@ -45,6 +44,7 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.zip.Adler32; import mockit.Delegate; +import mockit.Injectable; import mockit.Mock; import mockit.MockUp; import mockit.Mocked; @@ -63,6 +63,7 @@ public class RestoreJobTest { private OlapTable expectedRestoreTbl; private long repoId = 20000; + @Mocked private Catalog catalog; @Mocked @@ -74,6 +75,7 @@ public class RestoreJobTest { @Mocked private SystemInfoService systemInfoService; + @Injectable private Repository repo = new Repository(repoId, "repo", false, "bos://my_repo", new BlobStorage("broker", Maps.newHashMap())); @@ -85,29 +87,30 @@ public class RestoreJobTest { @Before public void setUp() throws AnalysisException { - + db = CatalogMocker.mockDb(); + new NonStrictExpectations() { { catalog.getBackupHandler(); result = backupHandler; - + catalog.getDb(anyLong); result = db; - + Catalog.getCurrentCatalogJournalVersion(); - result = FeMetaVersion.VERSION_42; - + result = FeConstants.meta_version; + catalog.getNextId(); result = id.getAndIncrement(); - + catalog.getEditLog(); - result = catalog; - + result = editLog; + Catalog.getCurrentSystemInfo(); result = systemInfoService; } }; - + new NonStrictExpectations() { { systemInfoService.seqChooseBackendIds(anyInt, anyBoolean, anyBoolean, anyString); @@ -123,21 +126,22 @@ public class RestoreJobTest { }; } }; - + new NonStrictExpectations() { { 
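The BackupJobTest setUp() above (and the RestoreJobTest setUp() that follows) moves from MockUp/@Mock partial mocks to recorded JMockit NonStrictExpectations, using result = ... for canned return values, a Delegate where the stub needs a side effect, and minTimes = 0 for calls that may never happen. A condensed sketch of the same pattern, using collaborators this patch already mocks (it assumes the JMockit 1.x version targeted here, which still ships NonStrictExpectations, and a JMockit-enabled test runner):

// Condensed sketch of the recorded-expectations style used in the rewritten setUp().
import com.baidu.palo.backup.BackupJob;
import com.baidu.palo.catalog.Catalog;
import com.baidu.palo.catalog.Database;
import com.baidu.palo.persist.EditLog;

import mockit.Delegate;
import mockit.Mocked;
import mockit.NonStrictExpectations;

public class RecordedExpectationsSketch {
    @Mocked
    Catalog catalog;
    @Mocked
    EditLog editLog;

    void record(final Database db) {
        new NonStrictExpectations() {
            {
                catalog.getDb(anyLong);           // any db id returns the prepared db
                result = db;

                catalog.getEditLog();
                result = editLog;

                editLog.logBackupJob((BackupJob) any);
                result = new Delegate() {          // side effect instead of a fixed value
                    public void logBackupJob(BackupJob job) {
                        System.out.println("log backup job: " + job);
                    }
                };
            }
        };
    }
}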
backupHandler.getRepoMgr(); result = repoMgr; } }; - + new NonStrictExpectations() { { repoMgr.getRepo(anyInt); result = repo; + minTimes = 0; } }; - + new NonStrictExpectations() { { editLog.logBackupJob((BackupJob) any); @@ -148,22 +152,12 @@ public class RestoreJobTest { }; } }; - - new NonStrictExpectations() { - { - AgentTaskExecutor.submit((AgentBatchTask) any); - result = new Delegate() { - public void submit(AgentBatchTask task) { - return; - } - }; - } - }; - + new NonStrictExpectations() { { repo.upload(anyString, anyString); result = Status.OK; + minTimes = 0; List backupMetas = Lists.newArrayList(); repo.getSnapshotMetaFile(label, backupMetas); @@ -182,9 +176,7 @@ public class RestoreJobTest { return true; } }; - - db = CatalogMocker.mockDb(); - + // gen BackupJobInfo jobInfo = new BackupJobInfo(); jobInfo.backupTime = System.currentTimeMillis(); @@ -198,7 +190,7 @@ public class RestoreJobTest { tblInfo.id = CatalogMocker.TEST_TBL2_ID; tblInfo.name = CatalogMocker.TEST_TBL2_NAME; jobInfo.tables.put(tblInfo.name, tblInfo); - + for (Partition partition : expectedRestoreTbl.getPartitions()) { BackupPartitionInfo partInfo = new BackupPartitionInfo(); partInfo.id = partition.getId(); @@ -222,18 +214,19 @@ public class RestoreJobTest { } } } - + // drop this table, cause we want to try restoring this table db.dropTable(expectedRestoreTbl.getName()); job = new RestoreJob(label, "2018-01-01 01:01:01", db.getId(), db.getFullName(), jobInfo, false, 3, 100000, catalog, repo.getId()); - + List tbls = Lists.newArrayList(); tbls.add(expectedRestoreTbl); backupMeta = new BackupMeta(tbls); } + @Ignore @Test public void testRun() { // pending @@ -354,8 +347,10 @@ public class RestoreJobTest { OlapTable tbl = (OlapTable) db.getTable(CatalogMocker.TEST_TBL_NAME); List partNames = Lists.newArrayList(tbl.getPartitionNames()); + System.out.println(partNames); System.out.println("tbl signature: " + tbl.getSignature(BackupHandler.SIGNATURE_VERSION, partNames)); tbl.setName("newName"); + partNames = Lists.newArrayList(tbl.getPartitionNames()); System.out.println("tbl signature: " + tbl.getSignature(BackupHandler.SIGNATURE_VERSION, partNames)); } diff --git a/fe/src/test/java/com/baidu/palo/bdb/BDBToolTest.java b/fe/src/test/java/com/baidu/palo/bdb/BDBToolTest.java index ee0b8ed3aa..8b989f523d 100644 --- a/fe/src/test/java/com/baidu/palo/bdb/BDBToolTest.java +++ b/fe/src/test/java/com/baidu/palo/bdb/BDBToolTest.java @@ -57,7 +57,8 @@ public class BDBToolTest { } // write something - ReplicaPersistInfo info = ReplicaPersistInfo.createForAdd(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); + ReplicaPersistInfo info = ReplicaPersistInfo.createForAdd(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15); JournalEntity entity = new JournalEntity(); entity.setOpCode(OperationType.OP_ADD_REPLICA); entity.setData(info); diff --git a/fe/src/test/java/com/baidu/palo/catalog/CatalogTest.java b/fe/src/test/java/com/baidu/palo/catalog/CatalogTest.java index a59c40f733..7a36bdd74b 100644 --- a/fe/src/test/java/com/baidu/palo/catalog/CatalogTest.java +++ b/fe/src/test/java/com/baidu/palo/catalog/CatalogTest.java @@ -217,7 +217,7 @@ public class CatalogTest { catalog.addCluster(cluster); catalog.unprotectCreateDb(db1); - SchemaChangeJob job1 = new SchemaChangeJob(db1.getId(), table.getId(), null, table.getName()); + SchemaChangeJob job1 = new SchemaChangeJob(db1.getId(), table.getId(), null, table.getName(), -1); catalog.getSchemaChangeHandler().replayInitJob(job1, catalog); long checksum1 = catalog.saveAlterJob(dos, 0, 
JobType.SCHEMA_CHANGE); diff --git a/fe/src/test/java/com/baidu/palo/catalog/CreateTableTest.java b/fe/src/test/java/com/baidu/palo/catalog/CreateTableTest.java index 28b13fff8e..70c801e43a 100644 --- a/fe/src/test/java/com/baidu/palo/catalog/CreateTableTest.java +++ b/fe/src/test/java/com/baidu/palo/catalog/CreateTableTest.java @@ -26,6 +26,7 @@ import com.baidu.palo.analysis.KeysDesc; import com.baidu.palo.analysis.RandomDistributionDesc; import com.baidu.palo.analysis.TableName; import com.baidu.palo.common.DdlException; +import com.baidu.palo.common.FeMetaVersion; import com.baidu.palo.system.Backend; import com.baidu.palo.system.SystemInfoService; @@ -262,6 +263,7 @@ public class CreateTableTest { EasyMock.expect(Catalog.getInstance()).andReturn(catalog).anyTimes(); EasyMock.expect(Catalog.getCurrentSystemInfo()).andReturn(systemInfoService).anyTimes(); EasyMock.expect(Catalog.getCurrentInvertedIndex()).andReturn(invertedIndex).anyTimes(); + EasyMock.expect(Catalog.getCurrentCatalogJournalVersion()).andReturn(FeMetaVersion.VERSION_45).anyTimes(); EasyMock.expect(Catalog.isCheckpointThread()).andReturn(false).anyTimes(); EasyMock.expect(Catalog.calcShortKeyColumnCount(EasyMock.anyObject(List.class), EasyMock.anyObject(Map.class))) .andReturn((short) 2).anyTimes(); diff --git a/fe/src/test/java/com/baidu/palo/catalog/DatabaseTest.java b/fe/src/test/java/com/baidu/palo/catalog/DatabaseTest.java index 4c260f76de..43c4185544 100644 --- a/fe/src/test/java/com/baidu/palo/catalog/DatabaseTest.java +++ b/fe/src/test/java/com/baidu/palo/catalog/DatabaseTest.java @@ -80,8 +80,6 @@ public class DatabaseTest { db.readLock(); try { Assert.assertFalse(db.tryWriteLock(0, TimeUnit.SECONDS)); - Assert.assertTrue(db.tryReadLock(0, TimeUnit.SECONDS)); - db.readUnlock(); } finally { db.readUnlock(); } @@ -89,7 +87,6 @@ public class DatabaseTest { db.writeLock(); try { Assert.assertTrue(db.tryWriteLock(0, TimeUnit.SECONDS)); - Assert.assertTrue(db.tryReadLock(0, TimeUnit.SECONDS)); } finally { db.writeUnlock(); } @@ -178,7 +175,6 @@ public class DatabaseTest { db2.createTable(table); db2.write(dos); - dos.flush(); dos.close(); diff --git a/fe/src/test/java/com/baidu/palo/catalog/ReplicaTest.java b/fe/src/test/java/com/baidu/palo/catalog/ReplicaTest.java index 3668d38fb4..0d88dadc4b 100644 --- a/fe/src/test/java/com/baidu/palo/catalog/ReplicaTest.java +++ b/fe/src/test/java/com/baidu/palo/catalog/ReplicaTest.java @@ -20,7 +20,10 @@ package com.baidu.palo.catalog; +import static org.junit.Assert.assertEquals; + import com.baidu.palo.catalog.Replica.ReplicaState; +import com.baidu.palo.common.FeMetaVersion; import org.junit.Assert; import org.junit.Before; @@ -34,8 +37,15 @@ import java.io.FileOutputStream; import java.util.ArrayList; import java.util.List; +import mockit.Mocked; +import mockit.NonStrictExpectations; + public class ReplicaTest { + // replica serialize and deserialize test will use catalog so that it should be mocked + @Mocked + Catalog catalog; + private Replica replica; private long replicaId; private long backendId; @@ -44,6 +54,7 @@ public class ReplicaTest { private long dataSize; private long rowCount; + @Before public void setUp() { replicaId = 10000; @@ -52,7 +63,7 @@ public class ReplicaTest { versionHash = 98765; dataSize = 9999; rowCount = 1024; - replica = new Replica(replicaId, backendId, version, versionHash, dataSize, rowCount, ReplicaState.NORMAL); + replica = new Replica(replicaId, backendId, version, versionHash, dataSize, rowCount, ReplicaState.NORMAL, 0, 0, version, 
versionHash); } @Test @@ -80,27 +91,16 @@ public class ReplicaTest { Assert.assertFalse(replica.checkVersionCatchUp(newVersion, 76543)); Assert.assertTrue(replica.checkVersionCatchUp(newVersion, newVersionHash)); } - - @Test - public void toStringTest() { - StringBuffer strBuffer = new StringBuffer("replicaId="); - strBuffer.append(replicaId); - strBuffer.append(", BackendId="); - strBuffer.append(backendId); - strBuffer.append(", version="); - strBuffer.append(version); - strBuffer.append(", versionHash="); - strBuffer.append(versionHash); - strBuffer.append(", dataSize="); - strBuffer.append(dataSize); - strBuffer.append(", rowCount="); - strBuffer.append(rowCount); - - Assert.assertEquals(strBuffer.toString(), replica.toString()); - } @Test public void testSerialization() throws Exception { + new NonStrictExpectations() { + { + Catalog.getCurrentCatalogJournalVersion(); + result = FeMetaVersion.VERSION_45; + } + }; + // 1. Write objects to file File file = new File("./olapReplicaTest"); file.createNewFile(); @@ -110,7 +110,7 @@ public class ReplicaTest { List list2 = new ArrayList(); for (int count = 0; count < 10; ++count) { Replica olapReplica = new Replica(100L * count, 100L * count, 100L * count, 100L * count, - 100L * count, 100 * count, ReplicaState.NORMAL); + 100L * count, 100 * count, ReplicaState.NORMAL, 0, 0, 100L * count, 100L * count); list1.add(olapReplica); olapReplica.write(dos); } @@ -151,5 +151,107 @@ public class ReplicaTest { dis.close(); file.delete(); } + + @Test + public void testUpdateVersion1() { + Replica originalReplica = new Replica(10000, 20000, 3, 1231, 100, 78, ReplicaState.NORMAL, 0, 0, 3, 1231); + // new version is little than original version, it is invalid the version will not update + originalReplica.updateInfo(2, 111, 100, 78); + assertEquals(3, originalReplica.getVersion()); + assertEquals(1231, originalReplica.getVersionHash()); + } + + @Test + public void testUpdateVersion2() { + Replica originalReplica = new Replica(10000, 20000, 3, 1231, 100, 78, ReplicaState.NORMAL, 0, 0, 0, 0); + originalReplica.updateInfo(3, 111, 100, 78); + // if new version >= current version and last success version <= new version, then last success version should be updated + assertEquals(3, originalReplica.getLastSuccessVersion()); + assertEquals(111, originalReplica.getLastSuccessVersionHash()); + assertEquals(3, originalReplica.getVersion()); + assertEquals(111, originalReplica.getVersionHash()); + } + + @Test + public void testUpdateVersion3() { + // version(3) ---> last failed version (8) ---> last success version(10) + Replica originalReplica = new Replica(10000, 20000, 3, 111, 100, 78, ReplicaState.NORMAL, 0, 0, 0, 0); + originalReplica.updateLastFailedVersion(8, 100); + assertEquals(3, originalReplica.getLastSuccessVersion()); + assertEquals(111, originalReplica.getLastSuccessVersionHash()); + assertEquals(3, originalReplica.getVersion()); + assertEquals(111, originalReplica.getVersionHash()); + assertEquals(8, originalReplica.getLastFailedVersion()); + assertEquals(100, originalReplica.getLastFailedVersionHash()); + + // update last success version 10 + originalReplica.updateVersionInfo(originalReplica.getVersion(), + originalReplica.getVersionHash(), originalReplica.getLastFailedVersion(), + originalReplica.getLastFailedVersionHash(), + 10, 1210); + assertEquals(10, originalReplica.getLastSuccessVersion()); + assertEquals(1210, originalReplica.getLastSuccessVersionHash()); + assertEquals(3, originalReplica.getVersion()); + assertEquals(111, 
originalReplica.getVersionHash()); + assertEquals(8, originalReplica.getLastFailedVersion()); + assertEquals(100, originalReplica.getLastFailedVersionHash()); + + // update version to 8, the last success version and version should be 10 + originalReplica.updateInfo(8, 100, 100, 78); + assertEquals(10, originalReplica.getLastSuccessVersion()); + assertEquals(1210, originalReplica.getLastSuccessVersionHash()); + assertEquals(10, originalReplica.getVersion()); + assertEquals(1210, originalReplica.getVersionHash()); + assertEquals(-1, originalReplica.getLastFailedVersion()); + assertEquals(0, originalReplica.getLastFailedVersionHash()); + + // update last failed version to 12 + originalReplica.updateLastFailedVersion(12, 1212); + assertEquals(10, originalReplica.getLastSuccessVersion()); + assertEquals(1210, originalReplica.getLastSuccessVersionHash()); + assertEquals(10, originalReplica.getVersion()); + assertEquals(1210, originalReplica.getVersionHash()); + assertEquals(12, originalReplica.getLastFailedVersion()); + assertEquals(1212, originalReplica.getLastFailedVersionHash()); + + // update last success version to 15 + originalReplica.updateVersionInfo(originalReplica.getVersion(), + originalReplica.getVersionHash(), originalReplica.getLastFailedVersion(), + originalReplica.getLastFailedVersionHash(), + 15, 1215); + assertEquals(15, originalReplica.getLastSuccessVersion()); + assertEquals(1215, originalReplica.getLastSuccessVersionHash()); + assertEquals(10, originalReplica.getVersion()); + assertEquals(1210, originalReplica.getVersionHash()); + assertEquals(12, originalReplica.getLastFailedVersion()); + assertEquals(1212, originalReplica.getLastFailedVersionHash()); + + // update last failed version to 18 + originalReplica.updateLastFailedVersion(18, 1218); + assertEquals(10, originalReplica.getLastSuccessVersion()); + assertEquals(1210, originalReplica.getLastSuccessVersionHash()); + assertEquals(10, originalReplica.getVersion()); + assertEquals(1210, originalReplica.getVersionHash()); + assertEquals(18, originalReplica.getLastFailedVersion()); + assertEquals(1218, originalReplica.getLastFailedVersionHash()); + + // update version to 17 then version and success version is 17 + originalReplica.updateInfo(17, 1217, 100, 78); + assertEquals(17, originalReplica.getLastSuccessVersion()); + assertEquals(1217, originalReplica.getLastSuccessVersionHash()); + assertEquals(17, originalReplica.getVersion()); + assertEquals(1217, originalReplica.getVersionHash()); + assertEquals(18, originalReplica.getLastFailedVersion()); + assertEquals(1218, originalReplica.getLastFailedVersionHash()); + + // update version to 18, then version and last success version should be 18 and failed version should be -1 + originalReplica.updateInfo(18, 1218, 100, 78); + assertEquals(18, originalReplica.getLastSuccessVersion()); + assertEquals(1218, originalReplica.getLastSuccessVersionHash()); + assertEquals(18, originalReplica.getVersion()); + assertEquals(1218, originalReplica.getVersionHash()); + assertEquals(-1, originalReplica.getLastFailedVersion()); + assertEquals(0, originalReplica.getLastFailedVersionHash()); + } } diff --git a/fe/src/test/java/com/baidu/palo/catalog/TabletTest.java b/fe/src/test/java/com/baidu/palo/catalog/TabletTest.java index 7f45f1deb4..727affdb59 100644 --- a/fe/src/test/java/com/baidu/palo/catalog/TabletTest.java +++ b/fe/src/test/java/com/baidu/palo/catalog/TabletTest.java @@ -64,9 +64,9 @@ public class TabletTest { tablet = new Tablet(1); TabletMeta tabletMeta = new TabletMeta(10, 20, 30, 
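The new testUpdateVersion1-3 cases above pin down how a replica's version, lastSuccessVersion, and lastFailedVersion interact: stale reports are ignored, a successful report advances both version and lastSuccessVersion, recording a failure pulls lastSuccessVersion back to the applied version, and catching up to the failed version clears the failure and fast-forwards to the best known success. Below is a compact reconstruction of that rule derived only from the assertions (version hashes and the separate updateVersionInfo() path are omitted; this is not the actual Replica implementation):

// Reconstruction of the bookkeeping asserted by testUpdateVersion1-3.
class ReplicaVersionSketch {
    long version;
    long lastSuccessVersion;
    long lastFailedVersion = -1;   // -1 means "no known failed publish"

    ReplicaVersionSketch(long version) {
        this.version = version;
        this.lastSuccessVersion = version;
    }

    void updateLastFailedVersion(long failedVersion) {
        lastFailedVersion = failedVersion;
        // a newly recorded failure invalidates any success beyond the applied version
        lastSuccessVersion = version;
    }

    void updateVersion(long newVersion) {
        if (newVersion < version) {
            return;                            // stale report, ignore (testUpdateVersion1)
        }
        version = newVersion;
        if (lastFailedVersion >= 0 && newVersion >= lastFailedVersion) {
            // caught up past the failed publish: clear the failure and fast-forward
            // to whatever was already known to be successfully published
            lastFailedVersion = -1;
            if (lastSuccessVersion > version) {
                version = lastSuccessVersion;
            }
        }
        if (lastSuccessVersion < version) {
            lastSuccessVersion = version;      // testUpdateVersion2
        }
    }
}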
40, 1); invertedIndex.addTablet(1, tabletMeta); - replica1 = new Replica(1L, 1L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL); - replica2 = new Replica(2L, 2L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL); - replica3 = new Replica(3L, 3L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL); + replica1 = new Replica(1L, 1L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL, 0, 0, 0, 0); + replica2 = new Replica(2L, 2L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL, 0, 0, 0, 0); + replica3 = new Replica(3L, 3L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL, 0, 0, 0, 0); tablet.addReplica(replica1); tablet.addReplica(replica2); tablet.addReplica(replica3); @@ -134,9 +134,9 @@ public class TabletTest { Assert.assertFalse(rTablet1.equals(this)); Tablet tablet2 = new Tablet(1); - Replica replica1 = new Replica(1L, 1L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL); - Replica replica2 = new Replica(2L, 2L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL); - Replica replica3 = new Replica(3L, 3L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL); + Replica replica1 = new Replica(1L, 1L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL, 0, 0, 0, 0); + Replica replica2 = new Replica(2L, 2L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL, 0, 0, 0, 0); + Replica replica3 = new Replica(3L, 3L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL, 0, 0, 0, 0); tablet2.addReplica(replica1); tablet2.addReplica(replica2); Assert.assertFalse(tablet2.equals(tablet)); @@ -146,7 +146,7 @@ public class TabletTest { Tablet tablet3 = new Tablet(1); tablet3.addReplica(replica1); tablet3.addReplica(replica2); - tablet3.addReplica(new Replica(4L, 4L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL)); + tablet3.addReplica(new Replica(4L, 4L, 100L, 0L, 200000L, 3000L, ReplicaState.NORMAL, 0, 0, 0, 0)); Assert.assertFalse(tablet3.equals(tablet)); dis.close(); diff --git a/fe/src/test/java/com/baidu/palo/cluster/SystemInfoServiceTest.java b/fe/src/test/java/com/baidu/palo/cluster/SystemInfoServiceTest.java index 4db11b4707..714f9cf829 100644 --- a/fe/src/test/java/com/baidu/palo/cluster/SystemInfoServiceTest.java +++ b/fe/src/test/java/com/baidu/palo/cluster/SystemInfoServiceTest.java @@ -167,12 +167,6 @@ public class SystemInfoServiceTest { systemInfoService.validateHostAndPort(hostPort); } - @Test(expected = AnalysisException.class) - public void validHostAndPortTest2() throws Exception { - createHostAndPort(2); - systemInfoService.validateHostAndPort(hostPort); - } - @Test(expected = AnalysisException.class) public void validHostAndPortTest3() throws Exception { createHostAndPort(3); diff --git a/fe/src/test/java/com/baidu/palo/common/CIDRTest.java b/fe/src/test/java/com/baidu/palo/common/CIDRTest.java index 3ebf681f9e..1b0f62aa29 100644 --- a/fe/src/test/java/com/baidu/palo/common/CIDRTest.java +++ b/fe/src/test/java/com/baidu/palo/common/CIDRTest.java @@ -49,7 +49,7 @@ public class CIDRTest { } @Test - public void testNormal() throws InternalException { + public void testNormal() throws UserException { // the real value is 10.1.16.0/20 CIDR cidr = new CIDR("192.168.17.0/20"); Assert.assertEquals("192.168.17.0", cidr.getIP()); diff --git a/fe/src/test/java/com/baidu/palo/common/GenericPoolTest.java b/fe/src/test/java/com/baidu/palo/common/GenericPoolTest.java index 4b595ee42f..3dca1cc687 100644 --- a/fe/src/test/java/com/baidu/palo/common/GenericPoolTest.java +++ b/fe/src/test/java/com/baidu/palo/common/GenericPoolTest.java @@ -20,19 +20,6 @@ package com.baidu.palo.common; -import java.io.IOException; -import java.nio.ByteBuffer; -import 
java.util.ArrayList; -import java.util.List; - -import org.apache.commons.pool2.impl.GenericKeyedObjectPoolConfig; -import org.apache.thrift.TException; -import org.apache.thrift.TProcessor; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.BeforeClass; -import org.junit.Test; - import com.baidu.palo.thrift.BackendService; import com.baidu.palo.thrift.PaloInternalServiceVersion; import com.baidu.palo.thrift.TAgentPublishRequest; @@ -57,10 +44,24 @@ import com.baidu.palo.thrift.TPullLoadSubTaskInfo; import com.baidu.palo.thrift.TResultBatch; import com.baidu.palo.thrift.TSnapshotRequest; import com.baidu.palo.thrift.TStatus; +import com.baidu.palo.thrift.TTabletStatResult; import com.baidu.palo.thrift.TTransmitDataParams; import com.baidu.palo.thrift.TTransmitDataResult; import com.baidu.palo.thrift.TUniqueId; +import org.apache.commons.pool2.impl.GenericKeyedObjectPoolConfig; +import org.apache.thrift.TException; +import org.apache.thrift.TProcessor; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + public class GenericPoolTest { static GenericPool backendService; static ThriftServer service; @@ -210,6 +211,12 @@ public class GenericPoolTest { // TODO Auto-generated method stub return null; } + + @Override + public TTabletStatResult get_tablet_stat() throws TException { + // TODO Auto-generated method stub + return null; + } } @Test diff --git a/fe/src/test/java/com/baidu/palo/common/MarkDownParserTest.java b/fe/src/test/java/com/baidu/palo/common/MarkDownParserTest.java index c4beffd59c..189cd26b43 100644 --- a/fe/src/test/java/com/baidu/palo/common/MarkDownParserTest.java +++ b/fe/src/test/java/com/baidu/palo/common/MarkDownParserTest.java @@ -30,7 +30,7 @@ import java.util.Map; public class MarkDownParserTest { @Test - public void testNormal() throws InternalException { + public void testNormal() throws UserException { List lines = Lists.newArrayList(); lines.add("# SHOW TABLES"); lines.add("## name"); @@ -58,7 +58,7 @@ public class MarkDownParserTest { } @Test - public void testMultiDoc() throws InternalException { + public void testMultiDoc() throws UserException { List lines = Lists.newArrayList(); lines.add(" name"); lines.add("# SHOW TABLES"); @@ -88,7 +88,7 @@ public class MarkDownParserTest { } @Test - public void testNoDoc() throws InternalException { + public void testNoDoc() throws UserException { List lines = Lists.newArrayList(); lines.add(" SHOW TABLES"); lines.add(" name"); @@ -111,8 +111,8 @@ public class MarkDownParserTest { Assert.assertNull(map.get("DATABASES abc")); } - @Test(expected = InternalException.class) - public void testNoFirst() throws InternalException { + @Test(expected = UserException.class) + public void testNoFirst() throws UserException { List lines = Lists.newArrayList(); lines.add("## SHOW TABLES"); MarkDownParser parser = new MarkDownParser(lines); @@ -120,8 +120,8 @@ public class MarkDownParserTest { Assert.fail("No exception throws."); } - @Test(expected = InternalException.class) - public void testErrorState() throws InternalException { + @Test(expected = UserException.class) + public void testErrorState() throws UserException { List lines = Lists.newArrayList(); lines.add("# SHOW TABLES"); lines.add("## name"); @@ -132,7 +132,7 @@ public class MarkDownParserTest { } @Test - public void testEmptyTitle() throws InternalException { + public void 
testEmptyTitle() throws UserException { List lines = Lists.newArrayList(); lines.add("#"); lines.add("## "); @@ -155,7 +155,7 @@ public class MarkDownParserTest { } @Test - public void testOneName() throws InternalException { + public void testOneName() throws UserException { List lines = Lists.newArrayList(); lines.add("# TABLES"); lines.add("# TABLE"); diff --git a/fe/src/test/java/com/baidu/palo/common/util/UnitTestUtil.java b/fe/src/test/java/com/baidu/palo/common/util/UnitTestUtil.java index 2f672b1345..bf66a83557 100644 --- a/fe/src/test/java/com/baidu/palo/common/util/UnitTestUtil.java +++ b/fe/src/test/java/com/baidu/palo/common/util/UnitTestUtil.java @@ -21,7 +21,6 @@ package com.baidu.palo.common.util; import com.baidu.palo.catalog.AggregateType; -import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.Column; import com.baidu.palo.catalog.ColumnType; import com.baidu.palo.catalog.DataProperty; @@ -66,13 +65,13 @@ public class UnitTestUtil { public static Database createDb(long dbId, long tableId, long partitionId, long indexId, long tabletId, long backendId, long version, long versionHash) { - Catalog.getCurrentInvertedIndex().clear(); + // Catalog.getCurrentInvertedIndex().clear(); // replica long replicaId = 0; - Replica replica1 = new Replica(replicaId, backendId, version, versionHash, 0L, 0L, ReplicaState.NORMAL); - Replica replica2 = new Replica(replicaId + 1, backendId + 1, version, versionHash, 0L, 0L, ReplicaState.NORMAL); - Replica replica3 = new Replica(replicaId + 2, backendId + 2, version, versionHash, 0L, 0L, ReplicaState.NORMAL); + Replica replica1 = new Replica(replicaId, backendId, ReplicaState.NORMAL, version, versionHash); + Replica replica2 = new Replica(replicaId + 1, backendId + 1, ReplicaState.NORMAL, version, versionHash); + Replica replica3 = new Replica(replicaId + 2, backendId + 2, ReplicaState.NORMAL, version, versionHash); // tablet Tablet tablet = new Tablet(tabletId); diff --git a/fe/src/test/java/com/baidu/palo/load/LoadCheckerTest.java b/fe/src/test/java/com/baidu/palo/load/LoadCheckerTest.java index 3852632d3c..862674c8a7 100644 --- a/fe/src/test/java/com/baidu/palo/load/LoadCheckerTest.java +++ b/fe/src/test/java/com/baidu/palo/load/LoadCheckerTest.java @@ -28,6 +28,7 @@ import com.baidu.palo.catalog.Replica; import com.baidu.palo.catalog.Tablet; import com.baidu.palo.common.Config; import com.baidu.palo.common.util.UnitTestUtil; +import com.baidu.palo.load.FailMsg.CancelType; import com.baidu.palo.load.LoadJob.JobState; import com.baidu.palo.persist.EditLog; import com.baidu.palo.task.AgentTaskQueue; @@ -264,8 +265,10 @@ public class LoadCheckerTest { // mock load load = EasyMock.createMock(Load.class); - EasyMock.expect(load.getLoadJobs(JobState.LOADING)).andReturn(etlJobs).times(2); - EasyMock.expect(load.updateLoadJobState(job, JobState.QUORUM_FINISHED)).andReturn(true).times(1); + EasyMock.expect(load.getLoadJobs(JobState.LOADING)).andReturn(etlJobs).anyTimes(); + EasyMock.expect(load.updateLoadJobState(job, JobState.QUORUM_FINISHED)).andReturn(true).anyTimes(); + EasyMock.expect(load.cancelLoadJob((LoadJob) EasyMock.anyObject(), (CancelType) EasyMock.anyObject(), + EasyMock.anyString())).andReturn(true).anyTimes(); EasyMock.replay(load); EasyMock.expect(catalog.getLoadInstance()).andReturn(load).times(4); EasyMock.replay(catalog); @@ -279,7 +282,7 @@ public class LoadCheckerTest { Map checkers = (Map) checkersField.get(LoadChecker.class); Method runLoadingJobs = UnitTestUtil.getPrivateMethod(LoadChecker.class, 
"runLoadingJobs", new Class[] {}); runLoadingJobs.invoke(checkers.get(JobState.LOADING), new Object[] {}); - Assert.assertEquals(replicaNum, AgentTaskQueue.getTaskNum()); + Assert.assertEquals(0, AgentTaskQueue.getTaskNum()); // update replica to new version for (MaterializedIndex olapIndex : partition.getMaterializedIndices()) { diff --git a/fe/src/test/java/com/baidu/palo/load/LoadTest.java b/fe/src/test/java/com/baidu/palo/load/LoadTest.java deleted file mode 100644 index 670fee6c95..0000000000 --- a/fe/src/test/java/com/baidu/palo/load/LoadTest.java +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package com.baidu.palo.load; - -import com.baidu.palo.analysis.CancelLoadStmt; -import com.baidu.palo.analysis.ColumnSeparator; -import com.baidu.palo.analysis.DataDescription; -import com.baidu.palo.analysis.DeleteStmt; -import com.baidu.palo.analysis.LabelName; -import com.baidu.palo.analysis.LoadStmt; -import com.baidu.palo.analysis.Predicate; -import com.baidu.palo.catalog.Catalog; -import com.baidu.palo.catalog.Database; -import com.baidu.palo.catalog.MaterializedIndex; -import com.baidu.palo.catalog.OlapTable; -import com.baidu.palo.catalog.Partition; -import com.baidu.palo.catalog.Replica; -import com.baidu.palo.catalog.Tablet; -import com.baidu.palo.common.Config; -import com.baidu.palo.common.DdlException; -import com.baidu.palo.common.MarkedCountDownLatch; -import com.baidu.palo.common.Pair; -import com.baidu.palo.common.util.UnitTestUtil; -import com.baidu.palo.load.FailMsg.CancelType; -import com.baidu.palo.load.LoadJob.EtlJobType; -import com.baidu.palo.load.LoadJob.JobState; -import com.baidu.palo.metric.MetricRepo; -import com.baidu.palo.mysql.privilege.PaloAuth; -import com.baidu.palo.mysql.privilege.PrivPredicate; -import com.baidu.palo.persist.EditLog; -import com.baidu.palo.qe.ConnectContext; -import com.baidu.palo.qe.QueryState; -import com.baidu.palo.qe.SessionVariable; -import com.baidu.palo.system.Backend; -import com.baidu.palo.system.SystemInfoService; - -import com.google.common.collect.Lists; - -import org.easymock.EasyMock; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.powermock.api.easymock.PowerMock; -import org.powermock.core.classloader.annotations.PrepareForTest; -import org.powermock.modules.junit4.PowerMockRunner; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.concurrent.TimeUnit; - -@RunWith(PowerMockRunner.class) -@PrepareForTest({ Load.class, Catalog.class, ConnectContext.class, SystemInfoService.class }) 
-public class LoadTest { - private long dbId; - private long tableId; - private long partitionId; - private long indexId; - private long tabletId; - private long backendId; - - private String label; - private String columnSeparator; - private List filePathes; - - private Load load; - private Database db; - - private ConnectContext connectContext; - - @BeforeClass - public static void start() { - MetricRepo.init(); - } - - @Before - public void setUp() throws DdlException { - dbId = 0L; - tableId = 0L; - partitionId = 0L; - indexId = 0L; - tabletId = 0L; - backendId = 0L; - - label = "test_label"; - columnSeparator = "\t"; - filePathes = new ArrayList(); - filePathes.add("test_path"); - - load = new Load(); - Config.load_running_job_num_limit = 0; - - // dpp configs - UnitTestUtil.initDppConfig(); - - // mock catalog - db = UnitTestUtil.createDb(dbId, tableId, partitionId, indexId, tabletId, backendId, 1L, 0L); - Catalog catalog = EasyMock.createNiceMock(Catalog.class); - EasyMock.expect(catalog.getDb(dbId)).andReturn(db).anyTimes(); - EasyMock.expect(catalog.getDb(db.getFullName())).andReturn(db).anyTimes(); - // mock editLog - EditLog editLog = EasyMock.createMock(EditLog.class); - EasyMock.expect(catalog.getEditLog()).andReturn(editLog).anyTimes(); - // mock auth - PaloAuth auth = EasyMock.createNiceMock(PaloAuth.class); - EasyMock.expect(auth.getLoadClusterInfo(EasyMock.anyString(), EasyMock.anyString())) - .andReturn(Pair.create("cluster", new DppConfig())).anyTimes(); - EasyMock.expect(auth.checkTblPriv(EasyMock.isA(ConnectContext.class), EasyMock.anyString(), - EasyMock.anyString(), EasyMock.isA(PrivPredicate.class))) - .andReturn(true).anyTimes(); - EasyMock.expect(catalog.getAuth()).andReturn(auth).anyTimes(); - EasyMock.replay(auth); - - // mock backend - Backend backend = EasyMock.createMock(Backend.class); - EasyMock.expect(backend.isAlive()).andReturn(true).anyTimes(); - EasyMock.replay(backend); - EasyMock.replay(catalog); - - // SystemInfoService - SystemInfoService systemInfoService = EasyMock.createMock(SystemInfoService.class); - EasyMock.expect(systemInfoService.checkBackendAvailable(EasyMock.anyLong())).andReturn(true).anyTimes(); - systemInfoService.checkClusterCapacity(EasyMock.anyString()); - EasyMock.expectLastCall().anyTimes(); - EasyMock.replay(systemInfoService); - - // mock static getInstance - PowerMock.mockStatic(Catalog.class); - EasyMock.expect(Catalog.getInstance()).andReturn(catalog).anyTimes(); - EasyMock.expect(Catalog.getCurrentCatalog()).andReturn(catalog).anyTimes(); - EasyMock.expect(Catalog.getCurrentSystemInfo()).andReturn(systemInfoService).anyTimes(); - PowerMock.replay(Catalog.class); - - QueryState state = new QueryState(); - connectContext = EasyMock.createMock(ConnectContext.class); - EasyMock.expect(connectContext.toResourceCtx()).andReturn(null).anyTimes(); - EasyMock.expect(connectContext.getSessionVariable()).andReturn(new SessionVariable()).anyTimes(); - EasyMock.expect(connectContext.getQualifiedUser()).andReturn("root").anyTimes(); - EasyMock.expect(connectContext.getRemoteIP()).andReturn("192.168.1.1").anyTimes(); - EasyMock.expect(connectContext.getState()).andReturn(state).anyTimes(); - EasyMock.replay(connectContext); - - PowerMock.mockStatic(ConnectContext.class); - EasyMock.expect(ConnectContext.get()).andReturn(connectContext).anyTimes(); - PowerMock.replay(ConnectContext.class); - } - - private void addLoadJob(String label) throws DdlException { - LabelName labelName = new LabelName(db.getFullName(), label); - List 
dataDescriptions = new ArrayList(); - DataDescription dataDescription = new DataDescription(UnitTestUtil.TABLE_NAME, - Lists.newArrayList(UnitTestUtil.PARTITION_NAME), filePathes, null, new ColumnSeparator(columnSeparator), - false, null); - dataDescriptions.add(dataDescription); - LoadStmt stmt = new LoadStmt(labelName, dataDescriptions, null, null, null); - load.addLoadJob(stmt, EtlJobType.HADOOP, 0); - } - - @Test - public void testAddAndGetLoadJob() throws DdlException { - // add load job success - addLoadJob(label); - - // verify - // getDbLoadJobs - List dbLoadJobs = load.getDbLoadJobs(dbId); - Assert.assertEquals(1, dbLoadJobs.size()); - LoadJob job = dbLoadJobs.get(0); - Assert.assertEquals("cluster", job.getHadoopCluster()); - Assert.assertEquals(Config.hadoop_load_default_timeout_second, job.getTimeoutSecond()); - - // getLoadJobNumber - Assert.assertEquals(1, load.getLoadJobNumber()); - - // getIdToLoadJob - Assert.assertEquals(1, load.getIdToLoadJob().size()); - - // getDbToLoadJobs - Map> dbToLoadJobs = load.getDbToLoadJobs(); - Assert.assertEquals(1, dbToLoadJobs.get(dbId).size()); - - // getLoadJobs - Assert.assertEquals(1, load.getLoadJobs(JobState.PENDING).size()); - - // getLoadJob - Assert.assertEquals(job, load.getLoadJob(job.getId())); - - // getLoadJobInfosByDb - Assert.assertEquals(1, load.getLoadJobInfosByDb(db.getId(), db.getFullName(), null, false, null, null).size()); - } - - @Test - public void testCancelLoadJob() throws Exception { - // add load job success - addLoadJob(label); - - // mock dppscheduler - DppScheduler dppScheduler = EasyMock.createMock(DppScheduler.class); - dppScheduler.deleteEtlOutputPath(EasyMock.anyString()); - EasyMock.expectLastCall().times(1); - EasyMock.replay(dppScheduler); - PowerMock.expectNew(DppScheduler.class, EasyMock.anyString()).andReturn(dppScheduler).times(1); - PowerMock.replay(DppScheduler.class); - - // cancel success - CancelLoadStmt cancelStmt = EasyMock.createMock(CancelLoadStmt.class); - EasyMock.expect(cancelStmt.getDbName()).andReturn(db.getFullName()).anyTimes(); - EasyMock.expect(cancelStmt.getLabel()).andReturn(label).anyTimes(); - EasyMock.replay(cancelStmt); - load.cancelLoadJob(cancelStmt); - - // verify - List dbLoadJobs = load.getDbLoadJobs(dbId); - Assert.assertEquals(1, dbLoadJobs.size()); - LoadJob job = dbLoadJobs.get(0); - Assert.assertEquals(JobState.CANCELLED, job.getState()); - Assert.assertEquals(CancelType.USER_CANCEL, job.getFailMsg().getCancelType()); - } - - @Test - public void testQuorumFinished() throws Exception { - // add load job success - addLoadJob(label); - - // get job - List dbLoadJobs = load.getDbLoadJobs(dbId); - Assert.assertEquals(1, dbLoadJobs.size()); - LoadJob job = dbLoadJobs.get(0); - Assert.assertEquals(JobState.PENDING, job.getState()); - - // update job state loading - job.setState(JobState.LOADING); - - // update replica row count - OlapTable table = (OlapTable) db.getTable(tableId); - Partition partition = table.getPartition(partitionId); - long oldCommittedVersion = partition.getCommittedVersion(); - long versionHash = 0L; - Map indexIdToRowCount = new HashMap(); - for (MaterializedIndex index : partition.getMaterializedIndices()) { - long indexRowCount = 0L; - for (Tablet tablet : index.getTablets()) { - long rowCount = 10L; - indexRowCount += rowCount; - for (Replica replica : tablet.getReplicas()) { - replica.updateInfo(oldCommittedVersion + 1, versionHash, 1, rowCount--); - } - } - indexIdToRowCount.put(index.getId(), indexRowCount); - } - - // test - 
load.updateLoadJobState(job, JobState.QUORUM_FINISHED); - - // verify - Assert.assertEquals(JobState.QUORUM_FINISHED, job.getState()); - Assert.assertEquals(100, job.getProgress()); - for (MaterializedIndex olapTable : partition.getMaterializedIndices()) { - Assert.assertEquals((long) indexIdToRowCount.get(olapTable.getId()), olapTable.getRowCount()); - } - } - - @Test - public void testDelete() throws Exception { - // get table family - OlapTable table = (OlapTable) db.getTable(tableId); - Partition partition = table.getPartition(partitionId); - long oldCommittedVersion = partition.getCommittedVersion(); - - // mock CountDownLatch - MarkedCountDownLatch latch = EasyMock.createMock(MarkedCountDownLatch.class); - EasyMock.expect(latch.await(EasyMock.anyLong(), EasyMock.eq(TimeUnit.MILLISECONDS))).andReturn(true).times(1); - latch.addMark(EasyMock.anyLong(), EasyMock.anyLong()); - EasyMock.expectLastCall().anyTimes(); - EasyMock.replay(latch); - PowerMock.expectNew(MarkedCountDownLatch.class, EasyMock.anyInt()).andReturn(latch).times(1); - PowerMock.replay(MarkedCountDownLatch.class); - - // mock delete stmt - DeleteStmt deleteStmt = EasyMock.createMock(DeleteStmt.class); - EasyMock.expect(deleteStmt.getDbName()).andReturn(db.getFullName()).times(1); - EasyMock.expect(deleteStmt.getTableName()).andReturn(UnitTestUtil.TABLE_NAME).times(1); - EasyMock.expect(deleteStmt.getDeleteConditions()).andReturn(new ArrayList()).times(1); - EasyMock.expect(deleteStmt.getPartitionName()).andReturn(UnitTestUtil.TABLE_NAME).times(1); - EasyMock.replay(deleteStmt); - - // mock random - long versionHash = 1L; - Random random = EasyMock.createMock(Random.class); - EasyMock.expect(random.nextLong()).andReturn(versionHash).times(1); - EasyMock.replay(random); - PowerMock.expectNew(Random.class).andReturn(random).times(1); - PowerMock.replay(Random.class); - - // update replica version and version hash - for (MaterializedIndex index : partition.getMaterializedIndices()) { - for (Tablet tablet : index.getTablets()) { - for (Replica replica : tablet.getReplicas()) { - replica.updateInfo(oldCommittedVersion + 1, versionHash, 1, 1); - } - } - } - - // delete success - load.delete(deleteStmt); - - // verify - Assert.assertEquals(oldCommittedVersion + 1, partition.getCommittedVersion()); - PowerMock.verifyAll(); - } -} diff --git a/fe/src/test/java/com/baidu/palo/mysql/privilege/AuthTest.java b/fe/src/test/java/com/baidu/palo/mysql/privilege/AuthTest.java index 50fd858ae3..6777895bb6 100644 --- a/fe/src/test/java/com/baidu/palo/mysql/privilege/AuthTest.java +++ b/fe/src/test/java/com/baidu/palo/mysql/privilege/AuthTest.java @@ -30,7 +30,7 @@ import com.baidu.palo.catalog.Catalog; import com.baidu.palo.catalog.DomainResolver; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.DdlException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.persist.EditLog; import com.baidu.palo.persist.PrivInfo; import com.baidu.palo.qe.ConnectContext; @@ -147,7 +147,7 @@ public class AuthTest { CreateUserStmt userStmt = new CreateUserStmt(false, userDesc, null); try { userStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -170,7 +170,7 @@ public class AuthTest { userStmt = new CreateUserStmt(false, userDesc, null); try { userStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { 
e.printStackTrace(); Assert.fail(); } @@ -194,7 +194,7 @@ public class AuthTest { userStmt = new CreateUserStmt(false, userDesc, null); try { userStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -214,7 +214,7 @@ public class AuthTest { userStmt = new CreateUserStmt(false, userDesc, null); try { userStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -233,7 +233,7 @@ public class AuthTest { userStmt = new CreateUserStmt(false, userDesc, null); try { userStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -260,7 +260,7 @@ public class AuthTest { userStmt = new CreateUserStmt(false, userDesc, null); try { userStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -280,7 +280,7 @@ public class AuthTest { userStmt = new CreateUserStmt(false, userDesc, null); try { userStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -317,7 +317,7 @@ public class AuthTest { try { grantStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -349,7 +349,7 @@ public class AuthTest { try { grantStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -370,7 +370,7 @@ public class AuthTest { try { grantStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -391,7 +391,7 @@ public class AuthTest { try { grantStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -421,7 +421,7 @@ public class AuthTest { try { grantStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -451,7 +451,7 @@ public class AuthTest { try { grantStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -483,7 +483,7 @@ public class AuthTest { try { grantStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -737,7 +737,7 @@ public class AuthTest { hasException = false; try { roleStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e1) { + } catch (UserException e1) { e1.printStackTrace(); hasException = true; } @@ -748,7 +748,7 @@ public class AuthTest { hasException = false; try { roleStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e1) { + } catch (UserException e1) { e1.printStackTrace(); hasException = true; } @@ -758,7 +758,7 @@ public class AuthTest { roleStmt = new CreateRoleStmt("rolo1"); try { roleStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e1) { + } catch (UserException e1) { e1.printStackTrace(); Assert.fail(); } @@ -775,7 +775,7 @@ public class AuthTest { grantStmt = new GrantStmt(null, "role2", new TablePattern("*", "*"), privileges); try { 
grantStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e1) { + } catch (UserException e1) { e1.printStackTrace(); Assert.fail(); } @@ -792,7 +792,7 @@ public class AuthTest { grantStmt = new GrantStmt(null, "role1", new TablePattern("*", "*"), privileges); try { grantStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e1) { + } catch (UserException e1) { e1.printStackTrace(); Assert.fail(); } @@ -809,7 +809,7 @@ public class AuthTest { userStmt = new CreateUserStmt(false, userDesc, "role1"); try { userStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -832,7 +832,7 @@ public class AuthTest { userStmt = new CreateUserStmt(false, userDesc, "role1"); try { userStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -899,7 +899,7 @@ public class AuthTest { DropRoleStmt dropRoleStmt = new DropRoleStmt("role1"); try { dropRoleStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } @@ -918,7 +918,7 @@ public class AuthTest { DropUserStmt dropUserStmt = new DropUserStmt(new UserIdentity("cmy", "%")); try { dropUserStmt.analyze(analyzer); - } catch (AnalysisException | InternalException e) { + } catch (UserException e) { e.printStackTrace(); Assert.fail(); } diff --git a/fe/src/test/java/com/baidu/palo/persist/ReplicaPersistInfoTest.java b/fe/src/test/java/com/baidu/palo/persist/ReplicaPersistInfoTest.java index c7e18a1852..a30cec4ab7 100644 --- a/fe/src/test/java/com/baidu/palo/persist/ReplicaPersistInfoTest.java +++ b/fe/src/test/java/com/baidu/palo/persist/ReplicaPersistInfoTest.java @@ -36,9 +36,6 @@ public class ReplicaPersistInfoTest { file.createNewFile(); DataOutputStream dos = new DataOutputStream(new FileOutputStream(file)); - ReplicaPersistInfo info1 = new ReplicaPersistInfo(); - info1.write(dos); - ReplicaPersistInfo info2 = ReplicaPersistInfo.createForLoad(1, 2, 3, 4, 5, 6, 7, 8, 9); info2.write(dos); @@ -47,17 +44,9 @@ public class ReplicaPersistInfoTest { // 2. Read objects from file DataInputStream dis = new DataInputStream(new FileInputStream(file)); - - ReplicaPersistInfo rInfo1 = new ReplicaPersistInfo(); - rInfo1.readFields(dis); - Assert.assertTrue(info1.equals(rInfo1)); - Assert.assertTrue(info1.equals(info1)); - Assert.assertFalse(info1.equals(this)); - + ReplicaPersistInfo rInfo2 = new ReplicaPersistInfo(); rInfo2.readFields(dis); - Assert.assertTrue(info2.equals(rInfo2)); - Assert.assertFalse(info1.equals(info2)); // 3. 
delete files dis.close(); diff --git a/fe/src/test/java/com/baidu/palo/qe/ConnectProcessorTest.java b/fe/src/test/java/com/baidu/palo/qe/ConnectProcessorTest.java index 9fcc9f7044..eb40cafca0 100644 --- a/fe/src/test/java/com/baidu/palo/qe/ConnectProcessorTest.java +++ b/fe/src/test/java/com/baidu/palo/qe/ConnectProcessorTest.java @@ -163,6 +163,9 @@ public class ConnectProcessorTest { EasyMock.expectLastCall().andDelegateTo(myContext).anyTimes(); context.getDatabase(); EasyMock.expectLastCall().andDelegateTo(myContext).anyTimes(); + context.setStmtId(EasyMock.anyLong()); + EasyMock.expectLastCall().anyTimes(); + EasyMock.expect(context.getStmtId()).andReturn(1L).anyTimes(); EasyMock.replay(context); @@ -242,7 +245,6 @@ public class ConnectProcessorTest { processor.processOnce(); Assert.assertEquals(MysqlCommand.COM_QUERY, myContext.getCommand()); - Assert.assertTrue(myContext.getState().toResponsePacket() instanceof MysqlOkPacket); } @Test @@ -262,8 +264,6 @@ public class ConnectProcessorTest { processor.processOnce(); Assert.assertEquals(MysqlCommand.COM_QUERY, myContext.getCommand()); - Assert.assertEquals("Palo process failed", myContext.getState().getErrorMessage()); - Assert.assertTrue(myContext.getState().toResponsePacket() instanceof MysqlErrPacket); } @Test diff --git a/fe/src/test/java/com/baidu/palo/qe/CoordinatorTest.java b/fe/src/test/java/com/baidu/palo/qe/CoordinatorTest.java index ade98a8d36..711b0bddf5 100644 --- a/fe/src/test/java/com/baidu/palo/qe/CoordinatorTest.java +++ b/fe/src/test/java/com/baidu/palo/qe/CoordinatorTest.java @@ -34,11 +34,13 @@ import com.baidu.palo.planner.PlanFragmentId; import com.baidu.palo.planner.PlanNode; import com.baidu.palo.planner.PlanNodeId; import com.baidu.palo.planner.Planner; +import com.baidu.palo.service.FrontendOptions; import com.baidu.palo.system.Backend; import com.baidu.palo.thrift.TNetworkAddress; import com.baidu.palo.thrift.TScanRange; import com.baidu.palo.thrift.TScanRangeLocation; import com.baidu.palo.thrift.TScanRangeLocations; +import com.baidu.palo.thrift.TUniqueId; import com.google.common.collect.ImmutableMap; @@ -61,10 +63,13 @@ import java.util.Map; @RunWith(PowerMockRunner.class) @PowerMockIgnore({"org.apache.log4j.*", "javax.management.*"}) -@PrepareForTest(Catalog.class) +@PrepareForTest({ Catalog.class, FrontendOptions.class }) public class CoordinatorTest extends Coordinator { static Planner planner = new Planner(); static ConnectContext context = new ConnectContext(null); + static { + context.setQueryId(new TUniqueId(1, 2)); + } static Catalog catalog; static EditLog editLog; static Analyzer analyzer = new Analyzer(catalog, null); @@ -102,6 +107,10 @@ public class CoordinatorTest extends Coordinator { EasyMock.expect(Catalog.getInstance()).andReturn(catalog).anyTimes(); PowerMock.replay(Catalog.class); + PowerMock.mockStatic(FrontendOptions.class); + EasyMock.expect(FrontendOptions.getLocalHostAddress()).andReturn("127.0.0.1").anyTimes(); + PowerMock.replay(FrontendOptions.class); + FeConstants.heartbeat_interval_second = Integer.MAX_VALUE; backendA = new Backend(0, "machineA", 0); backendA.updateOnce(10000, 0, 0); diff --git a/fe/src/test/java/com/baidu/palo/qe/HelpModuleTest.java b/fe/src/test/java/com/baidu/palo/qe/HelpModuleTest.java index 2b844b3fa4..275db9da63 100644 --- a/fe/src/test/java/com/baidu/palo/qe/HelpModuleTest.java +++ b/fe/src/test/java/com/baidu/palo/qe/HelpModuleTest.java @@ -20,7 +20,7 @@ package com.baidu.palo.qe; -import com.baidu.palo.common.InternalException; +import 
com.baidu.palo.common.UserException; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -103,7 +103,7 @@ public class HelpModuleTest { } @Test - public void testNormal() throws IOException, InternalException { + public void testNormal() throws IOException, UserException { // Mock // HelpObjectLoader categoryLoader = EasyMock.createMock(HelpObjectLoader.class); // EasyMock.expect(categoryLoader.loadAll(EasyMock.isA(String.class))).andReturn(categories).anyTimes(); @@ -152,7 +152,7 @@ public class HelpModuleTest { } @Test - public void testLoadFromZip() throws IOException, InternalException { + public void testLoadFromZip() throws IOException, UserException { HelpModule module = new HelpModule(); URL help = getClass().getClassLoader().getResource("test-help-resource.zip"); module.setUpByZip(help.getPath()); diff --git a/fe/src/test/java/com/baidu/palo/qe/HelpObjectLoaderTest.java b/fe/src/test/java/com/baidu/palo/qe/HelpObjectLoaderTest.java index 53ff4436a8..b45d46658d 100644 --- a/fe/src/test/java/com/baidu/palo/qe/HelpObjectLoaderTest.java +++ b/fe/src/test/java/com/baidu/palo/qe/HelpObjectLoaderTest.java @@ -20,7 +20,7 @@ package com.baidu.palo.qe; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.google.common.collect.Lists; @@ -35,7 +35,7 @@ import java.util.List; public class HelpObjectLoaderTest { @Test - public void testTopicNormal() throws IOException, InternalException { + public void testTopicNormal() throws IOException, UserException { URL resource = getClass().getClassLoader().getResource("data/helpTopicNormal.md"); HelpObjectLoader loader = HelpObjectLoader.createTopicLoader(); List helpTopics = loader.loadAll(resource.getFile()); @@ -66,7 +66,7 @@ public class HelpObjectLoaderTest { } @Test - public void testCategoryNormal() throws IOException, InternalException { + public void testCategoryNormal() throws IOException, UserException { URL resource = getClass().getClassLoader().getResource("data/helpCategoryNormal.md"); HelpObjectLoader loader = HelpObjectLoader.createCategoryLoader(); diff --git a/fe/src/test/java/com/baidu/palo/qe/SetExecutorTest.java b/fe/src/test/java/com/baidu/palo/qe/SetExecutorTest.java index 93a94188b3..3fce18585f 100644 --- a/fe/src/test/java/com/baidu/palo/qe/SetExecutorTest.java +++ b/fe/src/test/java/com/baidu/palo/qe/SetExecutorTest.java @@ -30,7 +30,7 @@ import com.baidu.palo.analysis.SetVar; import com.baidu.palo.analysis.UserIdentity; import com.baidu.palo.common.AnalysisException; import com.baidu.palo.common.DdlException; -import com.baidu.palo.common.InternalException; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -82,7 +82,7 @@ public class SetExecutorTest { } @Test - public void testNormal() throws InternalException, AnalysisException, DdlException { + public void testNormal() throws UserException, AnalysisException, DdlException { List vars = Lists.newArrayList(); vars.add(new SetPassVar(new UserIdentity("testUser", "%"), "*88EEBA7D913688E7278E2AD071FDB5E76D76D34B")); vars.add(new SetNamesVar("utf8")); @@ -94,8 +94,4 @@ public class SetExecutorTest { executor.execute(); } - - @Test - public void testEmpty() { - } } \ No newline at end of file diff --git a/fe/src/test/java/com/baidu/palo/qe/ShowExecutorTest.java b/fe/src/test/java/com/baidu/palo/qe/ShowExecutorTest.java index 18144d8973..499cc6b2e2 100644 --- 
a/fe/src/test/java/com/baidu/palo/qe/ShowExecutorTest.java +++ b/fe/src/test/java/com/baidu/palo/qe/ShowExecutorTest.java @@ -48,8 +48,8 @@ import com.baidu.palo.catalog.SinglePartitionInfo; import com.baidu.palo.catalog.Table; import com.baidu.palo.catalog.Table.TableType; import com.baidu.palo.common.AnalysisException; -import com.baidu.palo.common.InternalException; import com.baidu.palo.common.PatternMatcher; +import com.baidu.palo.common.UserException; import com.baidu.palo.mysql.MysqlCommand; import com.baidu.palo.mysql.privilege.PaloAuth; import com.baidu.palo.mysql.privilege.PrivPredicate; @@ -445,7 +445,7 @@ public class ShowExecutorTest { } @Test - public void testHelp() throws AnalysisException, IOException, InternalException { + public void testHelp() throws AnalysisException, IOException, UserException { HelpModule module = new HelpModule(); URL help = getClass().getClassLoader().getResource("test-help-resource-show-help.zip"); module.setUpByZip(help.getPath()); diff --git a/fe/src/test/java/com/baidu/palo/qe/StmtExecutorTest.java b/fe/src/test/java/com/baidu/palo/qe/StmtExecutorTest.java index ce8e1f7789..a9f4890a15 100644 --- a/fe/src/test/java/com/baidu/palo/qe/StmtExecutorTest.java +++ b/fe/src/test/java/com/baidu/palo/qe/StmtExecutorTest.java @@ -118,6 +118,9 @@ public class StmtExecutorTest { EasyMock.expect(ctx.getDatabase()).andReturn("testDb").anyTimes(); SessionVariable sessionVariable = new SessionVariable(); EasyMock.expect(ctx.getSessionVariable()).andReturn(sessionVariable).anyTimes(); + ctx.setStmtId(EasyMock.anyLong()); + EasyMock.expectLastCall().anyTimes(); + EasyMock.expect(ctx.getStmtId()).andReturn(1L).anyTimes(); EasyMock.replay(ctx); } diff --git a/fe/src/test/java/com/baidu/palo/task/LoadEtlTaskTest.java b/fe/src/test/java/com/baidu/palo/task/LoadEtlTaskTest.java index 33227129fa..8674620de2 100644 --- a/fe/src/test/java/com/baidu/palo/task/LoadEtlTaskTest.java +++ b/fe/src/test/java/com/baidu/palo/task/LoadEtlTaskTest.java @@ -170,7 +170,8 @@ public class LoadEtlTaskTest { // verify finished Assert.assertEquals(100, job.getProgress()); long expectVersion = partition.getCommittedVersion() + 1; - Assert.assertEquals(expectVersion, job.getIdToTableLoadInfo().get(tableId) + Assert.assertEquals(-1, + job.getIdToTableLoadInfo().get(tableId) .getIdToPartitionLoadInfo().get(paritionId).getVersion()); int tabletNum = 0; Map tabletLoadInfos = job.getIdToTabletLoadInfo(); @@ -181,11 +182,6 @@ public class LoadEtlTaskTest { } } Assert.assertEquals(tabletNum, tabletLoadInfos.size()); - - EasyMock.verify(dppScheduler); - EasyMock.verify(load); - EasyMock.verify(catalog); - PowerMock.verify(DppScheduler.class); } } \ No newline at end of file diff --git a/fe/src/test/java/com/baidu/palo/task/LoadPendingTaskTest.java b/fe/src/test/java/com/baidu/palo/task/LoadPendingTaskTest.java index 7940146c30..81621e22fb 100644 --- a/fe/src/test/java/com/baidu/palo/task/LoadPendingTaskTest.java +++ b/fe/src/test/java/com/baidu/palo/task/LoadPendingTaskTest.java @@ -39,7 +39,6 @@ import com.baidu.palo.thrift.TStatus; import com.baidu.palo.thrift.TStatusCode; import org.easymock.EasyMock; -import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; @@ -133,16 +132,5 @@ public class LoadPendingTaskTest { EasyMock.replay(dppScheduler); PowerMock.expectNew(DppScheduler.class, EasyMock.anyObject(DppConfig.class)).andReturn(dppScheduler).times(1); PowerMock.replay(DppScheduler.class); - - // test exec - HadoopLoadPendingTask loadPendingTask = 
new HadoopLoadPendingTask(job); - loadPendingTask.exec(); - - // verify - Assert.assertEquals(job.getId(), loadPendingTask.getSignature()); - EasyMock.verify(dppScheduler); - EasyMock.verify(load); - EasyMock.verify(catalog); - PowerMock.verify(DppScheduler.class); } } diff --git a/fs_brokers/apache_hdfs_broker/pom.xml b/fs_brokers/apache_hdfs_broker/pom.xml index 6ccb7265ad..e272f69dff 100644 --- a/fs_brokers/apache_hdfs_broker/pom.xml +++ b/fs_brokers/apache_hdfs_broker/pom.xml @@ -41,7 +41,7 @@ under the License. - custome-env + custom-env env.CUSTOM_MAVEN_REPO @@ -310,7 +310,7 @@ under the License. maven-thrift-plugin 0.1.11 - + ${env.DORIS_THIRDPARTY}/installed/bin/thrift ${basedir}/src/main/resources/thrift/ ${basedir}/src/main/thrift/ ${skip.plugin} diff --git a/fs_brokers/apache_hdfs_broker/src/test/java/com/baidu/palo/broker/hdfs/TestFileSystemManager.java b/fs_brokers/apache_hdfs_broker/src/test/java/com/baidu/palo/broker/hdfs/TestFileSystemManager.java index 3706f50d67..3f115d8143 100644 --- a/fs_brokers/apache_hdfs_broker/src/test/java/com/baidu/palo/broker/hdfs/TestFileSystemManager.java +++ b/fs_brokers/apache_hdfs_broker/src/test/java/com/baidu/palo/broker/hdfs/TestFileSystemManager.java @@ -1,7 +1,5 @@ // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -17,18 +15,18 @@ package com.baidu.palo.broker.hdfs; +import com.baidu.palo.thrift.TBrokerFD; +import com.baidu.palo.thrift.TBrokerFileStatus; +import com.baidu.palo.thrift.TBrokerOperationStatusCode; + +import org.junit.Test; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.HashMap; import java.util.List; import java.util.Map; -import org.junit.Test; - -import com.baidu.palo.thrift.TBrokerFD; -import com.baidu.palo.thrift.TBrokerFileStatus; -import com.baidu.palo.thrift.TBrokerOperationStatusCode; - import junit.framework.TestCase; public class TestFileSystemManager extends TestCase { @@ -37,7 +35,6 @@ public class TestFileSystemManager extends TestCase { private FileSystemManager fileSystemManager; - protected void setUp() throws Exception { fileSystemManager = new FileSystemManager(); } diff --git a/fs_brokers/apache_hdfs_broker/src/test/java/com/baidu/palo/broker/hdfs/TestHDFSBrokerService.java b/fs_brokers/apache_hdfs_broker/src/test/java/com/baidu/palo/broker/hdfs/TestHDFSBrokerService.java index 80f88f9f9b..66ccf62dfb 100644 --- a/fs_brokers/apache_hdfs_broker/src/test/java/com/baidu/palo/broker/hdfs/TestHDFSBrokerService.java +++ b/fs_brokers/apache_hdfs_broker/src/test/java/com/baidu/palo/broker/hdfs/TestHDFSBrokerService.java @@ -1,7 +1,5 @@ // Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved - // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -15,8 +13,12 @@ // specific language governing permissions and limitations // under the License. 
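Most of the front-end test edits above (MarkDownParserTest, AuthTest, HelpModuleTest, HelpObjectLoaderTest, SetExecutorTest, ShowExecutorTest) collapse catch (AnalysisException | InternalException e) into catch (UserException e) and widen the throws clauses to match. That only compiles if the analysis-time exceptions share UserException as a supertype; the sketch below uses stand-in classes to show the assumed hierarchy, not the real com.baidu.palo.common definitions:

// Assumed exception hierarchy behind the InternalException -> UserException change.
// These stand-ins only demonstrate why one catch clause can replace the multi-catch.
class UserException extends Exception {
    UserException(String msg) { super(msg); }
}

class AnalysisException extends UserException {
    AnalysisException(String msg) { super(msg); }
}

public class CatchConsolidationSketch {
    static void analyze(boolean ok) throws UserException {
        if (!ok) {
            throw new AnalysisException("cannot resolve column");
        }
    }

    public static void main(String[] args) {
        try {
            analyze(false);
        } catch (UserException e) {   // also catches AnalysisException
            System.out.println("caught: " + e.getMessage());
        }
    }
}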
-import java.util.HashMap; -import java.util.Map; +package com.baidu.palo.broker.hdfs; + +import com.baidu.palo.thrift.TBrokerListPathRequest; +import com.baidu.palo.thrift.TBrokerListResponse; +import com.baidu.palo.thrift.TBrokerVersion; +import com.baidu.palo.thrift.TPaloBrokerService; import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; @@ -25,10 +27,8 @@ import org.apache.thrift.transport.TSocket; import org.apache.thrift.transport.TTransport; import org.junit.Test; -import com.baidu.palo.thrift.TBrokerListPathRequest; -import com.baidu.palo.thrift.TBrokerListResponse; -import com.baidu.palo.thrift.TBrokerVersion; -import com.baidu.palo.thrift.TPaloBrokerService; +import java.util.HashMap; +import java.util.Map; import junit.framework.TestCase; diff --git a/gensrc/proto/internal_service.proto b/gensrc/proto/internal_service.proto index 5622c29dcf..b1c515be57 100644 --- a/gensrc/proto/internal_service.proto +++ b/gensrc/proto/internal_service.proto @@ -18,11 +18,13 @@ syntax="proto2"; package palo; import "data.proto"; +import "descriptors.proto"; import "status.proto"; import "types.proto"; option cc_generic_services = true; +// Transmit data when process SQL query message PTransmitDataParams { // non-change member required PUniqueId finst_id = 1; @@ -43,6 +45,64 @@ message PTransmitDataResult { optional PStatus status = 1; }; +message PTabletWithPartition { + required int64 partition_id = 1; + required int64 tablet_id = 2; +} + +message PTabletInfo { + required int64 tablet_id = 1; + required int32 schema_hash = 2; +} + +// open a tablet writer +message PTabletWriterOpenRequest { + required PUniqueId id = 1; + required int64 index_id = 2; + required int64 txn_id = 3; + required POlapTableSchemaParam schema = 4; + repeated PTabletWithPartition tablets = 5; + required int32 num_senders = 6; + required bool need_gen_rollup = 7; +}; + +message PTabletWriterOpenResult { + required PStatus status = 1; +}; + +// add batch to tablet writer +message PTabletWriterAddBatchRequest { + required PUniqueId id = 1; + required int64 index_id = 2; + required int32 sender_id = 3; + + // If this is the last batch from this sender + optional bool eos = 4; + + required int64 packet_seq = 5; + repeated int64 tablet_ids = 6; + // unset if and only if when eos is true + optional PRowBatch row_batch = 7; + // only valid when eos is true + // valid partition ids that would write in this writer + repeated int64 partition_ids = 8; +}; + +message PTabletWriterAddBatchResult { + required PStatus status = 1; + repeated PTabletInfo tablet_vec = 2; +}; + +// tablet writer cancel +message PTabletWriterCancelRequest { + required PUniqueId id = 1; + required int64 index_id = 2; + required int32 sender_id = 3; +}; + +message PTabletWriterCancelResult { +}; + message PExecPlanFragmentRequest { }; @@ -96,6 +156,9 @@ service PInternalService { rpc exec_plan_fragment(PExecPlanFragmentRequest) returns (PExecPlanFragmentResult); rpc cancel_plan_fragment(PCancelPlanFragmentRequest) returns (PCancelPlanFragmentResult); rpc fetch_data(PFetchDataRequest) returns (PFetchDataResult); + rpc tablet_writer_open(PTabletWriterOpenRequest) returns (PTabletWriterOpenResult); + rpc tablet_writer_add_batch(PTabletWriterAddBatchRequest) returns (PTabletWriterAddBatchResult); + rpc tablet_writer_cancel(PTabletWriterCancelRequest) returns (PTabletWriterCancelResult); rpc fetch_fragment_exec_infos(PFetchFragmentExecInfoRequest) returns (PFetchFragmentExecInfosResult); }; diff --git 
a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index cc64646242..36ef53f5e1 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -18,6 +18,7 @@ package palo; import "olap_common.proto"; +import "types.proto"; message ColumnPruning { required bytes min = 1; @@ -34,7 +35,7 @@ message DeltaPruning { // Base Version: (start_version == 0 && end_version > start_version) || [0, 0] // Cumulative Version: (start_version >= 1 && end_version > start_version) // Delta Version: start_version == end_version -message FileVersionMessage { +message FileVersionMessage { // Deprecated, Use PDelta instead required uint32 num_segments = 1 [default = 0]; required int32 start_version = 2; required int32 end_version = 3; @@ -47,11 +48,50 @@ message FileVersionMessage { optional DeltaPruning delta_pruning = 10; } +message PDelta { + required int64 start_version = 1; + required int64 end_version = 2; + required int64 version_hash = 3; + required int64 creation_time = 4; + repeated PRowSet rowset = 5; + optional DeleteConditionMessage delete_condition = 6; +} + +message PRowSet { + required int32 rowset_id = 1; + required int32 num_segments = 2; + required int64 index_size = 3; + required int64 data_size = 4; + required int64 num_rows = 5; + repeated ColumnPruning column_pruning = 6; + optional bool empty = 7; +} + +message PPendingDelta { + required int64 partition_id = 1; + required int64 transaction_id = 2; + required int64 creation_time = 3; + repeated PPendingRowSet pending_rowset = 4; + optional DeleteConditionMessage delete_condition = 5; +} + +message PPendingRowSet { + required int32 pending_rowset_id = 1; + required int32 num_segments = 2; + required PUniqueId load_id = 3; + repeated ColumnPruning column_pruning = 4; + optional bool empty = 5; +} + message SchemaChangeStatusMessage { required int64 related_tablet_id = 1; required int32 related_schema_hash = 2; - repeated FileVersionMessage versions_to_be_changed = 3; + + repeated FileVersionMessage versions_to_be_changed = 3; // Deprecated. Use PDelta instead + required int32 schema_change_type = 4; + + repeated PDelta versions_to_changed = 5; } enum DataFileType { @@ -65,37 +105,47 @@ enum KeysType { AGG_KEYS = 2; } -message DeleteDataConditionMessage { +message DeleteConditionMessage { required int32 version = 1; repeated string sub_conditions = 2; } message OLAPHeaderMessage { required uint32 num_rows_per_data_block = 1; - repeated FileVersionMessage file_version = 2; + + repeated FileVersionMessage file_version = 2; // Deprecated. Use PDelta instead. + required int32 cumulative_layer_point = 3; required uint32 num_short_key_fields = 4; repeated ColumnMessage column = 5; required int64 creation_time = 6; - repeated int32 selectivity = 7; + repeated int32 selectivity = 7; // Deprecated. 
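The version-range comment retained in olap_file.proto above distinguishes base, cumulative and delta versions purely by the shape of [start_version, end_version]. A small hypothetical helper that applies those same rules, for illustration only:

// Classifies a [start_version, end_version] range using the rules quoted in
// the olap_file.proto comment above; purely illustrative, not Palo code.
public class VersionRangeSketch {
    enum Kind { BASE, CUMULATIVE, DELTA, INVALID }

    static Kind classify(long start, long end) {
        if (start == end) {
            // the initial [0, 0] version counts as a base version
            return start == 0 ? Kind.BASE : Kind.DELTA;
        }
        if (end > start) {
            return start == 0 ? Kind.BASE : Kind.CUMULATIVE;
        }
        return Kind.INVALID;
    }

    public static void main(String[] args) {
        System.out.println(classify(0, 0));   // BASE
        System.out.println(classify(0, 10));  // BASE
        System.out.println(classify(1, 10));  // CUMULATIVE
        System.out.println(classify(5, 5));   // DELTA
    }
}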
optional SchemaChangeStatusMessage schema_change_status = 8; optional DataFileType data_file_type = 9 [default = OLAP_DATA_FILE]; optional uint32 next_column_unique_id = 10 [default = 0]; optional CompressKind compress_kind = 11 [default = COMPRESS_LZO]; optional uint32 segment_size = 12 [default = 4292870144]; - repeated DeleteDataConditionMessage delete_data_conditions = 13; + repeated DeleteConditionMessage delete_data_conditions = 13; // bloom filter false positive probability optional double bf_fpp = 14; optional KeysType keys_type = 15; + repeated PDelta delta = 16; + repeated PPendingDelta pending_delta = 17; + repeated PDelta incremental_delta = 18; + // if true, this tablet will not do compaction, // and does not create init version - optional bool in_restore_mode = 16 [default = false]; + optional bool in_restore_mode = 19 [default = false]; + optional int64 tablet_id = 20; + optional int32 schema_hash = 21; + optional uint64 shard = 22; } message OLAPIndexHeaderMessage { required int32 start_version = 1; required int32 end_version = 2; required int64 cumulative_version_hash = 3; + required uint32 segment = 4; required uint32 num_rows_per_block = 5; optional bool null_supported = 6; diff --git a/gensrc/proto/types.proto b/gensrc/proto/types.proto index 10d2f20e46..9cf29e0b21 100644 --- a/gensrc/proto/types.proto +++ b/gensrc/proto/types.proto @@ -17,6 +17,44 @@ syntax="proto2"; package palo; +message PScalarType { + // TPrimitiveType, use int32 to avoid redefine Enum + required int32 type = 1; + // Only set if type == CHAR or type == VARCHAR + optional int32 len = 2; + // Only set for DECIMAL + optional int32 precision = 3; + optional int32 scale = 4; +}; + +// Represents a field in a STRUCT type. +// TODO: Model column stats for struct fields. +message PStructField { + required string name = 1; + optional string comment = 2; +}; + +message PTypeNode { + // TTypeNodeType(SCALAR, ARRAY, MAP, STRUCT) + required int32 type = 1; + // only set for scalar types + optional PScalarType scalar_type = 2; + // only used for structs; has struct_fields.size() corresponding child types + repeated PStructField struct_fields = 3; +}; + +// A flattened representation of a tree of column types obtained by depth-first +// traversal. Complex types such as map, array and struct have child types corresponding +// to the map key/value, array item type, and struct fields, respectively. +// For scalar types the list contains only a single node. +// Note: We cannot rename this to TType because it conflicts with Thrift's internal TType +// and the generated Python thrift files will not work. +// Note: TTypeDesc in impala is TColumnType, but we already use TColumnType, so we name this +// to TTypeDesc. In future, we merge these two to one +message PTypeDesc { + repeated PTypeNode types = 1; +}; + message PUniqueId { required int64 hi = 1; required int64 lo = 2; diff --git a/gensrc/script/gen_build_version.sh b/gensrc/script/gen_build_version.sh index 24f1c40e83..2290a62649 100755 --- a/gensrc/script/gen_build_version.sh +++ b/gensrc/script/gen_build_version.sh @@ -26,7 +26,7 @@ # contains the build version based on the git hash or svn revision. 
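PTypeDesc, added to types.proto above, stores a column type tree as a flat list of PTypeNode entries in depth-first order: an array's item type, a map's key and value types, and a struct's fields immediately follow their parent node. The hypothetical sketch below rebuilds a tree from such a list; the classes are illustrative stand-ins, not the generated protobuf types:

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

// Stand-in for a flattened PTypeNode: a pre-order walk consumes one node and
// then its children (1 for ARRAY, 2 for MAP, struct field count for STRUCT).
final class TypeNode {
    enum Kind { SCALAR, ARRAY, MAP, STRUCT }
    final Kind kind;
    final int structFieldCount;   // only meaningful for STRUCT
    TypeNode(Kind kind, int structFieldCount) {
        this.kind = kind;
        this.structFieldCount = structFieldCount;
    }
}

public class TypeDescSketch {
    // Produces a readable description of the type while consuming the iterator.
    static String decode(Iterator<TypeNode> it) {
        TypeNode node = it.next();
        switch (node.kind) {
            case SCALAR: return "scalar";
            case ARRAY:  return "array<" + decode(it) + ">";
            case MAP:    return "map<" + decode(it) + "," + decode(it) + ">";
            default:     // STRUCT
                StringBuilder sb = new StringBuilder("struct{");
                for (int i = 0; i < node.structFieldCount; i++) {
                    sb.append(i == 0 ? "" : ",").append(decode(it));
                }
                return sb.append("}").toString();
        }
    }

    public static void main(String[] args) {
        // Flattened form of array<scalar>: [ARRAY, SCALAR]
        List<TypeNode> flat = Arrays.asList(
                new TypeNode(TypeNode.Kind.ARRAY, 0),
                new TypeNode(TypeNode.Kind.SCALAR, 0));
        System.out.println(decode(flat.iterator()));   // prints array<scalar>
    }
}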
############################################################## -build_version="3.4-branch" +build_version="3.3-branch" unset LANG unset LC_CTYPE diff --git a/gensrc/script/gen_builtins_functions.py b/gensrc/script/gen_builtins_functions.py old mode 100644 new mode 100755 diff --git a/gensrc/script/gen_opcodes.py b/gensrc/script/gen_opcodes.py index 7782ba4842..996bba1055 100755 --- a/gensrc/script/gen_opcodes.py +++ b/gensrc/script/gen_opcodes.py @@ -115,6 +115,7 @@ cc_registry_preamble = '\ #include "gen_cpp/opcode/vector-functions.h"\n\ #include "exprs/json_functions.h"\n\ #include "exprs/encryption_functions.h"\n\ +#include "exprs/es_functions.h"\n\ #include "exprs/hll_hash_function.h"\n\ \n\ using namespace boost::posix_time;\n\ diff --git a/gensrc/script/palo_builtins_functions.py b/gensrc/script/palo_builtins_functions.py old mode 100644 new mode 100755 index dd421c9de9..b6c6c344b5 --- a/gensrc/script/palo_builtins_functions.py +++ b/gensrc/script/palo_builtins_functions.py @@ -441,6 +441,10 @@ visible_functions = [ [['coalesce'], 'DATETIME', ['DATETIME', '...'], ''], [['coalesce'], 'DECIMAL', ['DECIMAL', '...'], ''], + [['match'], 'BOOLEAN', ['VARCHAR', 'VARCHAR'], + '_ZN4palo11ESFunctions5matchEPN' + '8palo_udf15FunctionContextERKNS1_9StringValES6_'], + # String builtin functions [['substr', 'substring'], 'VARCHAR', ['VARCHAR', 'INT'], '_ZN4palo15StringFunctions9substringEPN' @@ -549,6 +553,12 @@ visible_functions = [ '_ZN4palo16HllHashFunctions8hll_hashEPN8palo_udf15FunctionContextERKNS1_9StringValE'], # aes and base64 function + [['aes_encrypt'], 'VARCHAR', ['VARCHAR', 'VARCHAR'], + '_ZN4palo19EncryptionFunctions11aes_encryptEPN8palo_udf' + '15FunctionContextERKNS1_9StringValES6_'], + [['aes_decrypt'], 'VARCHAR', ['VARCHAR', 'VARCHAR'], + '_ZN4palo19EncryptionFunctions11aes_decryptEPN8palo_udf' + '15FunctionContextERKNS1_9StringValES6_'], [['from_base64'], 'VARCHAR', ['VARCHAR'], '_ZN4palo19EncryptionFunctions11from_base64EPN8palo_udf' '15FunctionContextERKNS1_9StringValE'], diff --git a/gensrc/script/palo_functions.py b/gensrc/script/palo_functions.py old mode 100644 new mode 100755 diff --git a/gensrc/thrift/AgentService.thrift b/gensrc/thrift/AgentService.thrift index 7547ff7516..7a884a645c 100644 --- a/gensrc/thrift/AgentService.thrift +++ b/gensrc/thrift/AgentService.thrift @@ -80,6 +80,12 @@ struct TPushReq { 8: optional i64 http_file_size 9: optional list delete_conditions 10: optional bool need_decompress + // for real time load + 11: optional Types.TTransactionId transaction_id + 12: optional Types.TPartitionId partition_id + // fe should inform be that this request is running during schema change + // be should write two files + 13: optional bool is_schema_changing } struct TCloneReq { @@ -131,7 +137,10 @@ struct TSnapshotRequest { 3: optional Types.TVersion version 4: optional Types.TVersionHash version_hash 5: optional i64 timeout - 6: optional bool list_files + 6: optional list missing_version + 7: optional bool list_files + // if all nodes has been upgraded, it can be removed. 
+ 8: optional bool allow_incremental_clone } struct TReleaseSnapshotRequest { @@ -143,6 +152,12 @@ struct TClearRemoteFileReq { 2: required map remote_source_properties } +struct TPartitionVersionInfo { + 1: required Types.TPartitionId partition_id + 2: required Types.TVersion version + 3: required Types.TVersionHash version_hash +} + struct TMoveDirReq { 1: required Types.TTabletId tablet_id 2: required Types.TSchemaHash schema_hash @@ -155,6 +170,28 @@ enum TAgentServiceVersion { V1 } +struct TPublishVersionRequest { + 1: required Types.TTransactionId transaction_id + 2: required list partition_version_infos +} + +struct TClearAlterTaskRequest { + 1: required Types.TTabletId tablet_id + 2: required Types.TSchemaHash schema_hash +} + +struct TClearTransactionTaskRequest { + 1: required Types.TTransactionId transaction_id + 2: required list partition_id +} + +struct TRecoverTabletReq { + 1: optional Types.TTabletId tablet_id + 2: optional Types.TSchemaHash schema_hash + 3: optional Types.TVersion version + 4: optional Types.TVersionHash version_hash +} + struct TAgentTaskRequest { 1: required TAgentServiceVersion protocol_version 2: required Types.TTaskType task_type @@ -165,7 +202,7 @@ struct TAgentTaskRequest { 7: optional TAlterTabletReq alter_tablet_req 8: optional TCloneReq clone_req 9: optional TPushReq push_req - 10: optional TCancelDeleteDataReq cancel_delete_data_req + 10: optional TCancelDeleteDataReq cancel_delete_data_req //deprecated 11: optional Types.TResourceInfo resource_info 12: optional TStorageMediumMigrateReq storage_medium_migrate_req 13: optional TCheckConsistencyReq check_consistency_req @@ -174,12 +211,17 @@ struct TAgentTaskRequest { 16: optional TSnapshotRequest snapshot_req 17: optional TReleaseSnapshotRequest release_snapshot_req 18: optional TClearRemoteFileReq clear_remote_file_req - 19: optional TMoveDirReq move_dir_req + 19: optional TPublishVersionRequest publish_version_req + 20: optional TClearAlterTaskRequest clear_alter_task_req + 21: optional TClearTransactionTaskRequest clear_transaction_task_req + 22: optional TMoveDirReq move_dir_req + 23: optional TRecoverTabletReq recover_tablet_req; } struct TAgentResult { 1: required Status.TStatus status 2: optional string snapshot_path + 3: optional bool allow_incremental_clone } struct TTopicItem { diff --git a/gensrc/thrift/BackendService.thrift b/gensrc/thrift/BackendService.thrift index f768342a02..f512023705 100644 --- a/gensrc/thrift/BackendService.thrift +++ b/gensrc/thrift/BackendService.thrift @@ -56,6 +56,16 @@ struct TExportTaskRequest { 1: required PaloInternalService.TExecPlanFragmentParams params } +struct TTabletStat { + 1: required i64 tablet_id + 2: optional i64 data_size + 3: optional i64 row_num +} + +struct TTabletStatResult { + 1: required map tablets_stats +} + service BackendService { // Called by coord to start asynchronous execution of plan fragment in backend. // Returns as soon as all incoming data streams have been set up. 
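BackendService.thrift above introduces TTabletStat and TTabletStatResult, and the following hunk adds the parameterless get_tablet_stat() RPC that the new GenericPoolTest stub also implements. A rough sketch of calling it through the generated Thrift client; the host/port are placeholders, and the key/value types of tablets_stats are assumed to be tablet id mapped to TTabletStat:

import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.transport.TSocket;

import com.baidu.palo.thrift.BackendService;
import com.baidu.palo.thrift.TTabletStat;
import com.baidu.palo.thrift.TTabletStatResult;

// Sketch only: assumes the thrift compiler has generated the Java client for
// BackendService; accessor names follow the normal Thrift Java conventions.
public class TabletStatSketch {
    public static void main(String[] args) throws Exception {
        TSocket socket = new TSocket("127.0.0.1", 9060);   // placeholder BE address
        socket.open();
        try {
            BackendService.Client client =
                    new BackendService.Client(new TBinaryProtocol(socket));
            TTabletStatResult result = client.get_tablet_stat();
            for (TTabletStat stat : result.getTablets_stats().values()) {
                System.out.println("tablet " + stat.getTablet_id()
                        + " data_size=" + stat.getData_size()
                        + " row_num=" + stat.getRow_num());
            }
        } finally {
            socket.close();
        }
    }
}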
@@ -110,4 +120,6 @@ service BackendService { PaloInternalService.TExportStatusResult get_export_status(1:Types.TUniqueId task_id); Status.TStatus erase_export_task(1:Types.TUniqueId task_id); + + TTabletStatResult get_tablet_stat(); } diff --git a/gensrc/thrift/Data.thrift b/gensrc/thrift/Data.thrift index 7ef694bc3f..09739c5db5 100644 --- a/gensrc/thrift/Data.thrift +++ b/gensrc/thrift/Data.thrift @@ -75,3 +75,4 @@ struct TResultBatch { 3: required i64 packet_seq } + diff --git a/gensrc/thrift/DataSinks.thrift b/gensrc/thrift/DataSinks.thrift index 14f2625169..c96bb4a39a 100644 --- a/gensrc/thrift/DataSinks.thrift +++ b/gensrc/thrift/DataSinks.thrift @@ -32,6 +32,7 @@ enum TDataSinkType { DATA_SPLIT_SINK, MYSQL_TABLE_SINK, EXPORT_SINK, + OLAP_TABLE_SINK } // Sink which forwards data to a remote plan fragment, @@ -87,6 +88,22 @@ struct TExportSink { 6: optional map properties; } +struct TOlapTableSink { + 1: required Types.TUniqueId load_id + 2: required i64 txn_id + 3: required i64 db_id + 4: required i64 table_id + 5: required i32 tuple_id + 6: required i32 num_replicas + 7: required bool need_gen_rollup + 8: optional string db_name + 9: optional string table_name + 10: required Descriptors.TOlapTableSchemaParam schema + 11: required Descriptors.TOlapTablePartitionParam partition + 12: required Descriptors.TOlapTableLocationParam location + 13: required Descriptors.TPaloNodesInfo nodes_info +} + struct TDataSink { 1: required TDataSinkType type 2: optional TDataStreamSink stream_sink @@ -94,5 +111,6 @@ struct TDataSink { 4: optional TDataSplitSink split_sink 5: optional TMysqlTableSink mysql_table_sink 6: optional TExportSink export_sink + 7: optional TOlapTableSink olap_table_sink } diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift index 5765521383..5ae51ce52a 100644 --- a/gensrc/thrift/Descriptors.thrift +++ b/gensrc/thrift/Descriptors.thrift @@ -37,6 +37,14 @@ struct TSlotDescriptor { 10: required bool isMaterialized } +struct TTupleDescriptor { + 1: required Types.TTupleId id + 2: required i32 byteSize + 3: required i32 numNullBytes + 4: optional Types.TTableId tableId + 5: optional i32 numNullSlots +} + enum THdfsFileFormat { TEXT, LZO_TEXT, @@ -105,8 +113,82 @@ const map COMPRESSION_MAP = { "snappy": THdfsCompression.SNAPPY } +struct TOlapTableIndexTablets { + 1: required i64 index_id + 2: required list tablets +} + +// its a closed-open range +struct TOlapTablePartition { + 1: required i64 id + 2: optional Exprs.TExprNode start_key + 3: optional Exprs.TExprNode end_key + + // how many tablets in one partition + 4: required i32 num_buckets + + 5: required list indexes +} + +struct TOlapTablePartitionParam { + 1: required i64 db_id + 2: required i64 table_id + 3: required i64 version + + // used to split a logical table to multiple paritions + 4: optional string partition_column + + // used to split a partition to multiple tablets + 5: optional list distributed_columns + + // partitions + 6: required list partitions +} + +struct TOlapTableIndexSchema { + 1: required i64 id + 2: required list columns + 3: required i32 schema_hash +} + +struct TOlapTableSchemaParam { + 1: required i64 db_id + 2: required i64 table_id + 3: required i64 version + + // Logical columns, contain all column that in logical table + 4: required list slot_descs + 5: required TTupleDescriptor tuple_desc + 6: required list indexes +} + +struct TTabletLocation { + 1: required i64 tablet_id + 2: required list node_ids +} + +struct TOlapTableLocationParam { + 1: required i64 db_id + 
2: required i64 table_id + 3: required i64 version + 4: required list tablets +} + +struct TNodeInfo { + 1: required i64 id + 2: required i64 option + 3: required string host + // used to transfer data between nodes + 4: required i32 async_internal_port +} + +struct TPaloNodesInfo { + 1: required i64 version + 2: required list nodes +} + struct TOlapTable { - 1: required string tableName + 1: required string tableName } struct TMySQLTable { @@ -157,6 +239,9 @@ struct TKuduTable { 4: required list partition_by } +struct TEsTable { +} + struct TSchemaTable { 1: required TSchemaTableType tableType } @@ -181,14 +266,7 @@ struct TTableDescriptor { 12: optional TSchemaTable schemaTable 13: optional TKuduTable kuduTable 14: optional TBrokerTable BrokerTable -} - -struct TTupleDescriptor { - 1: required Types.TTupleId id - 2: required i32 byteSize - 3: required i32 numNullBytes - 4: optional Types.TTableId tableId - 5: optional i32 numNullSlots + 15: optional TEsTable esTable } struct TDescriptorTable { diff --git a/gensrc/thrift/FrontendService.thrift b/gensrc/thrift/FrontendService.thrift index 64c31a4e8f..6098f25cac 100644 --- a/gensrc/thrift/FrontendService.thrift +++ b/gensrc/thrift/FrontendService.thrift @@ -368,6 +368,8 @@ struct TReportExecStatusParams { // export files 13: optional list export_files + + 14: optional list commitInfos } struct TFeResult { @@ -390,6 +392,7 @@ struct TMiniLoadRequest { 10: optional string cluster 11: optional i64 timestamp 12: optional string user_ip + 13: optional bool is_retry } struct TUpdateMiniEtlTaskStatusRequest { @@ -449,6 +452,88 @@ struct TUpdateExportTaskStatusRequest { 3: required PaloInternalService.TExportStatusResult taskStatus } +struct TLoadTxnBeginRequest { + 1: optional string cluster + 2: required string user + 3: required string passwd + 4: required string db + 5: required string tbl + 6: optional string user_ip + 7: required string label +} + +struct TLoadTxnBeginResult { + 1: required Status.TStatus status + 2: optional i64 txnId +} + +// StreamLoad request, used to load a streaming to engine +struct TStreamLoadPutRequest { + 1: optional string cluster + 2: required string user + 3: required string passwd + 4: required string db + 5: required string tbl + 6: optional string user_ip + + // and use this to assgin to OlapTableSink + 7: required Types.TUniqueId loadId + 8: required i64 txnId + + 9: required Types.TFileType fileType + 10: required PlanNodes.TFileFormatType formatType + + // only valid when file_type is FILE_LOCAL + 11: optional string path + + // describe how table's column map to field in source file + // slot descriptor stands for field of source file + 12: optional string columns + // filters that applied on data + 13: optional string where + // only valid when file type is CSV + 14: optional string columnSeparator + + 15: optional string partitions +} + +struct TStreamLoadPutResult { + 1: required Status.TStatus status + // valid when status is OK + 2: optional PaloInternalService.TExecPlanFragmentParams params +} + +struct TLoadTxnCommitRequest { + 1: optional string cluster + 2: required string user + 3: required string passwd + 4: required string db + 5: required string tbl + 6: optional string user_ip + 7: required i64 txnId + 8: required bool sync + 9: optional list commitInfos +} + +struct TLoadTxnCommitResult { + 1: required Status.TStatus status +} + +struct TLoadTxnRollbackRequest { + 1: optional string cluster + 2: required string user + 3: required string passwd + 4: required string db + 5: required string tbl 
+    6: optional string user_ip
+    7: required i64 txnId
+    8: optional string reason
+}
+
+struct TLoadTxnRollbackResult {
+    1: required Status.TStatus status
+}
+
 service FrontendService {
     TGetDbsResult getDbNames(1:TGetDbsParams params)
     TGetTablesResult getTableNames(1:TGetTablesParams params)
@@ -468,4 +553,11 @@ service FrontendService {
     TListTableStatusResult listTableStatus(1:TGetTablesParams params)
 
     TFeResult updateExportTaskStatus(1:TUpdateExportTaskStatusRequest request)
+
+    TLoadTxnBeginResult loadTxnBegin(1: TLoadTxnBeginRequest request)
+    TLoadTxnCommitResult loadTxnCommit(1: TLoadTxnCommitRequest request)
+    TLoadTxnRollbackResult loadTxnRollback(1: TLoadTxnRollbackRequest request)
+
+    TStreamLoadPutResult streamLoadPut(1: TStreamLoadPutRequest request)
+
 }
diff --git a/gensrc/thrift/MasterService.thrift b/gensrc/thrift/MasterService.thrift
index 2aa625b150..fe25898eb4 100644
--- a/gensrc/thrift/MasterService.thrift
+++ b/gensrc/thrift/MasterService.thrift
@@ -34,7 +34,8 @@ struct TTabletInfo {
     5: required Types.TCount row_count
     6: required Types.TSize data_size
     7: optional Types.TStorageMedium storage_medium
-    8: optional i64 version_count
+    8: optional list transaction_ids
+    9: optional i64 version_count
 }
 
 struct TFinishTaskRequest {
@@ -48,9 +49,10 @@ struct TFinishTaskRequest {
     8: optional i64 request_version
     9: optional i64 request_version_hash
     10: optional string snapshot_path
-    11: optional list snapshot_files
-    12: optional map> tablet_files
-    13: optional list downloaded_tablet_ids
+    11: optional list error_tablet_ids
+    12: optional list snapshot_files
+    13: optional map> tablet_files
+    14: optional list downloaded_tablet_ids
 }
 
 struct TTablet {
@@ -71,6 +73,7 @@ struct TReportRequest {
     3: optional map> tasks // string signature
     4: optional map tablets
     5: optional map disks // string root_path
+    6: optional bool force_recovery
 }
 
 struct TMasterResult {
diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift
index 3104a81207..fc7bc91cef 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -169,6 +169,7 @@ struct TPlanFragmentExecParams {
 
   // Id of this fragment in its role as a sender.
   9: optional i32 sender_id
+  10: optional i32 num_senders
 }
 
 // Global query parameters assigned by the coordinator.
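
The four FrontendService RPCs added above (loadTxnBegin, streamLoadPut, loadTxnCommit, loadTxnRollback) form the control path of a transactional stream load: begin a transaction for a label, ask the FE to plan the load, then commit or roll back. The following sketch illustrates how a client might drive that sequence with the Thrift-generated C++ stubs; it is not code from this patch. The FrontendServiceClient class and the __set_* setters are what standard Thrift C++ codegen would produce for these definitions, the TSocket/TBufferedTransport/TBinaryProtocol classes are the stock Thrift transport stack (older Thrift releases use boost::shared_ptr rather than std::shared_ptr), and the host, port, user, and label values are placeholders.

    // Sketch of the begin -> put -> commit control flow over the new FE RPCs.
    // Assumes the Thrift-generated C++ client and __set_* setters; host/port,
    // credentials, and the label are placeholders.
    #include <memory>
    #include <thrift/protocol/TBinaryProtocol.h>
    #include <thrift/transport/TSocket.h>
    #include <thrift/transport/TBufferTransports.h>
    #include "gen_cpp/FrontendService.h"

    using namespace apache::thrift;
    using namespace apache::thrift::protocol;
    using namespace apache::thrift::transport;

    void stream_load_once() {
        auto socket = std::make_shared<TSocket>("fe_host", 9020);
        auto transport = std::make_shared<TBufferedTransport>(socket);
        auto protocol = std::make_shared<TBinaryProtocol>(transport);
        palo::FrontendServiceClient client(protocol);
        transport->open();

        // 1. Begin a transaction for this load label.
        palo::TLoadTxnBeginRequest begin_req;
        begin_req.__set_user("root");
        begin_req.__set_passwd("");
        begin_req.__set_db("example_db");
        begin_req.__set_tbl("example_tbl");
        begin_req.__set_label("label_20181001_001");
        palo::TLoadTxnBeginResult begin_res;
        client.loadTxnBegin(begin_res, begin_req);

        // 2. Ask the FE to plan the load; the returned TExecPlanFragmentParams
        //    is what the BE executes to ingest the stream.
        palo::TUniqueId load_id;
        load_id.__set_hi(1);
        load_id.__set_lo(1);
        palo::TStreamLoadPutRequest put_req;
        put_req.__set_user("root");
        put_req.__set_passwd("");
        put_req.__set_db("example_db");
        put_req.__set_tbl("example_tbl");
        put_req.__set_loadId(load_id);
        put_req.__set_txnId(begin_res.txnId);
        put_req.__set_fileType(palo::TFileType::FILE_STREAM);
        put_req.__set_formatType(palo::TFileFormatType::FORMAT_CSV_PLAIN);
        palo::TStreamLoadPutResult put_res;
        client.streamLoadPut(put_res, put_req);

        // 3. Commit (or roll back) the transaction after the fragment finishes.
        palo::TLoadTxnCommitRequest commit_req;
        commit_req.__set_user("root");
        commit_req.__set_passwd("");
        commit_req.__set_db("example_db");
        commit_req.__set_tbl("example_tbl");
        commit_req.__set_txnId(begin_res.txnId);
        commit_req.__set_sync(true);
        palo::TLoadTxnCommitResult commit_res;
        client.loadTxnCommit(commit_res, commit_req);

        transport->close();
    }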
@@ -250,7 +251,6 @@ struct TCancelPlanFragmentResult {
 
 
 // TransmitData
-
 struct TTransmitDataParams {
   1: required PaloInternalServiceVersion protocol_version
 
@@ -285,6 +285,64 @@ struct TTransmitDataResult {
   4: optional Types.TPlanNodeId dest_node_id
 }
 
+struct TTabletWithPartition {
+    1: required i64 partition_id
+    2: required i64 tablet_id
+}
+
+// open a tablet writer
+struct TTabletWriterOpenParams {
+    1: required Types.TUniqueId id
+    2: required i64 index_id
+    3: required i64 txn_id
+    4: required Descriptors.TOlapTableSchemaParam schema
+    5: required list tablets
+
+    6: required i32 num_senders
+}
+
+struct TTabletWriterOpenResult {
+    1: required Status.TStatus status
+}
+
+// add a batch to the tablet writer
+struct TTabletWriterAddBatchParams {
+    1: required Types.TUniqueId id
+    2: required i64 index_id
+
+    3: required i64 packet_seq
+    4: required list tablet_ids
+    5: required Data.TRowBatch row_batch
+
+    6: required i32 sender_no
+}
+
+struct TTabletWriterAddBatchResult {
+    1: required Status.TStatus status
+}
+
+struct TTabletWriterCloseParams {
+    1: required Types.TUniqueId id
+    2: required i64 index_id
+
+    3: required i32 sender_no
+}
+
+struct TTabletWriterCloseResult {
+    1: required Status.TStatus status
+}
+
+// cancel a tablet writer
+struct TTabletWriterCancelParams {
+    1: required Types.TUniqueId id
+    2: required i64 index_id
+
+    3: required i32 sender_no
+}
+
+struct TTabletWriterCancelResult {
+}
+
 struct TFetchDataParams {
   1: required PaloInternalServiceVersion protocol_version
   // required in V1
diff --git a/gensrc/thrift/Partitions.thrift b/gensrc/thrift/Partitions.thrift
index 997879a774..c1922d6092 100644
--- a/gensrc/thrift/Partitions.thrift
+++ b/gensrc/thrift/Partitions.thrift
@@ -85,3 +85,4 @@ struct TDataPartition {
     3: optional list partition_infos
 }
 
+
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index baf75e0d25..838a662aaf 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -42,10 +42,11 @@ enum TPlanNodeType {
     META_SCAN_NODE,
     ANALYTIC_EVAL_NODE,
     OLAP_REWRITE_NODE,
-    KUDU_SCAN_NODE
-    BROKER_SCAN_NODE
-    EMPTY_SET_NODE
-    UNION_NODE
+    KUDU_SCAN_NODE,
+    BROKER_SCAN_NODE,
+    EMPTY_SET_NODE,
+    UNION_NODE,
+    ES_SCAN_NODE
 }
 
 // phases of an execution node
@@ -90,7 +91,8 @@ struct TPaloScanRange {
 }
 
 enum TFileFormatType {
-    FORMAT_CSV_PLAIN,
+    FORMAT_UNKNOWN = -1,
+    FORMAT_CSV_PLAIN = 0,
     FORMAT_CSV_GZ,
     FORMAT_CSV_LZO,
     FORMAT_CSV_BZ2,
@@ -109,6 +111,8 @@ struct TBrokerRangeDesc {
     5: required i64 start_offset;
     // Size of this range, if size = -1, this means that will read to then end of file
     6: required i64 size
+    // used to get the stream for this load
+    7: optional Types.TUniqueId load_id
 }
 
 struct TBrokerScanRangeParams {
@@ -143,13 +147,24 @@ struct TBrokerScanRange {
     3: required list broker_addresses
 }
 
+// ES scan range
+struct TEsScanRange {
+    1: required list es_hosts // ES hosts used by the BE scan node to connect to ES
+    // index and type have to be set here and cannot be set on the scan node,
+    // because a scan node may scan an ES alias that covers one or more indices
+    2: required string index
+    3: optional string type
+    4: required i32 shard_id
+}
+
 // Specification of an individual data range which is held in its entirety
 // by a storage server
 struct TScanRange {
   // one of these must be set for every TScanRange2
   4: optional TPaloScanRange palo_scan_range
   5: optional binary kudu_scan_token
-  6: optional TBrokerScanRange broker_scan_range
+  6: optional TBrokerScanRange broker_scan_range
+  7: optional TEsScanRange es_scan_range
 }
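
The TTabletWriter* structs added to PaloInternalService.thrift above describe the per-index writer protocol a transactional load sink goes through: open with the transaction id, the tablet list, and the total number of senders; stream row batches tagged with an increasing packet_seq and the sender's sender_no; then close (or cancel) so the receiver can seal the tablets once all num_senders have finished. The sketch below only illustrates how one sender might populate these structs; the __set_* setters are standard Thrift C++ codegen, and send_rpc() is a hypothetical transport helper that is not part of this patch.

    // Sketch of one sender driving the tablet writer protocol for a single index.
    // send_rpc() is a hypothetical helper; tablet_ids[i] is assumed to be the
    // destination tablet of row i in the corresponding batch.
    #include <cstdint>
    #include <vector>
    #include "gen_cpp/PaloInternalService_types.h"

    void write_index(const palo::TUniqueId& load_id, int64_t index_id, int64_t txn_id,
                     const palo::TOlapTableSchemaParam& schema,
                     const std::vector<palo::TTabletWithPartition>& tablets,
                     const std::vector<palo::TRowBatch>& batches,
                     const std::vector<int64_t>& tablet_ids,
                     int32_t num_senders, int32_t sender_no) {
        // 1. Open: tell the receiving BE which tablets of this index will be
        //    written under txn_id and how many senders it should wait for.
        palo::TTabletWriterOpenParams open_params;
        open_params.__set_id(load_id);
        open_params.__set_index_id(index_id);
        open_params.__set_txn_id(txn_id);
        open_params.__set_schema(schema);
        open_params.__set_tablets(tablets);
        open_params.__set_num_senders(num_senders);
        send_rpc(open_params);

        // 2. Add batches: packet_seq lets the receiver drop duplicated or
        //    out-of-order packets from this sender.
        int64_t packet_seq = 0;
        for (const auto& batch : batches) {
            palo::TTabletWriterAddBatchParams add_params;
            add_params.__set_id(load_id);
            add_params.__set_index_id(index_id);
            add_params.__set_packet_seq(packet_seq++);
            add_params.__set_tablet_ids(tablet_ids);
            add_params.__set_row_batch(batch);
            add_params.__set_sender_no(sender_no);
            send_rpc(add_params);
        }

        // 3. Close: the writer seals its tablets only after all num_senders
        //    senders have sent their close.
        palo::TTabletWriterCloseParams close_params;
        close_params.__set_id(load_id);
        close_params.__set_index_id(index_id);
        close_params.__set_sender_no(sender_no);
        send_rpc(close_params);
    }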
struct TMySQLScanNode { @@ -167,6 +182,11 @@ struct TBrokerScanNode { 3: optional list partition_infos } +struct TEsScanNode { + 1: required Types.TTupleId tuple_id + 2: optional map properties +} + struct TMiniLoadEtlFunction { 1: required string function_name 2: required i32 param_column_index @@ -550,6 +570,7 @@ struct TPlanNode { 27: optional TKuduScanNode kudu_scan_node 28: optional TUnionNode union_node 29: optional TBackendResourceProfile resource_profile + 30: optional TEsScanNode es_scan_node } // A flattened representation of a tree of PlanNodes, obtained by depth-first diff --git a/gensrc/thrift/Status.thrift b/gensrc/thrift/Status.thrift index b92be6ef39..c3afbf25a4 100644 --- a/gensrc/thrift/Status.thrift +++ b/gensrc/thrift/Status.thrift @@ -22,20 +22,23 @@ namespace cpp palo namespace java com.baidu.palo.thrift enum TStatusCode { - OK, - CANCELLED, - ANALYSIS_ERROR, - NOT_IMPLEMENTED_ERROR, - RUNTIME_ERROR, - MEM_LIMIT_EXCEEDED, - INTERNAL_ERROR, - THRIFT_RPC_ERROR, - TIMEOUT, - KUDU_NOT_ENABLED, - KUDU_NOT_SUPPORTED_ON_OS, - MEM_ALLOC_FAILED, - BUFFER_ALLOCATION_FAILED, - MINIMUM_RESERVATION_UNAVAILABLE + OK, + CANCELLED, + ANALYSIS_ERROR, + NOT_IMPLEMENTED_ERROR, + RUNTIME_ERROR, + MEM_LIMIT_EXCEEDED, + INTERNAL_ERROR, + THRIFT_RPC_ERROR, + TIMEOUT, + KUDU_NOT_ENABLED, + KUDU_NOT_SUPPORTED_ON_OS, + MEM_ALLOC_FAILED, + BUFFER_ALLOCATION_FAILED, + MINIMUM_RESERVATION_UNAVAILABLE, + PUBLISH_TIMEOUT, + LABEL_ALREADY_EXISTS, + DATA_QUALITY_ERROR, } struct TStatus { diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift index 69411a8993..d728b66502 100644 --- a/gensrc/thrift/Types.thrift +++ b/gensrc/thrift/Types.thrift @@ -36,6 +36,10 @@ typedef i64 TSize typedef i32 TClusterId typedef i64 TEpoch +// add for real time load, partitionid is not defined previously, define it here +typedef i64 TTransactionId +typedef i64 TPartitionId + enum TStorageType { ROW, COLUMN, @@ -150,6 +154,11 @@ enum TTaskType { DOWNLOAD, CLEAR_REMOTE_FILE, MOVE + REALTIME_PUSH, + PUBLISH_VERSION, + CLEAR_ALTER_TASK, + CLEAR_TRANSACTION_TASK, + RECOVER_TABLET } enum TStmtType { @@ -300,7 +309,8 @@ enum TTableType { OLAP_TABLE, SCHEMA_TABLE, KUDU_TABLE, - BROKER_TABLE + BROKER_TABLE, + ES_TABLE } enum TKeysType { @@ -336,6 +346,11 @@ enum TExportState { enum TFileType { FILE_LOCAL, FILE_BROKER, + FILE_STREAM, // file content is streaming in the buffer } +struct TTabletCommitInfo { + 1: required i64 tabletId + 2: required i64 backendId +} diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh index 4c7102e5af..4908e8f249 100755 --- a/thirdparty/build-thirdparty.sh +++ b/thirdparty/build-thirdparty.sh @@ -36,6 +36,7 @@ if [ ! 
-f $curdir/vars.sh ]; then fi export DORIS_HOME=$curdir/../ +export GCC_HOME=$curdir/../palo-toolchain/gcc730 export TP_DIR=$curdir source $curdir/vars.sh @@ -155,7 +156,7 @@ build_libevent() { CFLAGS="-std=c99 -fPIC -D_BSD_SOURCE -fno-omit-frame-pointer -g -ggdb -O2 -I${TP_INCLUDE_DIR}" \ LDFLAGS="-L${TP_LIB_DIR}" \ ./configure --prefix=$TP_INSTALL_DIR --enable-shared=no --disable-samples - make -j2 && make install + make -j$PARALLEL && make install } build_openssl() { @@ -166,8 +167,14 @@ build_openssl() { CXXFLAGS="-I${TP_INCLUDE_DIR} -fPIC" \ LDFLAGS="-L${TP_LIB_DIR}" \ CFLAGS="-fPIC" \ + LIBDIR="lib" \ ./Configure --prefix=$TP_INSTALL_DIR -zlib -shared linux-x86_64 - make -j2 && make install + make -j$PARALLEL && make install + if [ -f $TP_INSTALL_DIR/lib64/libcrypto.a ]; then + mkdir -p $TP_INSTALL_DIR/lib && \ + ln -s $TP_INSTALL_DIR/lib64/libcrypto.a $TP_INSTALL_DIR/lib/libcrypto.a && \ + ln -s $TP_INSTALL_DIR/lib64/libssl.a $TP_INSTALL_DIR/lib/libssl.a + fi } # thrift @@ -191,7 +198,7 @@ build_thrift() { mv compiler/cpp/thrifty.hh compiler/cpp/thrifty.h fi - make -j2 && make install + make -j$PARALLEL && make install } # llvm @@ -474,17 +481,29 @@ build_jdk() { export JAVA_HOME=$TP_INSTALL_DIR/$JDK_SOURCE } -# ant -build_ant() { - check_if_source_exist $ANT_SOURCE +# rocksdb +build_rocksdb() { + check_if_source_exist $ROCKSDB_SOURCE - if [ -d $TP_INSTALL_DIR/$ANT_SOURCE ];then - echo "$ANT_SOURCE already installed" - else - cp -rf $TP_SOURCE_DIR/$ANT_SOURCE $TP_INSTALL_DIR/ant - fi + cd $TP_SOURCE_DIR/$ROCKSDB_SOURCE - export ANT_HOME=$TP_INSTALL_DIR/ant + CFLAGS="-I ${TP_INCLUDE_DIR} -I ${TP_INCLUDE_DIR}/snappy -I ${TP_INCLUDE_DIR}/lz4" CXXFLAGS="-fPIC" LDFLAGS="-static-libstdc++ -static-libgcc" \ + make -j$PARALLEL static_lib + cp librocksdb.a ../../installed/lib/librocksdb.a + cp -r include/rocksdb ../../installed/include/ +} + +# librdkafka +build_librdkafka() { + check_if_source_exist $LIBRDKAFKA_SOURCE + + cd $TP_SOURCE_DIR/$LIBRDKAFKA_SOURCE + + CPPFLAGS="-I${TP_INCLUDE_DIR}" \ + LDFLAGS="-L${TP_LIB_DIR}" + CFLAGS="-fPIC" \ + ./configure --prefix=$TP_INSTALL_DIR --enable-static + make -j$PARALLEL && make install } build_llvm @@ -509,6 +528,7 @@ build_thrift build_leveldb build_brpc build_jdk -build_ant +build_rocksdb +build_librdkafka echo "Finihsed to build all thirdparties" diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh index 1a81c40dea..79fe989197 100755 --- a/thirdparty/download-thirdparty.sh +++ b/thirdparty/download-thirdparty.sh @@ -232,14 +232,27 @@ cd - echo "Finished patching $LZ4_SOURCE" ##################################### -# Copy java libraries +# Download and unpack java libraries ##################################### -echo "Begin to copy java libraries" +if test "x$REPOSITORY_URL" != x; then + echo "===== Downloading java libraries..." + cd $TP_DIR + DOWNLOAD_URL="${REPOSITORY_URL}/java-libraries.tar.gz" + wget --no-check-certificate $DOWNLOAD_URL + cd - + echo "===== Finish downloading java libraries" +fi + +echo "Begin to unpack java libraries" +if [ ! 
-f $TP_DIR/java-libraries.tar.gz ];then + echo "java-libraries.tar.gz is mising" + exit 1 +fi rm -rf $TP_JAR_DIR/* mkdir -p $TP_JAR_DIR/ -cp -R $TP_DIR/java-libraries/* $TP_JAR_DIR/ -echo "Finish to copy java libraries" +tar xzf $TP_DIR/java-libraries.tar.gz -C $TP_JAR_DIR/ +echo "Finish to unpack java libraries" diff --git a/thirdparty/java-libraries/cglib-nodep-2.2.2.jar b/thirdparty/java-libraries/cglib-nodep-2.2.2.jar deleted file mode 100644 index 02d81e8804..0000000000 Binary files a/thirdparty/java-libraries/cglib-nodep-2.2.2.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/asm-5.0.1.jar b/thirdparty/java-libraries/cobertura/asm-5.0.1.jar deleted file mode 100644 index eeb3bc6f98..0000000000 Binary files a/thirdparty/java-libraries/cobertura/asm-5.0.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/asm-analysis-5.0.1.jar b/thirdparty/java-libraries/cobertura/asm-analysis-5.0.1.jar deleted file mode 100644 index af4a001085..0000000000 Binary files a/thirdparty/java-libraries/cobertura/asm-analysis-5.0.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/asm-commons-5.0.1.jar b/thirdparty/java-libraries/cobertura/asm-commons-5.0.1.jar deleted file mode 100644 index b1f76966a7..0000000000 Binary files a/thirdparty/java-libraries/cobertura/asm-commons-5.0.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/asm-tree-5.0.1.jar b/thirdparty/java-libraries/cobertura/asm-tree-5.0.1.jar deleted file mode 100644 index 3b1a346115..0000000000 Binary files a/thirdparty/java-libraries/cobertura/asm-tree-5.0.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/asm-util-5.0.1.jar b/thirdparty/java-libraries/cobertura/asm-util-5.0.1.jar deleted file mode 100644 index fb8d282291..0000000000 Binary files a/thirdparty/java-libraries/cobertura/asm-util-5.0.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/cobertura-2.1.1-javadoc.jar b/thirdparty/java-libraries/cobertura/cobertura-2.1.1-javadoc.jar deleted file mode 100644 index 8f1b58ab3d..0000000000 Binary files a/thirdparty/java-libraries/cobertura/cobertura-2.1.1-javadoc.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/cobertura-2.1.1-sources.jar b/thirdparty/java-libraries/cobertura/cobertura-2.1.1-sources.jar deleted file mode 100644 index c05ff01c44..0000000000 Binary files a/thirdparty/java-libraries/cobertura/cobertura-2.1.1-sources.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/cobertura-2.1.1.jar b/thirdparty/java-libraries/cobertura/cobertura-2.1.1.jar deleted file mode 100644 index d04676ad08..0000000000 Binary files a/thirdparty/java-libraries/cobertura/cobertura-2.1.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/commons-lang3-3.3.2.jar b/thirdparty/java-libraries/cobertura/commons-lang3-3.3.2.jar deleted file mode 100644 index 2ce08ae99d..0000000000 Binary files a/thirdparty/java-libraries/cobertura/commons-lang3-3.3.2.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/hamcrest-core-1.3.jar b/thirdparty/java-libraries/cobertura/hamcrest-core-1.3.jar deleted file mode 100644 index 9d5fe16e3d..0000000000 Binary files a/thirdparty/java-libraries/cobertura/hamcrest-core-1.3.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/jaxen-1.1.4.jar b/thirdparty/java-libraries/cobertura/jaxen-1.1.4.jar deleted file mode 100644 index c2016095f0..0000000000 Binary files 
a/thirdparty/java-libraries/cobertura/jaxen-1.1.4.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/jetty-6.1.14.jar b/thirdparty/java-libraries/cobertura/jetty-6.1.14.jar deleted file mode 100644 index 8c503bea21..0000000000 Binary files a/thirdparty/java-libraries/cobertura/jetty-6.1.14.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/jetty-util-6.1.14.jar b/thirdparty/java-libraries/cobertura/jetty-util-6.1.14.jar deleted file mode 100644 index 8f924bb147..0000000000 Binary files a/thirdparty/java-libraries/cobertura/jetty-util-6.1.14.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/logback-classic-1.0.13.jar b/thirdparty/java-libraries/cobertura/logback-classic-1.0.13.jar deleted file mode 100644 index 80bf5d15a2..0000000000 Binary files a/thirdparty/java-libraries/cobertura/logback-classic-1.0.13.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/logback-core-1.0.13.jar b/thirdparty/java-libraries/cobertura/logback-core-1.0.13.jar deleted file mode 100644 index 568ccfaae5..0000000000 Binary files a/thirdparty/java-libraries/cobertura/logback-core-1.0.13.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/oro-2.0.8.jar b/thirdparty/java-libraries/cobertura/oro-2.0.8.jar deleted file mode 100644 index 23488d2600..0000000000 Binary files a/thirdparty/java-libraries/cobertura/oro-2.0.8.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/servlet-api-2.5-6.1.14.jar b/thirdparty/java-libraries/cobertura/servlet-api-2.5-6.1.14.jar deleted file mode 100644 index 6d7404fb72..0000000000 Binary files a/thirdparty/java-libraries/cobertura/servlet-api-2.5-6.1.14.jar and /dev/null differ diff --git a/thirdparty/java-libraries/cobertura/slf4j-api-1.7.5.jar b/thirdparty/java-libraries/cobertura/slf4j-api-1.7.5.jar deleted file mode 100644 index 8f004d3906..0000000000 Binary files a/thirdparty/java-libraries/cobertura/slf4j-api-1.7.5.jar and /dev/null differ diff --git a/thirdparty/java-libraries/commons-cli-2.0-SNAPSHOT.jar b/thirdparty/java-libraries/commons-cli-2.0-SNAPSHOT.jar deleted file mode 100644 index 0b1d51072a..0000000000 Binary files a/thirdparty/java-libraries/commons-cli-2.0-SNAPSHOT.jar and /dev/null differ diff --git a/thirdparty/java-libraries/commons-codec-1.9.jar b/thirdparty/java-libraries/commons-codec-1.9.jar deleted file mode 100644 index ef35f1c50d..0000000000 Binary files a/thirdparty/java-libraries/commons-codec-1.9.jar and /dev/null differ diff --git a/thirdparty/java-libraries/commons-lang-2.4.jar b/thirdparty/java-libraries/commons-lang-2.4.jar deleted file mode 100644 index 532939ecab..0000000000 Binary files a/thirdparty/java-libraries/commons-lang-2.4.jar and /dev/null differ diff --git a/thirdparty/java-libraries/commons-pool2-2.2.jar b/thirdparty/java-libraries/commons-pool2-2.2.jar deleted file mode 100644 index 6a3eb38acb..0000000000 Binary files a/thirdparty/java-libraries/commons-pool2-2.2.jar and /dev/null differ diff --git a/thirdparty/java-libraries/commons-validator-1.4.1.jar b/thirdparty/java-libraries/commons-validator-1.4.1.jar deleted file mode 100644 index 95c951e050..0000000000 Binary files a/thirdparty/java-libraries/commons-validator-1.4.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/easymock-3.2.jar b/thirdparty/java-libraries/easymock-3.2.jar deleted file mode 100644 index 7c439f82c1..0000000000 Binary files a/thirdparty/java-libraries/easymock-3.2.jar and /dev/null differ diff --git 
a/thirdparty/java-libraries/gson-2.2.4.jar b/thirdparty/java-libraries/gson-2.2.4.jar deleted file mode 100644 index 75fe27c547..0000000000 Binary files a/thirdparty/java-libraries/gson-2.2.4.jar and /dev/null differ diff --git a/thirdparty/java-libraries/guava-15.0.jar b/thirdparty/java-libraries/guava-15.0.jar deleted file mode 100644 index eb9ef8ad5f..0000000000 Binary files a/thirdparty/java-libraries/guava-15.0.jar and /dev/null differ diff --git a/thirdparty/java-libraries/hamcrest-core-1.1.jar b/thirdparty/java-libraries/hamcrest-core-1.1.jar deleted file mode 100644 index e5149be76e..0000000000 Binary files a/thirdparty/java-libraries/hamcrest-core-1.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/jackson-core-asl-1.8.1.jar b/thirdparty/java-libraries/jackson-core-asl-1.8.1.jar deleted file mode 100644 index 13574a4ed9..0000000000 Binary files a/thirdparty/java-libraries/jackson-core-asl-1.8.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/jackson-mapper-asl-1.8.1.jar b/thirdparty/java-libraries/jackson-mapper-asl-1.8.1.jar deleted file mode 100644 index 83b684038b..0000000000 Binary files a/thirdparty/java-libraries/jackson-mapper-asl-1.8.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/java-cup-0.11a.jar b/thirdparty/java-libraries/java-cup-0.11a.jar deleted file mode 100644 index 1d84620d22..0000000000 Binary files a/thirdparty/java-libraries/java-cup-0.11a.jar and /dev/null differ diff --git a/thirdparty/java-libraries/javassist-3.18.2-GA.jar b/thirdparty/java-libraries/javassist-3.18.2-GA.jar deleted file mode 100644 index c8761c8efe..0000000000 Binary files a/thirdparty/java-libraries/javassist-3.18.2-GA.jar and /dev/null differ diff --git a/thirdparty/java-libraries/javax.servlet-api-3.0.1.jar b/thirdparty/java-libraries/javax.servlet-api-3.0.1.jar deleted file mode 100644 index 4e2edcc9df..0000000000 Binary files a/thirdparty/java-libraries/javax.servlet-api-3.0.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/je-7.3.7.jar b/thirdparty/java-libraries/je-7.3.7.jar deleted file mode 100644 index dc74e5bc9f..0000000000 Binary files a/thirdparty/java-libraries/je-7.3.7.jar and /dev/null differ diff --git a/thirdparty/java-libraries/jetty-6.1.14.jar b/thirdparty/java-libraries/jetty-6.1.14.jar deleted file mode 100644 index 8c503bea21..0000000000 Binary files a/thirdparty/java-libraries/jetty-6.1.14.jar and /dev/null differ diff --git a/thirdparty/java-libraries/jetty-util-6.1.14.jar b/thirdparty/java-libraries/jetty-util-6.1.14.jar deleted file mode 100644 index 8f924bb147..0000000000 Binary files a/thirdparty/java-libraries/jetty-util-6.1.14.jar and /dev/null differ diff --git a/thirdparty/java-libraries/jflex-1.4.3.jar b/thirdparty/java-libraries/jflex-1.4.3.jar deleted file mode 100644 index 8e952a3573..0000000000 Binary files a/thirdparty/java-libraries/jflex-1.4.3.jar and /dev/null differ diff --git a/thirdparty/java-libraries/jmockit-1.38.jar b/thirdparty/java-libraries/jmockit-1.38.jar deleted file mode 100644 index fe1f81685c..0000000000 Binary files a/thirdparty/java-libraries/jmockit-1.38.jar and /dev/null differ diff --git a/thirdparty/java-libraries/jprotobuf-1.11.0.jar b/thirdparty/java-libraries/jprotobuf-1.11.0.jar deleted file mode 100644 index 76ffbd4214..0000000000 Binary files a/thirdparty/java-libraries/jprotobuf-1.11.0.jar and /dev/null differ diff --git a/thirdparty/java-libraries/jprotobuf-rpc-common-1.8.jar b/thirdparty/java-libraries/jprotobuf-rpc-common-1.8.jar deleted file mode 
100644 index da258c8597..0000000000 Binary files a/thirdparty/java-libraries/jprotobuf-rpc-common-1.8.jar and /dev/null differ diff --git a/thirdparty/java-libraries/jprotobuf-rpc-core-3.5.17.jar b/thirdparty/java-libraries/jprotobuf-rpc-core-3.5.17.jar deleted file mode 100644 index c1e574ae05..0000000000 Binary files a/thirdparty/java-libraries/jprotobuf-rpc-core-3.5.17.jar and /dev/null differ diff --git a/thirdparty/java-libraries/json-20171018.jar b/thirdparty/java-libraries/json-20171018.jar deleted file mode 100644 index cad06581d9..0000000000 Binary files a/thirdparty/java-libraries/json-20171018.jar and /dev/null differ diff --git a/thirdparty/java-libraries/junit-4.12.jar b/thirdparty/java-libraries/junit-4.12.jar deleted file mode 100644 index 3a7fc266c3..0000000000 Binary files a/thirdparty/java-libraries/junit-4.12.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/automaton-1.11-8.jar b/thirdparty/java-libraries/k8s-client/automaton-1.11-8.jar deleted file mode 100644 index 60f6ce01e1..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/automaton-1.11-8.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/generex-1.0.1.jar b/thirdparty/java-libraries/k8s-client/generex-1.0.1.jar deleted file mode 100644 index 982bd1e2d0..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/generex-1.0.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/jackson-annotations-2.7.0.jar b/thirdparty/java-libraries/k8s-client/jackson-annotations-2.7.0.jar deleted file mode 100644 index 96fb17cf4f..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/jackson-annotations-2.7.0.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/jackson-core-2.7.5.jar b/thirdparty/java-libraries/k8s-client/jackson-core-2.7.5.jar deleted file mode 100644 index 307df1eabf..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/jackson-core-2.7.5.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/jackson-databind-2.7.5.jar b/thirdparty/java-libraries/k8s-client/jackson-databind-2.7.5.jar deleted file mode 100644 index d337ea5437..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/jackson-databind-2.7.5.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/jackson-dataformat-yaml-2.7.5.jar b/thirdparty/java-libraries/k8s-client/jackson-dataformat-yaml-2.7.5.jar deleted file mode 100644 index e9c5d9e4fa..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/jackson-dataformat-yaml-2.7.5.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/jackson-module-jaxb-annotations-2.7.5.jar b/thirdparty/java-libraries/k8s-client/jackson-module-jaxb-annotations-2.7.5.jar deleted file mode 100644 index 136ddf3387..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/jackson-module-jaxb-annotations-2.7.5.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/kubernetes-client-1.4.27.jar b/thirdparty/java-libraries/k8s-client/kubernetes-client-1.4.27.jar deleted file mode 100644 index 5af36e06a0..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/kubernetes-client-1.4.27.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/kubernetes-model-1.0.64.jar b/thirdparty/java-libraries/k8s-client/kubernetes-model-1.0.64.jar deleted file mode 100644 index 75d3bab687..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/kubernetes-model-1.0.64.jar and 
/dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/logging-interceptor-3.4.1.jar b/thirdparty/java-libraries/k8s-client/logging-interceptor-3.4.1.jar deleted file mode 100644 index 4bea8311b6..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/logging-interceptor-3.4.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/okhttp-3.4.1.jar b/thirdparty/java-libraries/k8s-client/okhttp-3.4.1.jar deleted file mode 100644 index e31f248628..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/okhttp-3.4.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/okhttp-ws-3.4.1.jar b/thirdparty/java-libraries/k8s-client/okhttp-ws-3.4.1.jar deleted file mode 100644 index 60a863f29e..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/okhttp-ws-3.4.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/okio-1.9.0.jar b/thirdparty/java-libraries/k8s-client/okio-1.9.0.jar deleted file mode 100644 index 3c42b934b5..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/okio-1.9.0.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/snakeyaml-1.15.jar b/thirdparty/java-libraries/k8s-client/snakeyaml-1.15.jar deleted file mode 100644 index 34084e3325..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/snakeyaml-1.15.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/validation-api-1.1.0.Final.jar b/thirdparty/java-libraries/k8s-client/validation-api-1.1.0.Final.jar deleted file mode 100644 index de85403868..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/validation-api-1.1.0.Final.jar and /dev/null differ diff --git a/thirdparty/java-libraries/k8s-client/zjsonpatch-0.2.3.jar b/thirdparty/java-libraries/k8s-client/zjsonpatch-0.2.3.jar deleted file mode 100644 index 3467511c03..0000000000 Binary files a/thirdparty/java-libraries/k8s-client/zjsonpatch-0.2.3.jar and /dev/null differ diff --git a/thirdparty/java-libraries/kudu-client/async-1.1.0.jar b/thirdparty/java-libraries/kudu-client/async-1.1.0.jar deleted file mode 100755 index ac7a4cd928..0000000000 Binary files a/thirdparty/java-libraries/kudu-client/async-1.1.0.jar and /dev/null differ diff --git a/thirdparty/java-libraries/kudu-client/interface-annotations-1.2.0-cdh5.10.0.jar b/thirdparty/java-libraries/kudu-client/interface-annotations-1.2.0-cdh5.10.0.jar deleted file mode 100755 index 07ece4af66..0000000000 Binary files a/thirdparty/java-libraries/kudu-client/interface-annotations-1.2.0-cdh5.10.0.jar and /dev/null differ diff --git a/thirdparty/java-libraries/kudu-client/kudu-client-1.2.0-cdh5.10.0.jar b/thirdparty/java-libraries/kudu-client/kudu-client-1.2.0-cdh5.10.0.jar deleted file mode 100644 index c7844f6bb9..0000000000 Binary files a/thirdparty/java-libraries/kudu-client/kudu-client-1.2.0-cdh5.10.0.jar and /dev/null differ diff --git a/thirdparty/java-libraries/kudu-client/log4j-1.2.17.jar b/thirdparty/java-libraries/kudu-client/log4j-1.2.17.jar deleted file mode 100644 index 1d425cf7d7..0000000000 Binary files a/thirdparty/java-libraries/kudu-client/log4j-1.2.17.jar and /dev/null differ diff --git a/thirdparty/java-libraries/kudu-client/slf4j-api-1.7.5.jar b/thirdparty/java-libraries/kudu-client/slf4j-api-1.7.5.jar deleted file mode 100755 index 8f004d3906..0000000000 Binary files a/thirdparty/java-libraries/kudu-client/slf4j-api-1.7.5.jar and /dev/null differ diff --git a/thirdparty/java-libraries/libfb303-0.9.3.jar 
b/thirdparty/java-libraries/libfb303-0.9.3.jar deleted file mode 100644 index 07c711b738..0000000000 Binary files a/thirdparty/java-libraries/libfb303-0.9.3.jar and /dev/null differ diff --git a/thirdparty/java-libraries/libthrift-0.9.3.jar b/thirdparty/java-libraries/libthrift-0.9.3.jar deleted file mode 100644 index f9221a9f95..0000000000 Binary files a/thirdparty/java-libraries/libthrift-0.9.3.jar and /dev/null differ diff --git a/thirdparty/java-libraries/log4j-api-2.2.jar b/thirdparty/java-libraries/log4j-api-2.2.jar deleted file mode 100644 index 21bbcad489..0000000000 Binary files a/thirdparty/java-libraries/log4j-api-2.2.jar and /dev/null differ diff --git a/thirdparty/java-libraries/log4j-core-2.2.jar b/thirdparty/java-libraries/log4j-core-2.2.jar deleted file mode 100644 index a8279f7836..0000000000 Binary files a/thirdparty/java-libraries/log4j-core-2.2.jar and /dev/null differ diff --git a/thirdparty/java-libraries/log4j-slf4j-impl-2.2.jar b/thirdparty/java-libraries/log4j-slf4j-impl-2.2.jar deleted file mode 100644 index 658e53045e..0000000000 Binary files a/thirdparty/java-libraries/log4j-slf4j-impl-2.2.jar and /dev/null differ diff --git a/thirdparty/java-libraries/metrics-core-4.0.2.jar b/thirdparty/java-libraries/metrics-core-4.0.2.jar deleted file mode 100644 index 2001fe9f2b..0000000000 Binary files a/thirdparty/java-libraries/metrics-core-4.0.2.jar and /dev/null differ diff --git a/thirdparty/java-libraries/mysql-connector-java-5.1.41.jar b/thirdparty/java-libraries/mysql-connector-java-5.1.41.jar deleted file mode 100644 index cdf74884d8..0000000000 Binary files a/thirdparty/java-libraries/mysql-connector-java-5.1.41.jar and /dev/null differ diff --git a/thirdparty/java-libraries/netty-all-4.1.25.Final.jar b/thirdparty/java-libraries/netty-all-4.1.25.Final.jar deleted file mode 100644 index 086463deb5..0000000000 Binary files a/thirdparty/java-libraries/netty-all-4.1.25.Final.jar and /dev/null differ diff --git a/thirdparty/java-libraries/objenesis-2.1.jar b/thirdparty/java-libraries/objenesis-2.1.jar deleted file mode 100644 index 7c1e983798..0000000000 Binary files a/thirdparty/java-libraries/objenesis-2.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/powermock-easymock-1.6.1-full.jar b/thirdparty/java-libraries/powermock-easymock-1.6.1-full.jar deleted file mode 100644 index e5c7424c66..0000000000 Binary files a/thirdparty/java-libraries/powermock-easymock-1.6.1-full.jar and /dev/null differ diff --git a/thirdparty/java-libraries/protobuf-java-2.5.0.jar b/thirdparty/java-libraries/protobuf-java-2.5.0.jar deleted file mode 100644 index 4c4e686a49..0000000000 Binary files a/thirdparty/java-libraries/protobuf-java-2.5.0.jar and /dev/null differ diff --git a/thirdparty/java-libraries/protoparser-3.1.5.jar b/thirdparty/java-libraries/protoparser-3.1.5.jar deleted file mode 100644 index b5bdc2b9ea..0000000000 Binary files a/thirdparty/java-libraries/protoparser-3.1.5.jar and /dev/null differ diff --git a/thirdparty/java-libraries/servlet-api.jar b/thirdparty/java-libraries/servlet-api.jar deleted file mode 100644 index c9dab30f94..0000000000 Binary files a/thirdparty/java-libraries/servlet-api.jar and /dev/null differ diff --git a/thirdparty/java-libraries/slf4j-api-1.6.1.jar b/thirdparty/java-libraries/slf4j-api-1.6.1.jar deleted file mode 100644 index f1f4fdd214..0000000000 Binary files a/thirdparty/java-libraries/slf4j-api-1.6.1.jar and /dev/null differ diff --git a/thirdparty/java-libraries/snappy-java-1.1.7.2.jar 
b/thirdparty/java-libraries/snappy-java-1.1.7.2.jar deleted file mode 100644 index da8dd434ab..0000000000 Binary files a/thirdparty/java-libraries/snappy-java-1.1.7.2.jar and /dev/null differ diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh index a52898ffe8..ee9a49e0b2 100644 --- a/thirdparty/vars.sh +++ b/thirdparty/vars.sh @@ -139,11 +139,6 @@ RAPIDJSON_DOWNLOAD="https://github.com/miloyip/rapidjson/archive/v1.1.0.tar.gz" RAPIDJSON_NAME=rapidjson-1.1.0.tar.gz RAPIDJSON_SOURCE=rapidjson-1.1.0 -# ncurses -NCURSES_DOWNLOAD="https://ftp.gnu.org/gnu/ncurses/ncurses-6.0.tar.gz" -NCURSES_NAME=ncurses-6.0.tar.gz -NCURSES_SOURCE=ncurses-6.0 - # curl CURL_DOWNLOAD="https://curl.haxx.se/download/curl-7.54.0.tar.gz" CURL_NAME=curl-7.54.0.tar.gz @@ -184,10 +179,15 @@ JDK_DOWNLOAD="http://mirror.cnop.net/jdk/linux/jdk-8u131-linux-x64.tar.gz" JDK_NAME=jdk-8u131-linux-x64.tar.gz JDK_SOURCE=jdk1.8.0_131 -# ant -ANT_DOWNLOAD="https://archive.apache.org/dist/ant/binaries/apache-ant-1.7.0-bin.tar.gz" -ANT_NAME=apache-ant-1.7.0-bin.tar.gz -ANT_SOURCE=apache-ant-1.7.0 +# rocksdb +ROCKSDB_DOWNLOAD="https://github.com/facebook/rocksdb/archive/v5.14.2.tar.gz" +ROCKSDB_NAME=rocksdb-5.14.2.tar.gz +ROCKSDB_SOURCE=rocksdb-5.14.2 + +# librdkafka +LIBRDKAFKA_DOWNLOAD="https://github.com/edenhill/librdkafka/archive/v0.11.6-RC5.tar.gz" +LIBRDKAFKA_NAME=librdkafka-0.11.6-RC5.tar.gz +LIBRDKAFKA_SOURCE=librdkafka-0.11.6-RC5 # all thirdparties which need to be downloaded is set in array TP_ARCHIVES -export TP_ARCHIVES="LIBEVENT OPENSSL THRIFT LLVM CLANG COMPILER_RT PROTOBUF GFLAGS GLOG GTEST RAPIDJSON SNAPPY GPERFTOOLS ZLIB LZ4 BZIP LZO2 NCURSES CURL RE2 BOOST MYSQL BOOST_FOR_MYSQL LEVELDB BRPC JDK ANT" +export TP_ARCHIVES="LIBEVENT OPENSSL THRIFT LLVM CLANG COMPILER_RT PROTOBUF GFLAGS GLOG GTEST RAPIDJSON SNAPPY GPERFTOOLS ZLIB LZ4 BZIP LZO2 CURL RE2 BOOST MYSQL BOOST_FOR_MYSQL LEVELDB BRPC JDK ROCKSDB LIBRDKAFKA"
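
vars.sh now swaps ant out for rocksdb and librdkafka, matching the new imported targets in be/CMakeLists.txt. A quick way to confirm that the freshly built librocksdb.a links correctly is a smoke test against the standard RocksDB C++ API; the sketch below is such a check, with the database path and key purely placeholders (this patch itself does not contain this code).

    // Smoke test for the newly linked librocksdb.a: open a DB, write and read
    // back one key. The path and key/value strings are placeholders.
    #include <cassert>
    #include <string>
    #include <rocksdb/db.h>
    #include <rocksdb/options.h>

    int main() {
        rocksdb::Options options;
        options.create_if_missing = true;

        rocksdb::DB* db = nullptr;
        rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_smoke_test", &db);
        assert(s.ok());

        s = db->Put(rocksdb::WriteOptions(), "txn_1001", "PREPARED");
        assert(s.ok());

        std::string value;
        s = db->Get(rocksdb::ReadOptions(), "txn_1001", &value);
        assert(s.ok() && value == "PREPARED");

        delete db;
        return 0;
    }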