// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "agent/agent_server.h"
#include <string>
#include "boost/filesystem.hpp"
#include "boost/lexical_cast.hpp"
#include "thrift/concurrency/ThreadManager.h"
#include "thrift/concurrency/PosixThreadFactory.h"
#include "thrift/server/TThreadPoolServer.h"
#include "thrift/server/TThreadedServer.h"
#include "thrift/transport/TSocket.h"
#include "thrift/transport/TTransportUtils.h"
#include "util/thrift_server.h"
#include "agent/status.h"
#include "agent/task_worker_pool.h"
#include "agent/user_resource_listener.h"
#include "common/status.h"
#include "common/logging.h"
#include "gen_cpp/AgentService_types.h"
#include "gen_cpp/MasterService_types.h"
#include "gen_cpp/Status_types.h"
#include "olap/utils.h"
#include "olap/command_executor.h"
#include "runtime/exec_env.h"
#include "runtime/etl_job_mgr.h"
#include "util/debug_util.h"
using apache::thrift::transport::TProcessor;
using std::deque;
using std::list;
using std::map;
using std::nothrow;
using std::set;
using std::string;
using std::to_string;
using std::vector;
namespace palo {
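// AgentServer accepts tasks from the FE master and dispatches them to
// per-type TaskWorkerPool instances. On construction it removes any leftover
// dpp download directories under every root path, recreates the agent tmp
// directory, and then starts the worker pools and topic listeners.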
AgentServer::AgentServer(ExecEnv* exec_env,
const TMasterInfo& master_info) :
_exec_env(exec_env),
_master_info(master_info),
_topic_subscriber(new TopicSubscriber()) {
// clean dpp download dir
_command_executor = new CommandExecutor();
    vector<OLAPRootPathStat> root_paths_stat;
    _command_executor->get_all_root_path_stat(&root_paths_stat);
    for (const auto& root_path_stat : root_paths_stat) {
try {
string dpp_download_path_str = root_path_stat.root_path + DPP_PREFIX;
boost::filesystem::path dpp_download_path(dpp_download_path_str);
if (boost::filesystem::exists(dpp_download_path)) {
boost::filesystem::remove_all(dpp_download_path);
}
} catch (...) {
OLAP_LOG_WARNING("boost exception when remove dpp download path. [path='%s']",
root_path_stat.root_path.c_str());
}
}
// create tmp dir
boost::filesystem::path tmp_path(config::agent_tmp_dir);
if (boost::filesystem::exists(tmp_path)) {
boost::filesystem::remove_all(tmp_path);
}
boost::filesystem::create_directories(config::agent_tmp_dir);
// init task worker pool
_create_table_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::CREATE_TABLE,
master_info);
_drop_table_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::DROP_TABLE,
master_info);
_push_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::PUSH,
master_info);
_delete_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::DELETE,
master_info);
_alter_table_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::ALTER_TABLE,
master_info);
_clone_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::CLONE,
master_info);
_storage_medium_migrate_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::STORAGE_MEDIUM_MIGRATE,
master_info);
_cancel_delete_data_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::CANCEL_DELETE_DATA,
master_info);
_check_consistency_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::CHECK_CONSISTENCY,
master_info);
_report_task_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::REPORT_TASK,
master_info);
_report_disk_state_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::REPORT_DISK_STATE,
master_info);
_report_olap_table_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::REPORT_OLAP_TABLE,
master_info);
_upload_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::UPLOAD,
master_info);
_restore_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::RESTORE,
master_info);
_make_snapshot_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::MAKE_SNAPSHOT,
master_info);
_release_snapshot_workers = new TaskWorkerPool(
TaskWorkerPool::TaskWorkerType::RELEASE_SNAPSHOT,
master_info);
#ifndef BE_TEST
_create_table_workers->start();
_drop_table_workers->start();
_push_workers->start();
_delete_workers->start();
_alter_table_workers->start();
_clone_workers->start();
_storage_medium_migrate_workers->start();
_cancel_delete_data_workers->start();
_check_consistency_workers->start();
_report_task_workers->start();
_report_disk_state_workers->start();
_report_olap_table_workers->start();
_upload_workers->start();
_restore_workers->start();
_make_snapshot_workers->start();
_release_snapshot_workers->start();
// Add subscriber here and register listeners
TopicListener* user_resource_listener = new UserResourceListener(exec_env, master_info);
LOG(INFO) << "Register user resource listener";
_topic_subscriber->register_listener(palo::TTopicType::type::RESOURCE, user_resource_listener);
#endif
}
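// Tear down everything allocated in the constructor: the command executor,
// all task worker pools, and the topic subscriber.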
AgentServer::~AgentServer() {
if (_command_executor != NULL) {
delete _command_executor;
}
if (_create_table_workers != NULL) {
delete _create_table_workers;
}
if (_drop_table_workers != NULL) {
delete _drop_table_workers;
}
if (_push_workers != NULL) {
delete _push_workers;
}
if (_delete_workers != NULL) {
delete _delete_workers;
}
if (_alter_table_workers != NULL) {
delete _alter_table_workers;
}
if (_clone_workers != NULL) {
delete _clone_workers;
}
if (_storage_medium_migrate_workers != NULL) {
delete _storage_medium_migrate_workers;
}
if (_cancel_delete_data_workers != NULL) {
delete _cancel_delete_data_workers;
}
if (_check_consistency_workers != NULL) {
delete _check_consistency_workers;
}
if (_report_task_workers != NULL) {
delete _report_task_workers;
}
if (_report_disk_state_workers != NULL) {
delete _report_disk_state_workers;
}
if (_report_olap_table_workers != NULL) {
delete _report_olap_table_workers;
}
if (_upload_workers != NULL) {
delete _upload_workers;
}
if (_restore_workers != NULL) {
delete _restore_workers;
}
if (_make_snapshot_workers != NULL) {
delete _make_snapshot_workers;
}
if (_release_snapshot_workers != NULL) {
delete _release_snapshot_workers;
}
    if (_topic_subscriber != NULL) {
delete _topic_subscriber;
}
}
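// Dispatch a batch of tasks from the master to the matching worker pools.
// The whole batch is rejected with CANCELLED if no master heartbeat has been
// received yet; an individual task whose required request field is missing,
// or whose type is unknown, is reported back as ANALYSIS_ERROR.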
void AgentServer::submit_tasks(
TAgentResult& return_value,
const vector<TAgentTaskRequest>& tasks) {
// Set result to dm
vector<string> error_msgs;
TStatusCode::type status_code = TStatusCode::OK;
    // TODO: check that the requesting master is the same as the heartbeat master
    if (_master_info.network_address.hostname == ""
            || _master_info.network_address.port == 0) {
        error_msgs.push_back("Have not received master heartbeat yet.");
return_value.status.__set_error_msgs(error_msgs);
return_value.status.__set_status_code(TStatusCode::CANCELLED);
return;
}
    for (const auto& task : tasks) {
TTaskType::type task_type = task.task_type;
int64_t signature = task.signature;
switch (task_type) {
case TTaskType::CREATE:
if (task.__isset.create_tablet_req) {
_create_table_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::DROP:
if (task.__isset.drop_tablet_req) {
_drop_table_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::PUSH:
if (task.__isset.push_req) {
if (task.push_req.push_type == TPushType::LOAD
|| task.push_req.push_type == TPushType::LOAD_DELETE) {
_push_workers->submit_task(task);
} else if (task.push_req.push_type == TPushType::DELETE) {
_delete_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::ROLLUP:
case TTaskType::SCHEMA_CHANGE:
if (task.__isset.alter_tablet_req) {
_alter_table_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::CLONE:
if (task.__isset.clone_req) {
_clone_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::STORAGE_MEDIUM_MIGRATE:
if (task.__isset.storage_medium_migrate_req) {
_storage_medium_migrate_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::CANCEL_DELETE:
if (task.__isset.cancel_delete_data_req) {
_cancel_delete_data_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::CHECK_CONSISTENCY:
if (task.__isset.check_consistency_req) {
_check_consistency_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::UPLOAD:
if (task.__isset.upload_req) {
_upload_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::RESTORE:
if (task.__isset.restore_req) {
_restore_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::MAKE_SNAPSHOT:
if (task.__isset.snapshot_req) {
_make_snapshot_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
case TTaskType::RELEASE_SNAPSHOT:
if (task.__isset.release_snapshot_req) {
_release_snapshot_workers->submit_task(task);
} else {
status_code = TStatusCode::ANALYSIS_ERROR;
}
break;
default:
status_code = TStatusCode::ANALYSIS_ERROR;
break;
}
if (status_code == TStatusCode::ANALYSIS_ERROR) {
OLAP_LOG_WARNING("task anaysis_error, signature: %ld", signature);
error_msgs.push_back("the task signature is:" + to_string(signature) + " has wrong request.");
}
}
return_value.status.__set_error_msgs(error_msgs);
return_value.status.__set_status_code(status_code);
}
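// Create a snapshot of the requested tablet through the command executor and,
// on success, return the snapshot path to the caller.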
void AgentServer::make_snapshot(TAgentResult& return_value,
const TSnapshotRequest& snapshot_request) {
TStatus status;
vector<string> error_msgs;
TStatusCode::type status_code = TStatusCode::OK;
string snapshot_path;
OLAPStatus make_snapshot_status =
_command_executor->make_snapshot(snapshot_request, &snapshot_path);
if (make_snapshot_status != OLAP_SUCCESS) {
status_code = TStatusCode::RUNTIME_ERROR;
OLAP_LOG_WARNING("make_snapshot failed. tablet_id: %ld, schema_hash: %ld, status: %d",
snapshot_request.tablet_id, snapshot_request.schema_hash,
make_snapshot_status);
error_msgs.push_back("make_snapshot failed. status: " +
boost::lexical_cast<string>(make_snapshot_status));
} else {
OLAP_LOG_INFO("make_snapshot success. tablet_id: %ld, schema_hash: %ld, snapshot_path: %s",
snapshot_request.tablet_id, snapshot_request.schema_hash,
snapshot_path.c_str());
return_value.__set_snapshot_path(snapshot_path);
}
status.__set_error_msgs(error_msgs);
status.__set_status_code(status_code);
return_value.__set_status(status);
}
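// Release a previously created snapshot identified by its path.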
void AgentServer::release_snapshot(TAgentResult& return_value, const std::string& snapshot_path) {
vector<string> error_msgs;
TStatusCode::type status_code = TStatusCode::OK;
OLAPStatus release_snapshot_status =
_command_executor->release_snapshot(snapshot_path);
if (release_snapshot_status != OLAP_SUCCESS) {
status_code = TStatusCode::RUNTIME_ERROR;
OLAP_LOG_WARNING("release_snapshot failed. snapshot_path: %s, status: %d",
snapshot_path.c_str(), release_snapshot_status);
error_msgs.push_back("release_snapshot failed. status: " +
boost::lexical_cast<string>(release_snapshot_status));
} else {
OLAP_LOG_INFO("release_snapshot success. snapshot_path: %s, status: %d",
snapshot_path.c_str(), release_snapshot_status);
}
return_value.status.__set_error_msgs(error_msgs);
return_value.status.__set_status_code(status_code);
}
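// Forward cluster state updates published by the master to the registered
// topic listeners (e.g. the user resource listener).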
void AgentServer::publish_cluster_state(TAgentResult& _return, const TAgentPublishRequest& request) {
    _topic_subscriber->handle_updates(request);
    OLAP_LOG_INFO("AgentService received %d publish updates",
                  static_cast<int>(request.updates.size()));
_return.status.__set_status_code(TStatusCode::OK);
}
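// Start a mini load ETL job through the ETL job manager and convert the
// resulting status into the thrift result.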
void AgentServer::submit_etl_task(TAgentResult& return_value,
const TMiniLoadEtlTaskRequest& request) {
Status status = _exec_env->etl_job_mgr()->start_job(request);
if (status.ok()) {
VLOG_RPC << "start etl task successfull id="
<< request.params.params.fragment_instance_id;
} else {
VLOG_RPC << "start etl task failed id="
<< request.params.params.fragment_instance_id
<< " and err_msg=" << status.get_error_msg();
}
status.to_thrift(&return_value.status);
}
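// Query the ETL job manager for the state of a mini load job and fill the
// result, including the map of produced files.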
void AgentServer::get_etl_status(TMiniLoadEtlStatusResult& return_value,
const TMiniLoadEtlStatusRequest& request) {
Status status = _exec_env->etl_job_mgr()->get_job_state(request.mini_load_id, &return_value);
if (!status.ok()) {
LOG(WARNING) << "get job state failed. [id=" << request.mini_load_id << "]";
} else {
VLOG_RPC << "get job state successful. [id=" << request.mini_load_id << ",status="
<< return_value.status.status_code << ",etl_state=" << return_value.etl_state
<< ",files=";
for (auto& item : return_value.file_map) {
VLOG_RPC << item.first << ":" << item.second << ";";
}
VLOG_RPC << "]";
}
}
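// Erase a finished mini load ETL job and its files through the ETL job manager.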
void AgentServer::delete_etl_files(TAgentResult& result,
const TDeleteEtlFilesRequest& request) {
Status status = _exec_env->etl_job_mgr()->erase_job(request);
if (!status.ok()) {
LOG(WARNING) << "delete etl files failed. because " << status.get_error_msg()
<< " with request " << request;
} else {
VLOG_RPC << "delete etl files successful with param " << request;
}
status.to_thrift(&result.status);
}
}  // namespace palo