upgrade clang-format version to 16 move thrift to fe-common fix core dump on pipeline engine when operator canceled and not prepared
279 lines
7.4 KiB
Bash
Executable File
279 lines
7.4 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
##############################################################
|
|
# This script is used to load generated ssb data set to Doris
|
|
# Only for 1 fact table: lineorder
|
|
##############################################################
|
|
|
|
set -eo pipefail
|
|
|
|
ROOT=$(dirname "$0")
|
|
ROOT=$(
|
|
cd "${ROOT}"
|
|
pwd
|
|
)
|
|
|
|
CURDIR="${ROOT}"
|
|
SSB_DATA_DIR="${CURDIR}/ssb-data/"
|
|
|
|
usage() {
|
|
echo "
|
|
Usage: $0 <options>
|
|
Optional options:
|
|
-c parallelism to load data of lineorder table, default is 5.
|
|
|
|
Eg.
|
|
$0 load data using default value.
|
|
$0 -c 10 load lineorder table data using parallelism 10.
|
|
"
|
|
exit 1
|
|
}
|
|
|
|
OPTS=$(getopt \
|
|
-n "$0" \
|
|
-o '' \
|
|
-o 'hc:' \
|
|
-- "$@")
|
|
|
|
eval set -- "${OPTS}"
|
|
|
|
PARALLEL=5
|
|
HELP=0
|
|
|
|
if [[ $# == 0 ]]; then
|
|
usage
|
|
fi
|
|
|
|
while true; do
|
|
case "$1" in
|
|
-h)
|
|
HELP=1
|
|
shift
|
|
;;
|
|
-c)
|
|
PARALLEL=$2
|
|
shift 2
|
|
;;
|
|
--)
|
|
shift
|
|
break
|
|
;;
|
|
*)
|
|
echo "Internal error"
|
|
exit 1
|
|
;;
|
|
esac
|
|
done
|
|
|
|
if [[ "${HELP}" -eq 1 ]]; then
|
|
usage
|
|
fi
|
|
|
|
echo "Parallelism: ${PARALLEL}"
|
|
|
|
# check if ssb-data exists
|
|
if [[ ! -d ${SSB_DATA_DIR}/ ]]; then
|
|
echo "${SSB_DATA_DIR} does not exist. Run sh gen-ssb-data.sh first."
|
|
exit 1
|
|
fi
|
|
|
|
check_prerequest() {
|
|
local CMD=$1
|
|
local NAME=$2
|
|
if ! ${CMD}; then
|
|
echo "${NAME} is missing. This script depends on cURL to load data to Doris."
|
|
exit 1
|
|
fi
|
|
}
|
|
|
|
run_sql() {
|
|
sql="$*"
|
|
echo "${sql}"
|
|
mysql -h"${FE_HOST}" -u"${USER}" -P"${FE_QUERY_PORT}" -D"${DB}" -e "$@"
|
|
}
|
|
|
|
load_lineitem_flat() {
|
|
# Loading data in batches by year.
|
|
for con in 'lo_orderdate<19930101' 'lo_orderdate>=19930101 and lo_orderdate<19940101' 'lo_orderdate>=19940101 and lo_orderdate<19950101' 'lo_orderdate>=19950101 and lo_orderdate<19960101' 'lo_orderdate>=19960101 and lo_orderdate<19970101' 'lo_orderdate>=19970101 and lo_orderdate<19980101' 'lo_orderdate>=19980101'; do
|
|
echo -e "\n${con}"
|
|
run_sql "
|
|
INSERT INTO lineorder_flat
|
|
SELECT
|
|
LO_ORDERDATE,
|
|
LO_ORDERKEY,
|
|
LO_LINENUMBER,
|
|
LO_CUSTKEY,
|
|
LO_PARTKEY,
|
|
LO_SUPPKEY,
|
|
LO_ORDERPRIORITY,
|
|
LO_SHIPPRIORITY,
|
|
LO_QUANTITY,
|
|
LO_EXTENDEDPRICE,
|
|
LO_ORDTOTALPRICE,
|
|
LO_DISCOUNT,
|
|
LO_REVENUE,
|
|
LO_SUPPLYCOST,
|
|
LO_TAX,
|
|
LO_COMMITDATE,
|
|
LO_SHIPMODE,
|
|
C_NAME,
|
|
C_ADDRESS,
|
|
C_CITY,
|
|
C_NATION,
|
|
C_REGION,
|
|
C_PHONE,
|
|
C_MKTSEGMENT,
|
|
S_NAME,
|
|
S_ADDRESS,
|
|
S_CITY,
|
|
S_NATION,
|
|
S_REGION,
|
|
S_PHONE,
|
|
P_NAME,
|
|
P_MFGR,
|
|
P_CATEGORY,
|
|
P_BRAND,
|
|
P_COLOR,
|
|
P_TYPE,
|
|
P_SIZE,
|
|
P_CONTAINER
|
|
FROM (
|
|
SELECT
|
|
lo_orderkey,
|
|
lo_linenumber,
|
|
lo_custkey,
|
|
lo_partkey,
|
|
lo_suppkey,
|
|
lo_orderdate,
|
|
lo_orderpriority,
|
|
lo_shippriority,
|
|
lo_quantity,
|
|
lo_extendedprice,
|
|
lo_ordtotalprice,
|
|
lo_discount,
|
|
lo_revenue,
|
|
lo_supplycost,
|
|
lo_tax,
|
|
lo_commitdate,
|
|
lo_shipmode
|
|
FROM lineorder
|
|
WHERE ${con}
|
|
) l
|
|
INNER JOIN customer c
|
|
ON (c.c_custkey = l.lo_custkey)
|
|
INNER JOIN supplier s
|
|
ON (s.s_suppkey = l.lo_suppkey)
|
|
INNER JOIN part p
|
|
ON (p.p_partkey = l.lo_partkey);
|
|
"
|
|
done
|
|
}
|
|
|
|
check_prerequest "curl --version" "curl"
|
|
|
|
source "${CURDIR}/../conf/doris-cluster.conf"
|
|
export MYSQL_PWD=${PASSWORD}
|
|
|
|
echo "FE_HOST: ${FE_HOST}"
|
|
echo "FE_HTTP_PORT: ${FE_HTTP_PORT}"
|
|
echo "USER: ${USER}"
|
|
echo "DB: ${DB}"
|
|
|
|
start_time=$(date +%s)
|
|
echo "Start time: $(date)"
|
|
echo "==========Start to load data into ssb tables=========="
|
|
|
|
echo 'Loading data for table: part'
|
|
curl --location-trusted -u "${USER}":"${PASSWORD}" \
|
|
-H "column_separator:|" \
|
|
-H "columns:p_partkey,p_name,p_mfgr,p_category,p_brand,p_color,p_type,p_size,p_container,p_dummy" \
|
|
-T "${SSB_DATA_DIR}"/part.tbl http://"${FE_HOST}":"${FE_HTTP_PORT}"/api/"${DB}"/part/_stream_load
|
|
|
|
echo 'Loading data for table: date'
|
|
curl --location-trusted -u "${USER}":"${PASSWORD}" \
|
|
-H "column_separator:|" \
|
|
-H "columns:d_datekey,d_date,d_dayofweek,d_month,d_year,d_yearmonthnum,d_yearmonth,d_daynuminweek,d_daynuminmonth,d_daynuminyear,d_monthnuminyear,d_weeknuminyear,d_sellingseason,d_lastdayinweekfl,d_lastdayinmonthfl,d_holidayfl,d_weekdayfl,d_dummy" \
|
|
-T "${SSB_DATA_DIR}"/date.tbl http://"${FE_HOST}":"${FE_HTTP_PORT}"/api/"${DB}"/dates/_stream_load
|
|
|
|
echo 'Loading data for table: supplier'
|
|
curl --location-trusted -u "${USER}":"${PASSWORD}" \
|
|
-H "column_separator:|" \
|
|
-H "columns:s_suppkey,s_name,s_address,s_city,s_nation,s_region,s_phone,s_dummy" \
|
|
-T "${SSB_DATA_DIR}"/supplier.tbl http://"${FE_HOST}":"${FE_HTTP_PORT}"/api/"${DB}"/supplier/_stream_load
|
|
|
|
echo 'Loading data for table: customer'
|
|
curl --location-trusted -u "${USER}":"${PASSWORD}" \
|
|
-H "column_separator:|" \
|
|
-H "columns:c_custkey,c_name,c_address,c_city,c_nation,c_region,c_phone,c_mktsegment,no_use" \
|
|
-T "${SSB_DATA_DIR}"/customer.tbl http://"${FE_HOST}":"${FE_HTTP_PORT}"/api/"${DB}"/customer/_stream_load
|
|
|
|
echo "Loading data for table: lineorder, with ${PARALLEL} parallel"
|
|
function load() {
|
|
echo "$@"
|
|
curl --location-trusted -u "${USER}":"${PASSWORD}" \
|
|
-H "column_separator:|" \
|
|
-H "columns:lo_orderkey,lo_linenumber,lo_custkey,lo_partkey,lo_suppkey,lo_orderdate,lo_orderpriority,lo_shippriority,lo_quantity,lo_extendedprice,lo_ordtotalprice,lo_discount,lo_revenue,lo_supplycost,lo_tax,lo_commitdate,lo_shipmode,lo_dummy" \
|
|
-T "$@" http://"${FE_HOST}":"${FE_HTTP_PORT}"/api/"${DB}"/lineorder/_stream_load
|
|
}
|
|
|
|
# set parallelism
|
|
[[ -e /tmp/fd1 ]] || mkfifo /tmp/fd1
|
|
exec 3<>/tmp/fd1
|
|
rm -rf /tmp/fd1
|
|
|
|
for ((i = 1; i <= PARALLEL; i++)); do
|
|
echo >&3
|
|
done
|
|
|
|
date
|
|
for file in "${SSB_DATA_DIR}"/lineorder.tbl.*; do
|
|
read -r -u3
|
|
{
|
|
load "${file}"
|
|
echo >&3
|
|
} &
|
|
done
|
|
|
|
# wait for child thread finished
|
|
wait
|
|
date
|
|
|
|
echo "==========Start to insert data into ssb flat table=========="
|
|
echo "change some session variables before load, and then restore after load."
|
|
origin_parallel=$(
|
|
set -e
|
|
run_sql 'select @@parallel_fragment_exec_instance_num;' | sed -n '3p'
|
|
)
|
|
# set parallel_fragment_exec_instance_num=1, loading maybe slow but stable.
|
|
run_sql "set global insert_timeout=7200;"
|
|
run_sql "set global parallel_fragment_exec_instance_num=1;"
|
|
echo '============================================'
|
|
date
|
|
load_lineitem_flat
|
|
date
|
|
echo '============================================'
|
|
echo "restore session variables"
|
|
run_sql "set global parallel_fragment_exec_instance_num=${origin_parallel};"
|
|
echo '============================================'
|
|
|
|
end_time=$(date +%s)
|
|
echo "End time: $(date)"
|
|
|
|
echo "Finish load ssb data, Time taken: $((end_time - start_time)) seconds"
|