[tools](ssb and tpch)optimize tools (#11975)

This commit is contained in:
Dongyang Li
2022-08-24 14:29:38 +08:00
committed by GitHub
parent 81b6c8d9f9
commit 9cceddaeb3
20 changed files with 266 additions and 487 deletions

View File

@ -20,27 +20,21 @@ under the License.
# Usage
These scripts are used to run the SSB and SSB flat benchmark tests.
The ssb flat data comes from ssb tables by way of 'INSERT INTO ... SELECT ...',
which means ssb test steps 1 to 4 should have been done before loading ssb flat data.
The ssb flat data comes from ssb tables by way of 'INSERT INTO ... SELECT ...'.
## ssb test, follow the steps below:
### 1. build ssb dbgen tool.
./build-ssb-dbgen.sh
./bin/build-ssb-dbgen.sh
### 2. generate ssb data. use -h for more information.
./gen-ssb-data.sh -s 1
### 3. create ssb tables. modify `doris-cluster.conf` to specify doris info, then run script below.
./create-ssb-tables.sh
./bin/gen-ssb-data.sh -s 1
### 3. create ssb tables. modify `conf/doris-cluster.conf` to specify Doris cluster info, then run script below.
./bin/create-ssb-tables.sh
### 4. load ssb data. use -h for help.
./load-ssb-dimension-data.sh
./load-ssb-fact-data.sh
./bin/load-ssb-data.sh
### 5. run ssb queries.
./run-ssb-queries.sh
./bin/run-ssb-queries.sh
## ssb flat test, follow the steps below:
### 1. prepare ssb data, which means ssb test steps 1 to 4 have been done.
### 2. create ssb flat table in the same database of ssb tables.
./create-ssb-flat-table.sh
### 3. load ssb flat data.
./load-ssb-flat-data.sh
### 4. run ssb flat queries.
./run-ssb-flat-queries.sh
### 2. run ssb flat queries.
./bin/run-ssb-flat-queries.sh

View File

@ -19,14 +19,17 @@
##############################################################
# This script is used to build ssb-dbgen
# ssb-dbgen's source code is from https://github.com/electrum/ssb-dbgen.git
# Usage:
# Usage:
# sh build-ssb-dbgen.sh
##############################################################
set -eo pipefail
ROOT=`dirname "$0"`
ROOT=`cd "$ROOT"; pwd`
ROOT=$(dirname "$0")
ROOT=$(
cd "$ROOT"
pwd
)
CURDIR=${ROOT}
SSB_DBGEN_DIR=$CURDIR/ssb-dbgen/
@ -35,17 +38,23 @@ SSB_DBGEN_DIR=$CURDIR/ssb-dbgen/
if [[ -d $SSB_DBGEN_DIR ]]; then
echo "Dir $CURDIR/ssb-dbgen/ already exists. No need to download."
echo "If you want to download ssb-dbgen again, please delete this dir first."
exit 1
else
curl https://palo-cloud-repo-bd.bd.bcebos.com/baidu-doris-release/ssb-dbgen-linux.tar.gz | tar xz -C $CURDIR/
cd "$CURDIR"
wget https://palo-cloud-repo-bd.bd.bcebos.com/baidu-doris-release/ssb-dbgen-linux.tar.gz && tar -xzvf ssb-dbgen-linux.tar.gz -C $CURDIR/
fi
# compile ssb-dbgen
cd $SSB_DBGEN_DIR/ && make
cd "$SSB_DBGEN_DIR/" && make
cd -
# check
if [[ -f $CURDIR/ssb-dbgen/dbgen ]]; then
echo "Build succeed! Run $CURDIR/ssb-dbgen/dbgen -h"
echo -e "
################
Build succeed!
################
Run $CURDIR/ssb-dbgen/dbgen -h"
exit 0
else
echo "Build failed!"

View File

@ -29,19 +29,20 @@ ROOT=$(
)
CURDIR=${ROOT}
DDL="${CURDIR}/ddl/create-ssb-tables.sql"
SSB_DDL="${CURDIR}/../ddl/create-ssb-tables.sql"
SSB_FLAT_DDL="${CURDIR}/../ddl/create-ssb-flat-table.sql"
usage() {
echo "
This script is used to create SSB tables,
will use mysql client to connect Doris server which is specified in doris-cluster.conf file.
will use mysql client to connect Doris server which is specified in conf/doris-cluster.conf file.
Usage: $0
"
exit 1
}
OPTS=$(getopt \
-n $0 \
-n "$0" \
-o '' \
-o 'h' \
-- "$@")
@ -86,7 +87,8 @@ check_prerequest() {
check_prerequest "mysql --version" "mysql"
source $CURDIR/doris-cluster.conf
# shellcheck source=/dev/null
source "$CURDIR/../conf/doris-cluster.conf"
export MYSQL_PWD=$PASSWORD
echo "FE_HOST: $FE_HOST"
@ -95,7 +97,10 @@ echo "USER: $USER"
echo "PASSWORD: $PASSWORD"
echo "DB: $DB"
mysql -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT -e "CREATE DATABASE IF NOT EXISTS $DB"
mysql -h"$FE_HOST" -u"$USER" -P"$FE_QUERY_PORT" -e "CREATE DATABASE IF NOT EXISTS $DB"
echo "Run DDL from ${DDL}"
mysql -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT -D$DB <${DDL}
echo "Run DDL from $SSB_DDL"
mysql -h"$FE_HOST" -u"$USER" -P"$FE_QUERY_PORT" -D"$DB" <"$SSB_DDL"
echo "Run DDL from $SSB_FLAT_DDL"
mysql -h"$FE_HOST" -u"$USER" -P"$FE_QUERY_PORT" -D"$DB" <"$SSB_FLAT_DDL"

View File

@ -48,7 +48,7 @@ Usage: $0 <options>
}
OPTS=$(getopt \
-n $0 \
-n "$0" \
-o '' \
-o 'hs:c:' \
-- "$@")
@ -107,24 +107,24 @@ if [[ -d $SSB_DATA_DIR/ ]]; then
exit 1
fi
mkdir $SSB_DATA_DIR/
mkdir "$SSB_DATA_DIR/"
# gen data
cd $SSB_DBGEN_DIR
cd "$SSB_DBGEN_DIR"
echo "Begin to generate data for table: customer"
$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T c
"$SSB_DBGEN_DIR/dbgen" -f -s "$SCALE_FACTOR" -T c
echo "Begin to generate data for table: part"
$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T p
"$SSB_DBGEN_DIR/dbgen" -f -s "$SCALE_FACTOR" -T p
echo "Begin to generate data for table: supplier"
$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T s
"$SSB_DBGEN_DIR/dbgen" -f -s "$SCALE_FACTOR" -T s
echo "Begin to generate data for table: date"
$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T d
"$SSB_DBGEN_DIR/dbgen" -f -s "$SCALE_FACTOR" -T d
echo "Begin to generate data for table: lineorder"
$SSB_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T l -C $PARALLEL
"$SSB_DBGEN_DIR/dbgen" -f -s "$SCALE_FACTOR" -T l -C "$PARALLEL"
cd -
# move data to $SSB_DATA_DIR
mv $SSB_DBGEN_DIR/*.tbl* $SSB_DATA_DIR/
mv "$SSB_DBGEN_DIR"/*.tbl* "$SSB_DATA_DIR/"
# check data
du -sh $SSB_DATA_DIR/*.tbl*
du -sh "$SSB_DATA_DIR"/*.tbl*

View File

@ -30,30 +30,46 @@ ROOT=$(
)
CURDIR=${ROOT}
SSB_DATA_DIR="$CURDIR/ssb-data/"
usage() {
echo "
The ssb flat data actually comes from ssb tables, and will load by 'INSERT INTO ... SELECT ...'
Usage: $0 <options>
Optional options:
-c parallelism to load data of lineorder table, default is 5.
Eg.
$0 load data using default value.
$0 -c 10 load lineorder table data using parallelism 10.
"
exit 1
}
OPTS=$(getopt \
-n $0 \
-n "$0" \
-o '' \
-o 'h' \
-o 'hc:' \
-- "$@")
eval set -- "$OPTS"
PARALLEL=5
HELP=0
if [ $# == 0 ]; then
usage
fi
while true; do
case "$1" in
-h)
HELP=1
shift
;;
-c)
PARALLEL=$2
shift 2
;;
--)
shift
break
@ -70,6 +86,14 @@ if [[ ${HELP} -eq 1 ]]; then
exit
fi
echo "Parallelism: $PARALLEL"
# check if ssb-data exists
if [[ ! -d $SSB_DATA_DIR/ ]]; then
echo "$SSB_DATA_DIR does not exist. Run sh gen-ssb-data.sh first."
exit 1
fi
check_prerequest() {
local CMD=$1
local NAME=$2
@ -80,9 +104,9 @@ check_prerequest() {
}
run_sql() {
sql="$@"
echo $sql
mysql -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT -D$DB -e "$@"
sql="$*"
echo "$sql"
mysql -h"$FE_HOST" -u"$USER" -P"$FE_QUERY_PORT" -D"$DB" -e "$@"
}
load_lineitem_flat() {
@ -165,7 +189,9 @@ ON (p.p_partkey = l.lo_partkey);
check_prerequest "curl --version" "curl"
# load lineorder
source $CURDIR/doris-cluster.conf
# shellcheck source=/dev/null
source "$CURDIR/../conf/doris-cluster.conf"
export MYSQL_PWD=$PASSWORD
echo "FE_HOST: $FE_HOST"
echo "FE_HTTP_PORT: $FE_HTTP_PORT"
@ -173,25 +199,78 @@ echo "USER: $USER"
echo "PASSWORD: $PASSWORD"
echo "DB: $DB"
echo 'Loading data for table: lineorder_flat'
date
echo "==========Start to load data into ssb tables=========="
echo 'Loading data for table: part'
curl --location-trusted -u "$USER":"$PASSWORD" \
-H "column_separator:|" \
-H "columns:p_partkey,p_name,p_mfgr,p_category,p_brand,p_color,p_type,p_size,p_container,p_dummy" \
-T "$SSB_DATA_DIR"/part.tbl http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/part/_stream_load
echo '============================================'
echo 'Loading data for table: date'
curl --location-trusted -u "$USER":"$PASSWORD" \
-H "column_separator:|" \
-H "columns:d_datekey,d_date,d_dayofweek,d_month,d_year,d_yearmonthnum,d_yearmonth,d_daynuminweek,d_daynuminmonth,d_daynuminyear,d_monthnuminyear,d_weeknuminyear,d_sellingseason,d_lastdayinweekfl,d_lastdayinmonthfl,d_holidayfl,d_weekdayfl,d_dummy" \
-T "$SSB_DATA_DIR"/date.tbl http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/dates/_stream_load
echo 'Loading data for table: supplier'
curl --location-trusted -u "$USER":"$PASSWORD" \
-H "column_separator:|" \
-H "columns:s_suppkey,s_name,s_address,s_city,s_nation,s_region,s_phone,s_dummy" \
-T "$SSB_DATA_DIR"/supplier.tbl http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/supplier/_stream_load
echo 'Loading data for table: customer'
curl --location-trusted -u "$USER":"$PASSWORD" \
-H "column_separator:|" \
-H "columns:c_custkey,c_name,c_address,c_city,c_nation,c_region,c_phone,c_mktsegment,no_use" \
-T "$SSB_DATA_DIR"/customer.tbl http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/customer/_stream_load
echo "Loading data for table: lineorder, with $PARALLEL parallel"
function load() {
echo "$@"
curl --location-trusted -u "$USER":"$PASSWORD" \
-H "column_separator:|" \
-H "columns:lo_orderkey,lo_linenumber,lo_custkey,lo_partkey,lo_suppkey,lo_orderdate,lo_orderpriority,lo_shippriority,lo_quantity,lo_extendedprice,lo_ordtotalprice,lo_discount,lo_revenue,lo_supplycost,lo_tax,lo_commitdate,lo_shipmode,lo_dummy" \
-T "$@" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/lineorder/_stream_load
}
# set parallelism
[ -e /tmp/fd1 ] || mkfifo /tmp/fd1
exec 3<>/tmp/fd1
rm -rf /tmp/fd1
for ((i = 1; i <= PARALLEL; i++)); do
echo >&3
done
date
for file in "$SSB_DATA_DIR"/lineorder.tbl.*; do
read -r -u3
{
load "$file"
echo >&3
} &
done
# wait for child thread finished
wait
date
echo "==========Start to insert data into ssb flat table=========="
echo "change some session variables before load, and then restore after load."
origin_query_timeout=$(run_sql 'select @@query_timeout;' | sed -n '3p')
origin_parallel=$(run_sql 'select @@parallel_fragment_exec_instance_num;' | sed -n '3p')
# set parallel_fragment_exec_instance_num=1, loading maybe slow but stable.
run_sql "set global query_timeout=7200;"
run_sql "set global parallel_fragment_exec_instance_num=1;"
echo '============================================'
echo $(date)
date
load_lineitem_flat
date
echo '============================================'
echo "restore session variables"
run_sql "set global query_timeout=${origin_query_timeout};"
run_sql "set global parallel_fragment_exec_instance_num=${origin_parallel};"
echo '============================================'
echo $(date)
echo "DONE."

View File

@ -28,8 +28,8 @@ ROOT=$(
pwd
)
CURDIR=${ROOT}
QUERIES_DIR=$CURDIR/ssb-flat-queries
CURDIR="${ROOT}"
QUERIES_DIR="$CURDIR/../ssb-flat-queries"
usage() {
echo "
@ -41,7 +41,7 @@ Usage: $0
}
OPTS=$(getopt \
-n $0 \
-n "$0" \
-o '' \
-o 'h' \
-- "$@")
@ -86,7 +86,8 @@ check_prerequest() {
check_prerequest "mysqlslap --version" "mysqlslap"
source $CURDIR/doris-cluster.conf
# shellcheck source=/dev/null
source "$CURDIR/../conf/doris-cluster.conf"
export MYSQL_PWD=$PASSWORD
echo "FE_HOST: $FE_HOST"
@ -96,8 +97,8 @@ echo "PASSWORD: $PASSWORD"
echo "DB: $DB"
pre_set() {
echo $@
mysql -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT -D$DB -e "$@"
echo "$@"
mysql -h"$FE_HOST" -P"$FE_QUERY_PORT" -u"$USER" -D"$DB" -e "$@"
}
pre_set "set global enable_vectorized_engine=1;"
@ -105,14 +106,15 @@ pre_set "set global parallel_fragment_exec_instance_num=8;"
pre_set "set global exec_mem_limit=8G;"
pre_set "set global batch_size=4096;"
echo '============================================'
pre_set "show variables"
pre_set "show variables;"
echo '============================================'
pre_set "show table status;"
echo '============================================'
for i in '1.1' '1.2' '1.3' '2.1' '2.2' '2.3' '3.1' '3.2' '3.3' '3.4' '4.1' '4.2' '4.3'; do
# First run to avoid the effect of a cold start
mysql -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT -D $DB <$QUERIES_DIR/q${i}.sql >/dev/null 2>&1
mysql -h"$FE_HOST" -P"$FE_QUERY_PORT" -u"$USER" -D "$DB" <"$QUERIES_DIR"/q${i}.sql >/dev/null 2>&1
# Then run 3 times and takes the average time
res=$(mysqlslap -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT --create-schema=$DB --query=$QUERIES_DIR/q${i}.sql -F '\r' -i 3 | sed -n '2p' | cut -d ' ' -f 9,10)
res=$(mysqlslap -h"$FE_HOST" -P"$FE_QUERY_PORT" -u"$USER" --create-schema="$DB" --query="$QUERIES_DIR"/q${i}.sql -F '\r' -i 3 | sed -n '2p' | cut -d ' ' -f 9,10)
echo "q$i: $res"
sleep 1
done

View File

@ -29,7 +29,7 @@ ROOT=$(
)
CURDIR=${ROOT}
QUERIES_DIR=$CURDIR/ssb-queries
QUERIES_DIR=$CURDIR/../ssb-queries
usage() {
echo "
@ -41,7 +41,7 @@ Usage: $0
}
OPTS=$(getopt \
-n $0 \
-n "$0" \
-o '' \
-o 'h' \
-- "$@")
@ -86,7 +86,8 @@ check_prerequest() {
check_prerequest "mysqlslap --version" "mysql slap"
source $CURDIR/doris-cluster.conf
# shellcheck source=/dev/null
source "$CURDIR/../conf/doris-cluster.conf"
export MYSQL_PWD=$PASSWORD
echo "FE_HOST: $FE_HOST"
@ -96,8 +97,8 @@ echo "PASSWORD: $PASSWORD"
echo "DB: $DB"
pre_set() {
echo $@
mysql -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT -D$DB -e "$@"
echo "$@"
mysql -h"$FE_HOST" -P"$FE_QUERY_PORT" -u"$USER" -D"$DB" -e "$@"
}
pre_set "set global enable_vectorized_engine=1;"
@ -108,11 +109,13 @@ pre_set "set global enable_projection=true;"
pre_set "set global runtime_filter_mode=global;"
# pre_set "set global enable_cost_based_join_reorder=1"
echo '============================================'
pre_set "show variables"
pre_set "show variables;"
echo '============================================'
pre_set "show table status;"
echo '============================================'
for i in '1.1' '1.2' '1.3' '2.1' '2.2' '2.3' '3.1' '3.2' '3.3' '3.4' '4.1' '4.2' '4.3'; do
# Each query is executed 3 times and takes the average time
res=$(mysqlslap -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT --create-schema=$DB --query=$QUERIES_DIR/q${i}.sql -F '\r' -i 3 | sed -n '2p' | cut -d ' ' -f 9,10)
res=$(mysqlslap -h"$FE_HOST" -P"$FE_QUERY_PORT" -u"$USER" --create-schema="$DB" --query="$QUERIES_DIR"/q${i}.sql -F '\r' -i 3 | sed -n '2p' | cut -d ' ' -f 9,10)
echo "q$i: $res"
done

View File

@ -1,101 +0,0 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

##############################################################
# This script is used to create the ssb flat table in Doris,
# using the mysql client and the connection info found in
# doris-cluster.conf (FE_HOST, FE_QUERY_PORT, USER, PASSWORD, DB).
##############################################################

set -eo pipefail

# Resolve the absolute directory containing this script.
ROOT=$(dirname "$0")
ROOT=$(
    cd "$ROOT"
    pwd
)

CURDIR=${ROOT}
DDL="${CURDIR}/ddl/create-ssb-flat-table.sql"

usage() {
    echo "
This script is used to create ssb flat table,
will use mysql client to connect Doris server which is specified in doris-cluster.conf file.
Usage: $0
    "
    exit 1
}

OPTS=$(getopt \
    -n "$0" \
    -o '' \
    -o 'h' \
    -- "$@")

eval set -- "$OPTS"
HELP=0

if [[ $# -eq 0 ]]; then
    usage
fi

while true; do
    case "$1" in
    -h)
        HELP=1
        shift
        ;;
    --)
        shift
        break
        ;;
    *)
        echo "Internal error"
        exit 1
        ;;
    esac
done

if [[ ${HELP} -eq 1 ]]; then
    usage
    exit
fi

# Verify that a required command is available before doing any work.
check_prerequest() {
    local CMD=$1
    local NAME=$2
    if ! $CMD; then
        echo "$NAME is missing. This script depends on mysql to create tables in Doris."
        exit 1
    fi
}

check_prerequest "mysql --version" "mysql"

# Load FE_HOST, FE_QUERY_PORT, USER, PASSWORD and DB.
# shellcheck source=/dev/null
source "$CURDIR/doris-cluster.conf"

# Pass the password via MYSQL_PWD only, instead of also putting
# --password=... on the command line where it leaks into `ps` output.
export MYSQL_PWD=$PASSWORD

echo "FE_HOST: $FE_HOST"
echo "FE_QUERY_PORT: $FE_QUERY_PORT"
echo "USER: $USER"
echo "PASSWORD: $PASSWORD"
echo "DB: $DB"

mysql -h"$FE_HOST" -u"$USER" -P"$FE_QUERY_PORT" -e "CREATE DATABASE IF NOT EXISTS $DB"
echo "Run DDL from ${DDL}"
mysql -h"$FE_HOST" -u"$USER" -P"$FE_QUERY_PORT" -D"$DB" <"${DDL}"

View File

@ -15,7 +15,7 @@
-- specific language governing permissions and limitations
-- under the License.
CREATE TABLE `lineorder_flat` (
CREATE TABLE IF NOT EXISTS `lineorder_flat` (
`LO_ORDERDATE` int(11) NOT NULL COMMENT "",
`LO_ORDERKEY` int(11) NOT NULL COMMENT "",
`LO_LINENUMBER` tinyint(4) NOT NULL COMMENT "",

View File

@ -1,86 +0,0 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

##############################################################
# This script is used to load generated ssb data set to Doris
# Only for 4 dimension tables: customer, part, supplier and date.
# Each table is loaded via the Doris HTTP stream-load API with cURL.
# Usage:
#    sh load-dimension-data.sh
##############################################################

set -eo pipefail

# Resolve the absolute directory containing this script.
ROOT=$(dirname "$0")
ROOT=$(
    cd "$ROOT"
    pwd
)

CURDIR=${ROOT}
SSB_DATA_DIR=$CURDIR/ssb-data/

# check if ssb-data exists
if [[ ! -d "$SSB_DATA_DIR"/ ]]; then
    echo "$SSB_DATA_DIR does not exist. Run sh gen-ssb-data.sh first."
    exit 1
fi

# Verify that a required command is available before doing any work.
check_prerequest() {
    local CMD=$1
    local NAME=$2
    if ! $CMD; then
        echo "$NAME is missing. This script depends on cURL to load data to Doris."
        exit 1
    fi
}

check_prerequest "curl --version" "curl"

# Load FE_HOST, FE_HTTP_PORT, USER, PASSWORD and DB for the 4 small
# dimension tables.
# shellcheck source=/dev/null
source "$CURDIR/doris-cluster.conf"

echo "FE_HOST: $FE_HOST"
echo "FE_HTTP_PORT: $FE_HTTP_PORT"
echo "USER: $USER"
echo "PASSWORD: $PASSWORD"
echo "DB: $DB"

echo 'Loading data for table: part'
curl --location-trusted -u "$USER":"$PASSWORD" \
    -H "column_separator:|" \
    -H "columns:p_partkey,p_name,p_mfgr,p_category,p_brand,p_color,p_type,p_size,p_container,p_dummy" \
    -T "$SSB_DATA_DIR"/part.tbl http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/part/_stream_load

echo 'Loading data for table: date'
curl --location-trusted -u "$USER":"$PASSWORD" \
    -H "column_separator:|" \
    -H "columns:d_datekey,d_date,d_dayofweek,d_month,d_year,d_yearmonthnum,d_yearmonth,d_daynuminweek,d_daynuminmonth,d_daynuminyear,d_monthnuminyear,d_weeknuminyear,d_sellingseason,d_lastdayinweekfl,d_lastdayinmonthfl,d_holidayfl,d_weekdayfl,d_dummy" \
    -T "$SSB_DATA_DIR"/date.tbl http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/dates/_stream_load

echo 'Loading data for table: supplier'
curl --location-trusted -u "$USER":"$PASSWORD" \
    -H "column_separator:|" \
    -H "columns:s_suppkey,s_name,s_address,s_city,s_nation,s_region,s_phone,s_dummy" \
    -T "$SSB_DATA_DIR"/supplier.tbl http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/supplier/_stream_load

echo 'Loading data for table: customer'
curl --location-trusted -u "$USER":"$PASSWORD" \
    -H "column_separator:|" \
    -H "columns:c_custkey,c_name,c_address,c_city,c_nation,c_region,c_phone,c_mktsegment,no_use" \
    -T "$SSB_DATA_DIR"/customer.tbl http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/customer/_stream_load

View File

@ -1,147 +0,0 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

##############################################################
# This script is used to load generated ssb data set to Doris
# Only for 1 fact table: lineorder.
# The chunk files produced by dbgen (lineorder.tbl.*) are loaded
# concurrently, with concurrency bounded by a token FIFO.
##############################################################

set -eo pipefail

# Resolve the absolute directory containing this script.
ROOT=$(dirname "$0")
ROOT=$(
    cd "$ROOT"
    pwd
)

CURDIR=${ROOT}
SSB_DATA_DIR=$CURDIR/ssb-data/

usage() {
    echo "
Usage: $0 <options>
  Optional options:
    -c       parallelism to load data of lineorder table, default is 3.

  Eg.
    $0              load data using default value.
    $0 -c 10        load lineorder table data using parallelism 10.
  "
    exit 1
}

OPTS=$(getopt \
    -n "$0" \
    -o '' \
    -o 'hc:' \
    -- "$@")

eval set -- "$OPTS"

# Default number of concurrent stream-load jobs.
PARALLEL=3
HELP=0

if [[ $# -eq 0 ]]; then
    usage
fi

while true; do
    case "$1" in
    -h)
        HELP=1
        shift
        ;;
    -c)
        PARALLEL=$2
        shift 2
        ;;
    --)
        shift
        break
        ;;
    *)
        echo "Internal error"
        exit 1
        ;;
    esac
done

if [[ ${HELP} -eq 1 ]]; then
    usage
    exit
fi

echo "Parallelism: $PARALLEL"

# check if ssb-data exists
if [[ ! -d "$SSB_DATA_DIR"/ ]]; then
    echo "$SSB_DATA_DIR does not exist. Run sh gen-ssb-data.sh first."
    exit 1
fi

# Verify that a required command is available before doing any work.
check_prerequest() {
    local CMD=$1
    local NAME=$2
    if ! $CMD; then
        echo "$NAME is missing. This script depends on cURL to load data to Doris."
        exit 1
    fi
}

check_prerequest "curl --version" "curl"

# Load FE_HOST, FE_HTTP_PORT, USER, PASSWORD and DB for lineorder.
# shellcheck source=/dev/null
source "$CURDIR/doris-cluster.conf"

echo "FE_HOST: $FE_HOST"
echo "FE_HTTP_PORT: $FE_HTTP_PORT"
echo "USER: $USER"
echo "PASSWORD: $PASSWORD"
echo "DB: $DB"

# Stream-load one lineorder chunk file via the Doris HTTP API.
function load() {
    echo "$@"
    curl --location-trusted -u "$USER":"$PASSWORD" \
        -H "column_separator:|" \
        -H "columns:lo_orderkey,lo_linenumber,lo_custkey,lo_partkey,lo_suppkey,lo_orderdate,lo_orderpriority,lo_shippriority,lo_quantity,lo_extendedprice,lo_ordtotalprice,lo_discount,lo_revenue,lo_supplycost,lo_tax,lo_commitdate,lo_shipmode,lo_dummy" \
        -T "$@" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/lineorder/_stream_load
}

# Set up a token FIFO on fd 3 to bound concurrency. Use a PID-based
# name so concurrent runs do not collide on the same pipe path.
fifo="/tmp/ssb-fact-fifo.$$"
mkfifo "${fifo}"
exec 3<>"${fifo}"
rm -rf "${fifo}"

# Seed the FIFO with $PARALLEL tokens (one empty line each).
for ((i = 1; i <= PARALLEL; i++)); do
    echo >&3
done

date
# Take a token before each load; each background job returns its
# token when done, so at most $PARALLEL loads run at once.
for file in "$SSB_DATA_DIR"/lineorder.tbl.*; do
    read -r -u3
    {
        load "$file"
        echo >&3
    } &
done

# wait for child thread finished
wait
date
echo "DONE."

View File

@ -24,23 +24,23 @@ follow the steps below:
### 1. build tpc-h dbgen tool.
./build-tpch-dbgen.sh
./bin/build-tpch-dbgen.sh
### 2. generate tpc-h data. use -h for more information.
./gen-tpch-data.sh -s 1
./bin/gen-tpch-data.sh -s 1
### 3. create tpc-h tables. modify `doris-cluster.conf` to specify doris info, then run script below.
### 3. create tpc-h tables. modify `conf/doris-cluster.conf` to specify doris info, then run script below.
./create-tpch-tables.sh
./bin/create-tpch-tables.sh
### 4. load tpc-h data. use -h for help.
./load-tpch-data.sh
./bin/load-tpch-data.sh
### 5. run tpc-h queries.
./run-tpch-queries.sh
./bin/run-tpch-queries.sh
NOTICE: At present, Doris's query optimizer and statistical information functions are not complete, so we rewrite some queries in TPC-H to adapt to Doris' execution framework, but it does not affect the correctness of the results. The rewritten SQL is marked with "Modified" in the corresponding .sql file.

View File

@ -51,13 +51,12 @@ if [[ -d $TPCH_DBGEN_DIR ]]; then
echo "If you want to download TPC-H_Tools_v3.0.0 again, please delete this dir first."
else
wget "https://tools-chengdu.oss-cn-chengdu.aliyuncs.com/TPC-H_Tools_v3.0.0.zip"
unzip TPC-H_Tools_v3.0.0.zip -d $CURDIR/
unzip TPC-H_Tools_v3.0.0.zip -d "$CURDIR"/
fi
# modify tpcd.h
cd $TPCH_DBGEN_DIR/
echo '
cd "$TPCH_DBGEN_DIR"/
printf '%s' '
#ifdef MYSQL
#define GEN_QUERY_PLAN ""
#define START_TRAN "START TRANSACTION"
@ -81,7 +80,11 @@ cd -
# check
if [[ -f $TPCH_DBGEN_DIR/dbgen ]]; then
echo "Build succeed! Run $TPCH_DBGEN_DIR/dbgen -h"
echo "
################
Build succeed!
################
Run $TPCH_DBGEN_DIR/dbgen -h"
exit 0
else
echo "Build failed!"

View File

@ -40,7 +40,7 @@ Usage: $0
}
OPTS=$(getopt \
-n $0 \
-n "$0" \
-o '' \
-- "$@")
@ -84,7 +84,8 @@ check_prerequest() {
check_prerequest "mysql --version" "mysql"
source $CURDIR/doris-cluster.conf
# shellcheck source=/dev/null
source "$CURDIR/../conf/doris-cluster.conf"
export MYSQL_PWD=$PASSWORD
echo "FE_HOST: $FE_HOST"
@ -93,7 +94,7 @@ echo "USER: $USER"
echo "PASSWORD: $PASSWORD"
echo "DB: $DB"
mysql -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT -e "CREATE DATABASE IF NOT EXISTS $DB"
mysql -h"$FE_HOST" -u"$USER" -P"$FE_QUERY_PORT" -e "CREATE DATABASE IF NOT EXISTS $DB"
echo "Run SQLs from $CURDIR/create-tpch-tables.sql"
mysql -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT -D$DB <$CURDIR/create-tpch-tables.sql
mysql -h"$FE_HOST" -u"$USER" -P"$FE_QUERY_PORT" -D"$DB" <"$CURDIR"/../ddl/create-tpch-tables.sql

View File

@ -48,7 +48,7 @@ Usage: $0 <options>
}
OPTS=$(getopt \
-n $0 \
-n "$0" \
-o '' \
-o 'hs:c:' \
-- "$@")
@ -107,40 +107,40 @@ if [[ -d $TPCH_DATA_DIR/ ]]; then
exit 1
fi
mkdir $TPCH_DATA_DIR/
mkdir "$TPCH_DATA_DIR"/
# gen data
cd $TPCH_DBGEN_DIR
cd "$TPCH_DBGEN_DIR"
echo "Begin to generate data for table: region"
$TPCH_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T r
"$TPCH_DBGEN_DIR"/dbgen -f -s "$SCALE_FACTOR" -T r
echo "Begin to generate data for table: nation"
$TPCH_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T n
"$TPCH_DBGEN_DIR"/dbgen -f -s "$SCALE_FACTOR" -T n
echo "Begin to generate data for table: supplier"
$TPCH_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T s
"$TPCH_DBGEN_DIR"/dbgen -f -s "$SCALE_FACTOR" -T s
echo "Begin to generate data for table: part"
$TPCH_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T P
"$TPCH_DBGEN_DIR"/dbgen -f -s "$SCALE_FACTOR" -T P
echo "Begin to generate data for table: customer"
$TPCH_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T c
"$TPCH_DBGEN_DIR"/dbgen -f -s "$SCALE_FACTOR" -T c
echo "Begin to generate data for table: partsupp"
for i in $(seq 1 $PARALLEL); do
for i in $(seq 1 "$PARALLEL"); do
{
$TPCH_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T S -C $PARALLEL -S ${i}
"$TPCH_DBGEN_DIR"/dbgen -f -s "$SCALE_FACTOR" -T S -C "$PARALLEL" -S "$i"
} &
done
wait
echo "Begin to generate data for table: orders"
for i in $(seq 1 $PARALLEL); do
for i in $(seq 1 "$PARALLEL"); do
{
$TPCH_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T O -C $PARALLEL -S ${i}
"$TPCH_DBGEN_DIR"/dbgen -f -s "$SCALE_FACTOR" -T O -C "$PARALLEL" -S "$i"
} &
done
wait
echo "Begin to generate data for table: lineitem"
for i in $(seq 1 $PARALLEL); do
for i in $(seq 1 "$PARALLEL"); do
{
$TPCH_DBGEN_DIR/dbgen -f -s $SCALE_FACTOR -T L -C $PARALLEL -S ${i}
"$TPCH_DBGEN_DIR"/dbgen -f -s "$SCALE_FACTOR" -T L -C "$PARALLEL" -S "$i"
} &
done
wait
@ -148,7 +148,7 @@ wait
cd -
# move data to $TPCH_DATA_DIR
mv $TPCH_DBGEN_DIR/*.tbl* $TPCH_DATA_DIR/
mv "$TPCH_DBGEN_DIR"/*.tbl* "$TPCH_DATA_DIR"/
# check data
du -sh $TPCH_DATA_DIR/*.tbl*
du -sh "$TPCH_DATA_DIR"/*.tbl*

View File

@ -46,7 +46,7 @@ Usage: $0 <options>
}
OPTS=$(getopt \
-n $0 \
-n "$0" \
-o '' \
-o 'hc:' \
-- "$@")
@ -89,7 +89,7 @@ fi
echo "Parallelism: $PARALLEL"
# check if tpch-data exists
if [[ ! -d $TPCH_DATA_DIR/ ]]; then
if [[ ! -d "$TPCH_DATA_DIR"/ ]]; then
echo "$TPCH_DATA_DIR does not exist. Run sh gen-tpch-data.sh first."
exit 1
fi
@ -106,7 +106,9 @@ check_prerequest() {
check_prerequest "curl --version" "curl"
# load tables
source $CURDIR/doris-cluster.conf
# shellcheck source=/dev/null
source "$CURDIR/../conf/doris-cluster.conf"
export MYSQL_PWD=$PASSWORD
echo "FE_HOST: $FE_HOST"
echo "FE_HTTP_PORT: $FE_HTTP_PORT"
@ -115,61 +117,62 @@ echo "PASSWORD: $PASSWORD"
echo "DB: $DB"
function load_region() {
echo $@
curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" \
echo "$*"
curl --location-trusted -u "$USER":"$PASSWORD" -H "column_separator:|" \
-H "columns: r_regionkey, r_name, r_comment, temp" \
-T $@ http://$FE_HOST:$FE_HTTP_PORT/api/$DB/region/_stream_load
-T "$*" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/region/_stream_load
}
function load_nation() {
echo $@
curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" \
echo "$*"
curl --location-trusted -u "$USER":"$PASSWORD" -H "column_separator:|" \
-H "columns: n_nationkey, n_name, n_regionkey, n_comment, temp" \
-T $@ http://$FE_HOST:$FE_HTTP_PORT/api/$DB/nation/_stream_load
-T "$*" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/nation/_stream_load
}
function load_supplier() {
echo $@
curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" \
echo "$*"
curl --location-trusted -u "$USER":"$PASSWORD" -H "column_separator:|" \
-H "columns: s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment, temp" \
-T $@ http://$FE_HOST:$FE_HTTP_PORT/api/$DB/supplier/_stream_load
-T "$*" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/supplier/_stream_load
}
function load_customer() {
echo $@
curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" \
echo "$*"
curl --location-trusted -u "$USER":"$PASSWORD" -H "column_separator:|" \
-H "columns: c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment, temp" \
-T $@ http://$FE_HOST:$FE_HTTP_PORT/api/$DB/customer/_stream_load
-T "$*" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/customer/_stream_load
}
function load_part() {
echo $@
curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" \
echo "$*"
curl --location-trusted -u "$USER":"$PASSWORD" -H "column_separator:|" \
-H "columns: p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment, temp" \
-T $@ http://$FE_HOST:$FE_HTTP_PORT/api/$DB/part/_stream_load
-T "$*" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/part/_stream_load
}
function load_partsupp() {
echo $@
curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" \
echo "$*"
curl --location-trusted -u "$USER":"$PASSWORD" -H "column_separator:|" \
-H "columns: ps_partkey, ps_suppkey, ps_availqty, ps_supplycost, ps_comment, temp" \
-T $@ http://$FE_HOST:$FE_HTTP_PORT/api/$DB/partsupp/_stream_load
-T "$*" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/partsupp/_stream_load
}
function load_orders() {
echo $@
curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" \
echo "$*"
curl --location-trusted -u "$USER":"$PASSWORD" -H "column_separator:|" \
-H "columns: o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment, temp" \
-T $@ http://$FE_HOST:$FE_HTTP_PORT/api/$DB/orders/_stream_load
-T "$*" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/orders/_stream_load
}
function load_lineitem() {
echo $@
curl --location-trusted -u $USER:$PASSWORD -H "column_separator:|" \
echo "$*"
curl --location-trusted -u "$USER":"$PASSWORD" -H "column_separator:|" \
-H "columns: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag,l_linestatus, l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment,temp" \
-T $@ http://$FE_HOST:$FE_HTTP_PORT/api/$DB/lineitem/_stream_load
-T "$*" http://"$FE_HOST":"$FE_HTTP_PORT"/api/"$DB"/lineitem/_stream_load
}
# start load
load_region $TPCH_DATA_DIR/region.tbl
load_nation $TPCH_DATA_DIR/nation.tbl
load_supplier $TPCH_DATA_DIR/supplier.tbl
load_customer $TPCH_DATA_DIR/customer.tbl
load_part $TPCH_DATA_DIR/part.tbl
date
load_region "$TPCH_DATA_DIR"/region.tbl
load_nation "$TPCH_DATA_DIR"/nation.tbl
load_supplier "$TPCH_DATA_DIR"/supplier.tbl
load_customer "$TPCH_DATA_DIR"/customer.tbl
load_part "$TPCH_DATA_DIR"/part.tbl
date
# set parallelism
# Name the FIFO after the PID so that creating the named pipe does not collide with an existing file and fail
@ -182,20 +185,21 @@ exec 3<>${fifo}
rm -rf ${fifo}
# Put $PARALLEL empty lines into fd3 as tokens
for ((i = 1; i <= $PARALLEL; i++)); do
for ((i = 1; i <= PARALLEL; i++)); do
echo >&3
done
for file in $(ls $TPCH_DATA_DIR/lineitem.tbl*); do
date
for file in "$TPCH_DATA_DIR"/lineitem.tbl*; do
# Take a token: read one line from fd3, one line per read.
# A pipe loses each line as it is read, and only one line can be read at a time.
# When all lines are consumed, execution blocks until the pipe has a readable line again,
# which is how the number of concurrent processes is limited.
read -u3
read -r -u3
# The commands to run as a batch go inside the braces and run in the background
{
load_lineitem $file
load_lineitem "$file"
echo "----loaded $file"
sleep 2
# Return the token: when the process finishes, write a line back so a blocked iteration can continue
@ -203,20 +207,22 @@ for file in $(ls $TPCH_DATA_DIR/lineitem.tbl*); do
} &
done
for file in $(ls $TPCH_DATA_DIR/orders.tbl*); do
read -u3
date
for file in "$TPCH_DATA_DIR"/orders.tbl*; do
read -r -u3
{
load_orders $file
load_orders "$file"
echo "----loaded $file"
sleep 2
echo >&3
} &
done
for file in $(ls $TPCH_DATA_DIR/partsupp.tbl*); do
read -u3
date
for file in "$TPCH_DATA_DIR"/partsupp.tbl*; do
read -r -u3
{
load_partsupp $file
load_partsupp "$file"
echo "----loaded $file"
sleep 2
echo >&3
@ -227,3 +233,6 @@ done
wait
# Close the file descriptor
exec 3>&-
date
echo "DONE."

View File

@ -29,7 +29,7 @@ ROOT=$(
)
CURDIR=${ROOT}
QUERIES_DIR=$CURDIR/queries
QUERIES_DIR=$CURDIR/../queries
usage() {
echo "
@ -41,7 +41,7 @@ Usage: $0
}
OPTS=$(getopt \
-n $0 \
-n "$0" \
-o '' \
-- "$@")
@ -85,7 +85,9 @@ check_prerequest() {
check_prerequest "mysql --version" "mysql"
source $CURDIR/doris-cluster.conf
# shellcheck source=/dev/null
source "$CURDIR/../conf/doris-cluster.conf"
export MYSQL_PWD=$PASSWORD
echo "FE_HOST: $FE_HOST"
echo "FE_QUERY_PORT: $FE_QUERY_PORT"
@ -95,10 +97,16 @@ echo "DB: $DB"
echo "Time Unit: ms"
pre_set() {
echo $@
mysql -h$FE_HOST -u$USER --password=$PASSWORD -P$FE_QUERY_PORT -D$DB -e "$@"
echo "$*"
mysql -h"$FE_HOST" -u"$USER" -P"$FE_QUERY_PORT" -D"$DB" -e "$*"
}
echo '============================================'
pre_set "show variables;"
echo '============================================'
pre_set "show table status;"
echo '============================================'
sum=0
for i in $(seq 1 22); do
total=0
@ -106,12 +114,12 @@ for i in $(seq 1 22); do
# Each query is executed ${run} times and takes the average time
for j in $(seq 1 ${run}); do
start=$(date +%s%3N)
mysql -h$FE_HOST -u $USER --password=$PASSWORD -P$FE_QUERY_PORT -D$DB --comments <$QUERIES_DIR/q$i.sql >/dev/null
mysql -h"$FE_HOST" -u "$USER" -P"$FE_QUERY_PORT" -D"$DB" --comments <"$QUERIES_DIR"/q"$i".sql >/dev/null
end=$(date +%s%3N)
total=$((total + end - start))
done
cost=$((total / ${run}))
cost=$((total / run))
echo "q$i: ${cost}"
sum=$((sum + $cost))
sum=$((sum + cost))
done
echo "Total cost: $sum"

View File

@ -26,4 +26,4 @@ export USER='root'
# Doris password
export PASSWORD=''
# The database where TPC-H tables located
export DB='tpch1'
export DB='tpch'