diff --git a/tools/tpcds-tools/README.md b/tools/tpcds-tools/README.md new file mode 100644 index 0000000000..ed179e1c12 --- /dev/null +++ b/tools/tpcds-tools/README.md @@ -0,0 +1,47 @@ + + +## Usage + +These scripts are used to make tpc-ds test. +follow the steps below: + +### 1. build tpc-ds dsdgen dsqgen tool. + + ./bin/build-tpcds-tools.sh + +### 2. generate tpc-ds data. use -h for more infomations. + + ./bin/gen-tpcds-data.sh -s 1 + +### 3. generate tpc-ds queries. use -h for more infomations. + + ./bin/gen-tpcds-queries.sh -s 1 + +### 4. create tpc-ds tables. modify `conf/doris-cluster.conf` to specify doris info, then run script below. + + ./bin/create-tpcds-tables.sh + +### 5. load tpc-ds data. use -h for help. + + ./bin/load-tpcds-data.sh + +### 6. run tpc-ds queries. + + ./bin/run-tpcds-queries.sh diff --git a/tools/tpcds-tools/bin/build-tpcds-tools.sh b/tools/tpcds-tools/bin/build-tpcds-tools.sh new file mode 100755 index 0000000000..fd6a486f4c --- /dev/null +++ b/tools/tpcds-tools/bin/build-tpcds-tools.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to build tpcds-dsdgen +# TPC-DS_Tools_v3.2.0.zip is from https://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp +# Usage: +# sh build-tpcds-dsdgen.sh +############################################################## + +set -eo pipefail + +ROOT=$(dirname "$0") +ROOT=$( + cd "${ROOT}" + pwd +) + +CURDIR="${ROOT}" +TPCDS_DBGEN_DIR="${CURDIR}/DSGen-software-code-3.2.0rc1/tools" + +check_prerequest() { + local CMD=$1 + local NAME=$2 + if ! ${CMD} >/dev/null; then + echo "${NAME} is missing. This script depends on unzip to extract files from TPC-DS_Tools_v3.2.0.zip" + exit 1 + fi +} + +check_prerequest "unzip -h" "unzip" + +# download tpcds tools package first +if [[ -d "${CURDIR}/DSGen-software-code-3.2.0rc1" ]]; then + echo "If you want to rebuild TPC-DS_Tools_v3.2.0 again, please delete ${CURDIR}/DSGen-software-code-3.2.0rc1 first." && exit 1 +elif [[ -f "${CURDIR}/TPC-DS_Tools_v3.2.0.zip" ]]; then + unzip TPC-DS_Tools_v3.2.0.zip -d "${CURDIR}/" +else + wget "https://doris-build-1308700295.cos.ap-beijing.myqcloud.com/tools/TPC-DS_Tools_v3.2.0.zip" + unzip TPC-DS_Tools_v3.2.0.zip -d "${CURDIR}/" +fi + +# compile tpcds-dsdgen +cd "${TPCDS_DBGEN_DIR}/" +make >/dev/null +cd - + +# check +if [[ -f ${TPCDS_DBGEN_DIR}/dsdgen ]]; then + echo " +################ +Build succeed! +################ +Run ${TPCDS_DBGEN_DIR}/dsdgen -h" + exit 0 +else + echo "Build failed!" + exit 1 +fi diff --git a/tools/tpcds-tools/bin/create-tpcds-tables.sh b/tools/tpcds-tools/bin/create-tpcds-tables.sh new file mode 100755 index 0000000000..ede65b039b --- /dev/null +++ b/tools/tpcds-tools/bin/create-tpcds-tables.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to create TPC-DS tables +############################################################## + +set -eo pipefail + +ROOT=$(dirname "$0") +ROOT=$( + cd "${ROOT}" + pwd +) + +CURDIR=${ROOT} + +usage() { + echo " +This script is used to create TPC-DS tables, +will use mysql client to connect Doris server which is specified in doris-cluster.conf file. +Usage: $0 + " + exit 1 +} + +OPTS=$(getopt \ + -n "$0" \ + -o '' \ + -- "$@") + +eval set -- "${OPTS}" +HELP=0 + +if [[ $# == 0 ]]; then + usage +fi + +while true; do + case "$1" in + -h) + HELP=1 + shift + ;; + --) + shift + break + ;; + *) + echo "Internal error" + exit 1 + ;; + esac +done + +if [[ ${HELP} -eq 1 ]]; then + usage +fi + +check_prerequest() { + local CMD=$1 + local NAME=$2 + if ! ${CMD}; then + echo "${NAME} is missing. This script depends on mysql to create tables in Doris." + exit 1 + fi +} + +check_prerequest "mysql --version" "mysql" + +source "${CURDIR}/../conf/doris-cluster.conf" +export MYSQL_PWD=${PASSWORD} + +echo "FE_HOST: ${FE_HOST}" +echo "FE_QUERY_PORT: ${FE_QUERY_PORT}" +echo "USER: ${USER}" +echo "DB: ${DB}" + +mysql -h"${FE_HOST}" -u"${USER}" -P"${FE_QUERY_PORT}" -e "DROP DATABASE IF EXISTS ${DB}" +mysql -h"${FE_HOST}" -u"${USER}" -P"${FE_QUERY_PORT}" -e "CREATE DATABASE ${DB}" + +echo "Run SQLs from ${CURDIR}/../ddl/create-tpcds-tables.sql" +mysql -h"${FE_HOST}" -u"${USER}" -P"${FE_QUERY_PORT}" -D"${DB}" <"${CURDIR}"/../ddl/create-tpcds-tables.sql + +echo "tpcds tables has been created" diff --git a/tools/tpcds-tools/bin/gen-tpcds-data.sh b/tools/tpcds-tools/bin/gen-tpcds-data.sh new file mode 100755 index 0000000000..e0dcea64d9 --- /dev/null +++ b/tools/tpcds-tools/bin/gen-tpcds-data.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to generate TPC-DS data set +############################################################## + +set -eo pipefail + +ROOT=$(dirname "$0") +ROOT=$( + cd "${ROOT}" + pwd +) + +CURDIR="${ROOT}" +TPCDS_DBGEN_DIR="${CURDIR}/DSGen-software-code-3.2.0rc1/tools" + +usage() { + echo " +Usage: $0 + Optional options: + -s scale factor, default is 1 + -c parallelism to generate data of (lineitem, orders, partsupp) table, default is 10 + + Eg. + $0 generate data using default value. + $0 -s 100 generate data with scale factor 100. + $0 -s 1000 -c 100 generate data with scale factor 1000. And using 1000 threads to generate data concurrently. + " + exit 1 +} + +OPTS=$(getopt \ + -n "$0" \ + -o '' \ + -o 'hs:c:' \ + -- "$@") + +eval set -- "${OPTS}" + +SCALE_FACTOR=1 +PARALLEL=10 +HELP=0 + +if [[ $# == 0 ]]; then + usage +fi + +while true; do + case "$1" in + -h) + HELP=1 + shift + ;; + -s) + SCALE_FACTOR=$2 + shift 2 + ;; + -c) + PARALLEL=$2 + shift 2 + ;; + --) + shift + break + ;; + *) + echo "Internal error" + exit 1 + ;; + esac +done + +if [[ ${HELP} -eq 1 ]]; then + usage +fi + +TPCDS_DATA_DIR="${CURDIR}/tpcds-data" +echo "Scale Factor: ${SCALE_FACTOR}" +echo "Parallelism: ${PARALLEL}" + +# check if dsdgen exists +if [[ ! -f ${TPCDS_DBGEN_DIR}/dsdgen ]]; then + echo "${TPCDS_DBGEN_DIR}/dsdgen does not exist. Run build-tpcds-dsdgen.sh first to build it first." + exit 1 +fi + +if [[ -d ${TPCDS_DATA_DIR}/ ]]; then + echo "${TPCDS_DATA_DIR} exists. Remove it before generating data" + exit 1 +fi + +mkdir "${TPCDS_DATA_DIR}"/ + +# gen data +echo "Begin to generate data..." +date +cd "${TPCDS_DBGEN_DIR}" +if [[ ${PARALLEL} -eq 1 ]] && "${TPCDS_DBGEN_DIR}"/dsdgen -SCALE "${SCALE_FACTOR}" -TERMINATE N -DIR "${TPCDS_DATA_DIR}"; then + echo "data genarated." +elif [[ ${PARALLEL} -gt 1 ]] && "${TPCDS_DBGEN_DIR}"/dsdgen -SCALE "${SCALE_FACTOR}" -PARALLEL "${PARALLEL}" -TERMINATE N -DIR "${TPCDS_DATA_DIR}"; then + echo "data genarated." +else + echo "ERROR occured." && exit 1 +fi +cd - +date +# check data +du -sh "${TPCDS_DATA_DIR}"/*.dat* diff --git a/tools/tpcds-tools/bin/gen-tpcds-queries.sh b/tools/tpcds-tools/bin/gen-tpcds-queries.sh new file mode 100755 index 0000000000..61b83563bb --- /dev/null +++ b/tools/tpcds-tools/bin/gen-tpcds-queries.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to generate TPC-DS tables +############################################################## + +set -eo pipefail + +ROOT=$(dirname "$0") +ROOT=$( + cd "${ROOT}" + pwd +) + +CURDIR="${ROOT}" +TPCDS_DSQGEN_DIR="${CURDIR}/DSGen-software-code-3.2.0rc1/tools" +TPCDS_QUERIE_DIR="${CURDIR}/../queries" + +usage() { + echo " +This script is used to generate TPC-DS 99 queries of different scale, +Usage: $0 + Optional options: + -s scale factor, default is 1 + Eg. + $0 -s 100 generate tpcds queries with scale factor 100. + " + exit 1 +} + +OPTS=$(getopt \ + -n "$0" \ + -o '' \ + -o 'hs:' \ + -- "$@") + +eval set -- "${OPTS}" +HELP=0 +SCALE=1 + +if [[ $# == 0 ]]; then + usage +fi + +while true; do + case "$1" in + -h) + HELP=1 + shift + ;; + -s) + SCALE=$2 + shift 2 + ;; + --) + shift + break + ;; + *) + echo "Internal error" + exit 1 + ;; + esac +done + +if [[ ${HELP} -eq 1 ]]; then + usage +fi + +# check if dsqgen exists +if [[ ! -f ${TPCDS_DSQGEN_DIR}/dsqgen ]]; then + echo "${TPCDS_DSQGEN_DIR}/dsqgen does not exist. Run build-tpcds-dbgen.sh to build it first." + exit 1 +fi + +if [[ -d ${TPCDS_QUERIE_DIR}/ ]]; then + echo "${TPCDS_QUERIE_DIR} exists. Remove it before re-generating" + exit 1 +else + mkdir "${TPCDS_QUERIE_DIR}" +fi + +cd "${TPCDS_DSQGEN_DIR}" +for i in {1..99}; do + sed -i '/define _END/d' "../query_templates/query${i}.tpl" + echo 'define _END = "";' >>"../query_templates/query${i}.tpl" +done + +# generate query +"${TPCDS_DSQGEN_DIR}"/dsqgen \ + -DIRECTORY ../query_templates/ \ + -INPUT ../query_templates/templates.lst \ + -DIALECT netezza \ + -QUALIFY Y \ + -SCALE "${SCALE}" + +if [[ -f query_0.sql ]]; then + i=1 + IFS=';' + query_strs=$(cat query_0.sql) + of="${TPCDS_QUERIE_DIR}/tpcds_queries.sql" + for q in ${query_strs}; do + if [[ ${i} -eq 2 ]]; then + echo "${q/from catalog_sales))/from catalog_sales) t)};" >>"${of}" + elif [[ ${i} -eq 5 ]]; then + q="${q// 14 days/ interval 14 day}" && + q="${q//\'store\' || s_store_id as id/concat(\'store\', s_store_id) id}" && + q="${q//\'catalog_page\' || cp_catalog_page_id as id/concat(\'catalog_page\', cp_catalog_page_id) id}" && + q="${q//\'web_site\' || web_site_id as id/concat(\'web_site\', web_site_id) id}" && + echo "${q};" >>"${of}" + elif [[ ${i} -eq 12 ]] || [[ ${i} -eq 21 ]] || [[ ${i} -eq 22 ]] || [[ ${i} -eq 44 ]] || [[ ${i} -eq 81 ]] || [[ ${i} -eq 102 ]]; then + q="${q//30 days/interval 30 day}" && echo "${q};" >>"${of}" + elif [[ ${i} -eq 14 ]]; then + # q="${q//and d3.d_year between 1999 AND 1999 + 2)/and d3.d_year between 1999 AND 1999 + 2) t}" && echo "${q};" >>"${of}" + q="${q//where i_brand_id = brand_id/t where i_brand_id = brand_id}" && echo "${q};" >>"${of}" + elif [[ ${i} -eq 17 ]] || [[ ${i} -eq 40 ]] || [[ ${i} -eq 86 ]] || [[ ${i} -eq 98 ]] || [[ ${i} -eq 99 ]]; then + q="${q//60 days)/interval 60 day)}" && echo "${q};" >>"${of}" + elif [[ ${i} -eq 24 ]]; then + q="${q//c_customer_sk))/c_customer_sk) t)}" && + q="${q//best_ss_customer))/best_ss_customer)) t2}" && + echo "${q};" >>"${of}" + elif [[ ${i} -eq 25 ]]; then + q="${q//c_customer_sk))/c_customer_sk) t)}" && + q="${q//c_first_name)/c_first_name) t2}" && + echo "${q};" >>"${of}" + elif [[ ${i} -eq 35 ]] || [[ ${i} -eq 96 ]]; then + q="${q//90 days)/interval 90 day)}" && echo "${q};" >>"${of}" + elif [[ ${i} -eq 53 ]]; then + q="${q//order by 1/ t order by 1}" && echo "${q};" >>"${of}" + elif [[ ${i} -eq 70 ]]; then + v=$(echo "${q}" | grep "||" | grep -o "'.*'" | sed -n '1p') + smc1=$(echo "${v//\'/}" | awk -F"|" '{print $1}') + smc2=$(echo "${v//\'/}" | awk -F"|" '{print $5}') + origin="'${smc1% }' || ',' || '${smc2# }'" + q="${q//${origin}/concat(concat(\'${smc1}\', \',\'), \'${smc2}\')}" && echo "${q};" >>"${of}" + elif [[ ${i} -eq 84 ]]; then + q="${q//30 days/interval 30 day}" && + q="${q//\'store\' || store_id/concat(\'store\', store_id)}" && + q="${q//\'catalog_page\' || catalog_page_id/concat(\'catalog_page\', catalog_page_id)}" && + q="${q//\'web_site\' || web_site_id/concat(\'web_site\', web_site_id)}" && + echo "${q};" >>"${of}" + elif [[ ${i} -eq 88 ]]; then + q="${q//coalesce(c_last_name,\'\') || \', \' || coalesce(c_first_name,\'\')/concat(concat(coalesce(c_last_name,\'\'), \',\'), coalesce(c_first_name,\'\'))}" && + echo "${q};" >>"${of}" + else + echo "${q};" >>"${of}" + fi + i=$((i + 1)) + done + echo -e "\033[32m +tpcds queries generated in: + ${TPCDS_QUERIE_DIR}/tpcds_queries.sql +\033[0m" +else + echo -e "\033[31m ERROR: tpcds queries generate failed \033[0m" && exit 1 +fi +cd - >/dev/null diff --git a/tools/tpcds-tools/bin/load-tpcds-data.sh b/tools/tpcds-tools/bin/load-tpcds-data.sh new file mode 100755 index 0000000000..a127796705 --- /dev/null +++ b/tools/tpcds-tools/bin/load-tpcds-data.sh @@ -0,0 +1,199 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to load generated TPC-DS data set into Doris. +############################################################## + +set -eo pipefail + +ROOT=$(dirname "$0") +ROOT=$( + cd "${ROOT}" + pwd +) + +CURDIR="${ROOT}" +TPCDS_DATA_DIR="${CURDIR}/tpcds-data" + +usage() { + echo " +Usage: $0 + Optional options: + -c parallelism to load data of lineitem, orders, partsupp, default is 5. + + Eg. + $0 load data using default value. + $0 -c 10 load lineitem, orders, partsupp table data using parallelism 10. + " + exit 1 +} + +OPTS=$(getopt \ + -n "$0" \ + -o '' \ + -o 'hc:' \ + -- "$@") + +eval set -- "${OPTS}" + +PARALLEL=5 +HELP=0 + +if [[ $# == 0 ]]; then + usage +fi + +while true; do + case "$1" in + -h) + HELP=1 + shift + ;; + -c) + PARALLEL=$2 + shift 2 + ;; + --) + shift + break + ;; + *) + echo "Internal error" + exit 1 + ;; + esac +done + +if [[ ${HELP} -eq 1 ]]; then + usage +fi + +# check if tpcds-data exists +if [[ ! -d "${TPCDS_DATA_DIR}"/ ]]; then + echo "${TPCDS_DATA_DIR} does not exist. Run sh gen-tpcds-data.sh first." + exit 1 +fi + +check_prerequest() { + local CMD=$1 + local NAME=$2 + if ! ${CMD} &>/dev/null; then + echo "${NAME} is missing. This script depends on cURL to load data to Doris." + exit 1 + fi +} + +check_prerequest "curl --version" "curl" +check_prerequest "jq --version" "jq" + +# load tables +source "${CURDIR}/../conf/doris-cluster.conf" +export MYSQL_PWD=${PASSWORD} + +echo "Parallelism: ${PARALLEL}" +echo "FE_HOST: ${FE_HOST}" +echo "FE_HTTP_PORT: ${FE_HTTP_PORT}" +echo "USER: ${USER}" +echo "DB: ${DB}" + +declare -A table_columns=( + ['call_center']='cc_call_center_sk, cc_call_center_id, cc_rec_start_date, cc_rec_end_date, cc_closed_date_sk, cc_open_date_sk, cc_name, cc_class, cc_employees, cc_sq_ft, cc_hours, cc_manager, cc_mkt_id, cc_mkt_class, cc_mkt_desc, cc_market_manager, cc_division, cc_division_name, cc_company, cc_company_name, cc_street_number, cc_street_name, cc_street_type, cc_suite_number, cc_city, cc_county, cc_state, cc_zip, cc_country, cc_gmt_offset, cc_tax_percentage' + ['catalog_page']='cp_catalog_page_sk, cp_catalog_page_id, cp_start_date_sk, cp_end_date_sk, cp_department, cp_catalog_number, cp_catalog_page_number, cp_description, cp_type' + ['catalog_returns']='cr_returned_date_sk, cr_returned_time_sk, cr_item_sk, cr_refunded_customer_sk, cr_refunded_cdemo_sk, cr_refunded_hdemo_sk, cr_refunded_addr_sk, cr_returning_customer_sk, cr_returning_cdemo_sk, cr_returning_hdemo_sk, cr_returning_addr_sk, cr_call_center_sk, cr_catalog_page_sk, cr_ship_mode_sk, cr_warehouse_sk, cr_reason_sk, cr_order_number, cr_return_quantity, cr_return_amount, cr_return_tax, cr_return_amt_inc_tax, cr_fee, cr_return_ship_cost, cr_refunded_cash, cr_reversed_charge, cr_store_credit, cr_net_loss' + ['catalog_sales']='cs_sold_date_sk, cs_sold_time_sk, cs_ship_date_sk, cs_bill_customer_sk, cs_bill_cdemo_sk, cs_bill_hdemo_sk, cs_bill_addr_sk, cs_ship_customer_sk, cs_ship_cdemo_sk, cs_ship_hdemo_sk, cs_ship_addr_sk, cs_call_center_sk, cs_catalog_page_sk, cs_ship_mode_sk, cs_warehouse_sk, cs_item_sk, cs_promo_sk, cs_order_number, cs_quantity, cs_wholesale_cost, cs_list_price, cs_sales_price, cs_ext_discount_amt, cs_ext_sales_price, cs_ext_wholesale_cost, cs_ext_list_price, cs_ext_tax, cs_coupon_amt, cs_ext_ship_cost, cs_net_paid, cs_net_paid_inc_tax, cs_net_paid_inc_ship, cs_net_paid_inc_ship_tax, cs_net_profit' + ['customer_address']='ca_address_sk, ca_address_id, ca_street_number, ca_street_name, ca_street_type, ca_suite_number, ca_city, ca_county, ca_state, ca_zip, ca_country, ca_gmt_offset, ca_location_type' + ['customer_demographics']='cd_demo_sk, cd_gender, cd_marital_status, cd_education_status, cd_purchase_estimate, cd_credit_rating, cd_dep_count, cd_dep_employed_count, cd_dep_college_count' + ['customer']='c_customer_sk, c_customer_id, c_current_cdemo_sk, c_current_hdemo_sk, c_current_addr_sk, c_first_shipto_date_sk, c_first_sales_date_sk, c_salutation, c_first_name, c_last_name, c_preferred_cust_flag, c_birth_day, c_birth_month, c_birth_year, c_birth_country, c_login, c_email_address, c_last_review_date_sk' + ['date_dim']='d_date_sk, d_date_id, d_date, d_month_seq, d_week_seq, d_quarter_seq, d_year, d_dow, d_moy, d_dom, d_qoy, d_fy_year, d_fy_quarter_seq, d_fy_week_seq, d_day_name, d_quarter_name, d_holiday, d_weekend, d_following_holiday, d_first_dom, d_last_dom, d_same_day_ly, d_same_day_lq, d_current_day, d_current_week, d_current_month, d_current_quarter, d_current_year' + ['dbgen_version']='dv_version, dv_create_date, dv_create_time, dv_cmdline_args' + ['household_demographics']='hd_demo_sk, hd_income_band_sk, hd_buy_potential, hd_dep_count, hd_vehicle_count' + ['income_band']='ib_income_band_sk, ib_lower_bound, ib_upper_bound' + ['inventory']='inv_date_sk, inv_item_sk, inv_warehouse_sk, inv_quantity_on_hand' + ['item']='i_item_sk, i_item_id, i_rec_start_date, i_rec_end_date, i_item_desc, i_current_price, i_wholesale_cost, i_brand_id, i_brand, i_class_id, i_class, i_category_id, i_category, i_manufact_id, i_manufact, i_size, i_formulation, i_color, i_units, i_container, i_manager_id, i_product_name' + ['promotion']='p_promo_sk, p_promo_id, p_start_date_sk, p_end_date_sk, p_item_sk, p_cost, p_response_targe, p_promo_name, p_channel_dmail, p_channel_email, p_channel_catalog, p_channel_tv, p_channel_radio, p_channel_press, p_channel_event, p_channel_demo, p_channel_details, p_purpose, p_discount_active' + ['reason']='r_reason_sk, r_reason_id, r_reason_desc' + ['ship_mode']='sm_ship_mode_sk, sm_ship_mode_id, sm_type, sm_code, sm_carrier, sm_contract' + ['store']='s_store_sk, s_store_id, s_rec_start_date, s_rec_end_date, s_closed_date_sk, s_store_name, s_number_employees, s_floor_space, s_hours, s_manager, s_market_id, s_geography_class, s_market_desc, s_market_manager, s_division_id, s_division_name, s_company_id, s_company_name, s_street_number, s_street_name, s_street_type, s_suite_number, s_city, s_county, s_state, s_zip, s_country, s_gmt_offset, s_tax_precentage' + ['store_returns']='sr_returned_date_sk, sr_return_time_sk, sr_item_sk, sr_customer_sk, sr_cdemo_sk, sr_hdemo_sk, sr_addr_sk, sr_store_sk, sr_reason_sk, sr_ticket_number, sr_return_quantity, sr_return_amt, sr_return_tax, sr_return_amt_inc_tax, sr_fee, sr_return_ship_cost, sr_refunded_cash, sr_reversed_charge, sr_store_credit, sr_net_loss' + ['store_sales']='ss_sold_date_sk, ss_sold_time_sk, ss_item_sk, ss_customer_sk, ss_cdemo_sk, ss_hdemo_sk, ss_addr_sk, ss_store_sk, ss_promo_sk, ss_ticket_number, ss_quantity, ss_wholesale_cost, ss_list_price, ss_sales_price, ss_ext_discount_amt, ss_ext_sales_price, ss_ext_wholesale_cost, ss_ext_list_price, ss_ext_tax, ss_coupon_amt, ss_net_paid, ss_net_paid_inc_tax, ss_net_profit' + ['time_dim']='t_time_sk, t_time_id, t_time, t_hour, t_minute, t_second, t_am_pm, t_shift, t_sub_shift, t_meal_time' + ['warehouse']='w_warehouse_sk, w_warehouse_id, w_warehouse_name, w_warehouse_sq_ft, w_street_number, w_street_name, w_street_type, w_suite_number, w_city, w_county, w_state, w_zip, w_country, w_gmt_offset' + ['web_page']='wp_web_page_sk, wp_web_page_id, wp_rec_start_date, wp_rec_end_date, wp_creation_date_sk, wp_access_date_sk, wp_autogen_flag, wp_customer_sk, wp_url, wp_type, wp_char_count, wp_link_count, wp_image_count, wp_max_ad_count' + ['web_returns']='wr_returned_date_sk, wr_returned_time_sk, wr_item_sk, wr_refunded_customer_sk, wr_refunded_cdemo_sk, wr_refunded_hdemo_sk, wr_refunded_addr_sk, wr_returning_customer_sk, wr_returning_cdemo_sk, wr_returning_hdemo_sk, wr_returning_addr_sk, wr_web_page_sk, wr_reason_sk, wr_order_number, wr_return_quantity, wr_return_amt, wr_return_tax, wr_return_amt_inc_tax, wr_fee, wr_return_ship_cost, wr_refunded_cash, wr_reversed_charge, wr_account_credit, wr_net_loss' + ['web_sales']='ws_sold_date_sk, ws_sold_time_sk, ws_ship_date_sk, ws_item_sk, ws_bill_customer_sk, ws_bill_cdemo_sk, ws_bill_hdemo_sk, ws_bill_addr_sk, ws_ship_customer_sk, ws_ship_cdemo_sk, ws_ship_hdemo_sk, ws_ship_addr_sk, ws_web_page_sk, ws_web_site_sk, ws_ship_mode_sk, ws_warehouse_sk, ws_promo_sk, ws_order_number, ws_quantity, ws_wholesale_cost, ws_list_price, ws_sales_price, ws_ext_discount_amt, ws_ext_sales_price, ws_ext_wholesale_cost, ws_ext_list_price, ws_ext_tax, ws_coupon_amt, ws_ext_ship_cost, ws_net_paid, ws_net_paid_inc_tax, ws_net_paid_inc_ship, ws_net_paid_inc_ship_tax, ws_net_profit' + ['web_site']='web_site_sk, web_site_id, web_rec_start_date, web_rec_end_date, web_name, web_open_date_sk, web_close_date_sk, web_class, web_manager, web_mkt_id, web_mkt_class, web_mkt_desc, web_market_manager, web_company_id, web_company_name, web_street_number, web_street_name, web_street_type, web_suite_number, web_city, web_county, web_state, web_zip, web_country, web_gmt_offset, web_tax_percentage' +) + +# set parallelism + +# 以PID为名, 防止创建命名管道时与已有文件重名,从而失败 +fifo="/tmp/$$.fifo" +# 创建命名管道 +mkfifo "${fifo}" +# 以读写方式打开命名管道,文件标识符fd为3,fd可取除0,1,2,5外0-9中的任意数字 +exec 3<>"${fifo}" +# 删除文件, 也可不删除, 不影响后面操作 +rm -rf "${fifo}" + +# 在fd3中放置$PARALLEL个空行作为令牌 +for ((i = 1; i <= PARALLEL; i++)); do + echo >&3 +done + +# start load +start_time=$(date +%s) +echo "Start time: $(date)" +for table_name in ${!table_columns[*]}; do + # 领取令牌, 即从fd3中读取行, 每次一行 + # 对管道,读一行便少一行,每次只能读取一行 + # 所有行读取完毕, 执行挂起, 直到管道再次有可读行 + # 因此实现了进程数量控制 + read -r -u3 + + # 要批量执行的命令放在大括号内, 后台运行 + { + for file in "${TPCDS_DATA_DIR}/${table_name}"*.dat; do + ret=$(curl \ + --location-trusted \ + -u "${USER}":"${PASSWORD:=}" \ + -H "column_separator:|" \ + -H "columns: ${table_columns[${table_name}]}" \ + -T "${file}" \ + http://"${FE_HOST}":"${FE_HTTP_PORT:=8030}"/api/"${DB}"/"${table_name}"/_stream_load 2>/dev/null) + if [[ $(echo "${ret}" | jq ".Status") == '"Success"' ]]; then + echo "----loaded ${file}" + else + echo -e "\033[31m----load ${file} FAIL...\033[0m" + fi + done + sleep 2 + # 归还令牌, 即进程结束后,再写入一行,使挂起的循环继续执行 + echo >&3 + } & +done + +# 等待所有的后台子进程结束 +wait +# 删除文件标识符 +exec 3>&- + +end_time=$(date +%s) +echo "End time: $(date)" + +echo "Finish load tpcds data, Time taken: $((end_time - start_time)) seconds" diff --git a/tools/tpcds-tools/bin/run-tpcds-queries.sh b/tools/tpcds-tools/bin/run-tpcds-queries.sh new file mode 100755 index 0000000000..46cc77db86 --- /dev/null +++ b/tools/tpcds-tools/bin/run-tpcds-queries.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################################## +# This script is used to run TPC-DS 99 queries +############################################################## + +set -eo pipefail + +ROOT=$(dirname "$0") +ROOT=$( + cd "${ROOT}" + pwd +) + +CURDIR="${ROOT}" +TPCDS_QUERIES_DIR="${CURDIR}/../queries" + +usage() { + echo " +This script is used to run TPC-DS 99 queries, +will use mysql client to connect Doris server which parameter is specified in doris-cluster.conf file. +Usage: $0 + " + exit 1 +} + +OPTS=$(getopt \ + -n "$0" \ + -o '' \ + -- "$@") + +eval set -- "${OPTS}" +HELP=0 + +if [[ $# == 0 ]]; then + usage +fi + +while true; do + case "$1" in + -h) + HELP=1 + shift + ;; + --) + shift + break + ;; + *) + echo "Internal error" + exit 1 + ;; + esac +done + +if [[ ${HELP} -eq 1 ]]; then + usage +fi + +check_prerequest() { + local CMD=$1 + local NAME=$2 + if ! ${CMD}; then + echo "${NAME} is missing. This script depends on mysql to create tables in Doris." + exit 1 + fi +} + +check_prerequest "mysql --version" "mysql" + +#shellcheck source=/dev/null +source "${CURDIR}/../conf/doris-cluster.conf" +export MYSQL_PWD=${PASSWORD:-} + +echo "FE_HOST: ${FE_HOST:='127.0.0.1'}" +echo "FE_QUERY_PORT: ${FE_QUERY_PORT:='9030'}" +echo "USER: ${USER:='root'}" +echo "DB: ${DB:='tpcds'}" +echo "Time Unit: ms" + +run_sql() { + echo "$*" + mysql -h"${FE_HOST}" -u"${USER}" -P"${FE_QUERY_PORT}" -D"${DB}" -e "$*" +} + +echo '============================================' +run_sql "show variables;" +echo '============================================' +run_sql "show table status;" +echo '============================================' + +sum=0 +IFS=';' +i=1 +query_strs=$(cat "${TPCDS_QUERIES_DIR}/tpcds_queries.sql") +for query_str in ${query_strs}; do + # echo '============================================' + # echo "${query_str} " + # echo '============================================' + total=0 + run=3 + # Each query is executed ${run} times and takes the average time + for ((j = 0; j < run; j++)); do + # if [[ $i -lt 70 ]]; then continue; fi ######### + start=$(date +%s%3N) + mysql -h"${FE_HOST}" -u"${USER}" -P"${FE_QUERY_PORT}" -D"${DB}" --comments -e"${query_str}" >/dev/null + end=$(date +%s%3N) + total=$((total + end - start)) + done + cost=$((total / run)) + echo "q${i}: ${cost} ms" + sum=$((sum + cost)) + i=$((i + 1)) +done <"${TPCDS_QUERIES_DIR}/tpcds_queries.sql" +echo "Total cost: ${sum} ms" + +echo 'Finish tpcds queries.' diff --git a/tools/tpcds-tools/conf/doris-cluster.conf b/tools/tpcds-tools/conf/doris-cluster.conf new file mode 100644 index 0000000000..fd737356c2 --- /dev/null +++ b/tools/tpcds-tools/conf/doris-cluster.conf @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Any of FE host +export FE_HOST='127.0.0.1' +# http_port in fe.conf +export FE_HTTP_PORT=8030 +# query_port in fe.conf +export FE_QUERY_PORT=9030 +# Doris username +export USER='root' +# Doris password +export PASSWORD='' +# The database where TPC-DS tables located +export DB='tpcds' diff --git a/tools/tpcds-tools/ddl/create-tpcds-tables.sql b/tools/tpcds-tools/ddl/create-tpcds-tables.sql new file mode 100644 index 0000000000..c29db69a2d --- /dev/null +++ b/tools/tpcds-tools/ddl/create-tpcds-tables.sql @@ -0,0 +1,630 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +CREATE TABLE IF NOT EXISTS customer_demographics ( + cd_demo_sk bigint not null, + cd_gender char(1), + cd_marital_status char(1), + cd_education_status char(20), + cd_purchase_estimate integer, + cd_credit_rating char(10), + cd_dep_count integer, + cd_dep_employed_count integer, + cd_dep_college_count integer +) +DUPLICATE KEY(cd_demo_sk) +DISTRIBUTED BY HASH(cd_gender) BUCKETS 12 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS reason ( + r_reason_sk bigint not null, + r_reason_id char(16) not null, + r_reason_desc char(100) + ) +DUPLICATE KEY(r_reason_sk) +DISTRIBUTED BY HASH(r_reason_sk) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS date_dim ( + d_date_sk bigint not null, + d_date_id char(16) not null, + d_date date, + d_month_seq integer, + d_week_seq integer, + d_quarter_seq integer, + d_year integer, + d_dow integer, + d_moy integer, + d_dom integer, + d_qoy integer, + d_fy_year integer, + d_fy_quarter_seq integer, + d_fy_week_seq integer, + d_day_name char(9), + d_quarter_name char(6), + d_holiday char(1), + d_weekend char(1), + d_following_holiday char(1), + d_first_dom integer, + d_last_dom integer, + d_same_day_ly integer, + d_same_day_lq integer, + d_current_day char(1), + d_current_week char(1), + d_current_month char(1), + d_current_quarter char(1), + d_current_year char(1) +) +DUPLICATE KEY(d_date_sk) +DISTRIBUTED BY HASH(d_date_sk) BUCKETS 12 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS warehouse ( + w_warehouse_sk bigint not null, + w_warehouse_id char(16) not null, + w_warehouse_name varchar(20), + w_warehouse_sq_ft integer, + w_street_number char(10), + w_street_name varchar(60), + w_street_type char(15), + w_suite_number char(10), + w_city varchar(60), + w_county varchar(30), + w_state char(2), + w_zip char(10), + w_country varchar(20), + w_gmt_offset decimal(5,2) +) +DUPLICATE KEY(w_warehouse_sk) +DISTRIBUTED BY HASH(w_warehouse_sk) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS catalog_sales ( + cs_item_sk bigint not null, + cs_order_number bigint not null, + cs_sold_date_sk bigint, + cs_sold_time_sk bigint, + cs_ship_date_sk bigint, + cs_bill_customer_sk bigint, + cs_bill_cdemo_sk bigint, + cs_bill_hdemo_sk bigint, + cs_bill_addr_sk bigint, + cs_ship_customer_sk bigint, + cs_ship_cdemo_sk bigint, + cs_ship_hdemo_sk bigint, + cs_ship_addr_sk bigint, + cs_call_center_sk bigint, + cs_catalog_page_sk bigint, + cs_ship_mode_sk bigint, + cs_warehouse_sk bigint, + cs_promo_sk bigint, + cs_quantity integer, + cs_wholesale_cost decimal(7,2), + cs_list_price decimal(7,2), + cs_sales_price decimal(7,2), + cs_ext_discount_amt decimal(7,2), + cs_ext_sales_price decimal(7,2), + cs_ext_wholesale_cost decimal(7,2), + cs_ext_list_price decimal(7,2), + cs_ext_tax decimal(7,2), + cs_coupon_amt decimal(7,2), + cs_ext_ship_cost decimal(7,2), + cs_net_paid decimal(7,2), + cs_net_paid_inc_tax decimal(7,2), + cs_net_paid_inc_ship decimal(7,2), + cs_net_paid_inc_ship_tax decimal(7,2), + cs_net_profit decimal(7,2) +) +DUPLICATE KEY(cs_item_sk, cs_order_number) +DISTRIBUTED BY HASH(cs_item_sk, cs_order_number) BUCKETS 32 +PROPERTIES ( + "replication_num" = "1", + "colocate_with" = "catalog" +); +CREATE TABLE IF NOT EXISTS call_center ( + cc_call_center_sk bigint not null, + cc_call_center_id char(16) not null, + cc_rec_start_date date, + cc_rec_end_date date, + cc_closed_date_sk integer, + cc_open_date_sk integer, + cc_name varchar(50), + cc_class varchar(50), + cc_employees integer, + cc_sq_ft integer, + cc_hours char(20), + cc_manager varchar(40), + cc_mkt_id integer, + cc_mkt_class char(50), + cc_mkt_desc varchar(100), + cc_market_manager varchar(40), + cc_division integer, + cc_division_name varchar(50), + cc_company integer, + cc_company_name char(50), + cc_street_number char(10), + cc_street_name varchar(60), + cc_street_type char(15), + cc_suite_number char(10), + cc_city varchar(60), + cc_county varchar(30), + cc_state char(2), + cc_zip char(10), + cc_country varchar(20), + cc_gmt_offset decimal(5,2), + cc_tax_percentage decimal(5,2) +) +DUPLICATE KEY(cc_call_center_sk) +DISTRIBUTED BY HASH(cc_call_center_sk) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); + +CREATE TABLE IF NOT EXISTS inventory ( + inv_date_sk bigint not null, + inv_item_sk bigint not null, + inv_warehouse_sk bigint, + inv_quantity_on_hand integer +) +DUPLICATE KEY(inv_date_sk, inv_item_sk, inv_warehouse_sk) +DISTRIBUTED BY HASH(inv_date_sk, inv_item_sk, inv_warehouse_sk) BUCKETS 32 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS catalog_returns ( + cr_item_sk bigint not null, + cr_order_number bigint not null, + cr_returned_date_sk bigint, + cr_returned_time_sk bigint, + cr_refunded_customer_sk bigint, + cr_refunded_cdemo_sk bigint, + cr_refunded_hdemo_sk bigint, + cr_refunded_addr_sk bigint, + cr_returning_customer_sk bigint, + cr_returning_cdemo_sk bigint, + cr_returning_hdemo_sk bigint, + cr_returning_addr_sk bigint, + cr_call_center_sk bigint, + cr_catalog_page_sk bigint, + cr_ship_mode_sk bigint, + cr_warehouse_sk bigint, + cr_reason_sk bigint, + cr_return_quantity integer, + cr_return_amount decimal(7,2), + cr_return_tax decimal(7,2), + cr_return_amt_inc_tax decimal(7,2), + cr_fee decimal(7,2), + cr_return_ship_cost decimal(7,2), + cr_refunded_cash decimal(7,2), + cr_reversed_charge decimal(7,2), + cr_store_credit decimal(7,2), + cr_net_loss decimal(7,2) +) +DUPLICATE KEY(cr_item_sk, cr_order_number) +DISTRIBUTED BY HASH(cr_item_sk, cr_order_number) BUCKETS 32 +PROPERTIES ( + "replication_num" = "1", + "colocate_with" = "catalog" +); + +CREATE TABLE IF NOT EXISTS household_demographics ( + hd_demo_sk bigint not null, + hd_income_band_sk bigint, + hd_buy_potential char(15), + hd_dep_count integer, + hd_vehicle_count integer +) +DUPLICATE KEY(hd_demo_sk) +DISTRIBUTED BY HASH(hd_demo_sk) BUCKETS 3 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS customer_address ( + ca_address_sk bigint not null, + ca_address_id char(16) not null, + ca_street_number char(10), + ca_street_name varchar(60), + ca_street_type char(15), + ca_suite_number char(10), + ca_city varchar(60), + ca_county varchar(30), + ca_state char(2), + ca_zip char(10), + ca_country varchar(20), + ca_gmt_offset decimal(5,2), + ca_location_type char(20) +) +DUPLICATE KEY(ca_address_sk) +DISTRIBUTED BY HASH(ca_address_sk) BUCKETS 12 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS income_band ( + ib_income_band_sk bigint not null, + ib_lower_bound integer, + ib_upper_bound integer +) +DUPLICATE KEY(ib_income_band_sk) +DISTRIBUTED BY HASH(ib_income_band_sk) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS catalog_page ( + cp_catalog_page_sk bigint not null, + cp_catalog_page_id char(16) not null, + cp_start_date_sk integer, + cp_end_date_sk integer, + cp_department varchar(50), + cp_catalog_number integer, + cp_catalog_page_number integer, + cp_description varchar(100), + cp_type varchar(100) +) +DUPLICATE KEY(cp_catalog_page_sk) +DISTRIBUTED BY HASH(cp_catalog_page_sk) BUCKETS 3 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS item ( + i_item_sk bigint not null, + i_item_id char(16) not null, + i_rec_start_date date, + i_rec_end_date date, + i_item_desc varchar(200), + i_current_price decimal(7,2), + i_wholesale_cost decimal(7,2), + i_brand_id integer, + i_brand char(50), + i_class_id integer, + i_class char(50), + i_category_id integer, + i_category char(50), + i_manufact_id integer, + i_manufact char(50), + i_size char(20), + i_formulation char(20), + i_color char(20), + i_units char(10), + i_container char(10), + i_manager_id integer, + i_product_name char(50) +) +DUPLICATE KEY(i_item_sk) +DISTRIBUTED BY HASH(i_item_sk) BUCKETS 12 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS web_returns ( + wr_item_sk bigint not null, + wr_order_number bigint not null, + wr_returned_date_sk bigint, + wr_returned_time_sk bigint, + wr_refunded_customer_sk bigint, + wr_refunded_cdemo_sk bigint, + wr_refunded_hdemo_sk bigint, + wr_refunded_addr_sk bigint, + wr_returning_customer_sk bigint, + wr_returning_cdemo_sk bigint, + wr_returning_hdemo_sk bigint, + wr_returning_addr_sk bigint, + wr_web_page_sk bigint, + wr_reason_sk bigint, + wr_return_quantity integer, + wr_return_amt decimal(7,2), + wr_return_tax decimal(7,2), + wr_return_amt_inc_tax decimal(7,2), + wr_fee decimal(7,2), + wr_return_ship_cost decimal(7,2), + wr_refunded_cash decimal(7,2), + wr_reversed_charge decimal(7,2), + wr_account_credit decimal(7,2), + wr_net_loss decimal(7,2) +) +DUPLICATE KEY(wr_item_sk, wr_order_number) +DISTRIBUTED BY HASH(wr_item_sk, wr_order_number) BUCKETS 32 +PROPERTIES ( + "replication_num" = "1", + "colocate_with" = "web" +); +CREATE TABLE IF NOT EXISTS web_site ( + web_site_sk bigint not null, + web_site_id char(16) not null, + web_rec_start_date date, + web_rec_end_date date, + web_name varchar(50), + web_open_date_sk bigint, + web_close_date_sk bigint, + web_class varchar(50), + web_manager varchar(40), + web_mkt_id integer, + web_mkt_class varchar(50), + web_mkt_desc varchar(100), + web_market_manager varchar(40), + web_company_id integer, + web_company_name char(50), + web_street_number char(10), + web_street_name varchar(60), + web_street_type char(15), + web_suite_number char(10), + web_city varchar(60), + web_county varchar(30), + web_state char(2), + web_zip char(10), + web_country varchar(20), + web_gmt_offset decimal(5,2), + web_tax_percentage decimal(5,2) +) +DUPLICATE KEY(web_site_sk) +DISTRIBUTED BY HASH(web_site_sk) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS promotion ( + p_promo_sk bigint not null, + p_promo_id char(16) not null, + p_start_date_sk bigint, + p_end_date_sk bigint, + p_item_sk bigint, + p_cost decimal(15,2), + p_response_targe integer, + p_promo_name char(50), + p_channel_dmail char(1), + p_channel_email char(1), + p_channel_catalog char(1), + p_channel_tv char(1), + p_channel_radio char(1), + p_channel_press char(1), + p_channel_event char(1), + p_channel_demo char(1), + p_channel_details varchar(100), + p_purpose char(15), + p_discount_active char(1) +) +DUPLICATE KEY(p_promo_sk) +DISTRIBUTED BY HASH(p_promo_sk) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS web_sales ( + ws_item_sk bigint not null, + ws_order_number bigint not null, + ws_sold_date_sk bigint, + ws_sold_time_sk bigint, + ws_ship_date_sk bigint, + ws_bill_customer_sk bigint, + ws_bill_cdemo_sk bigint, + ws_bill_hdemo_sk bigint, + ws_bill_addr_sk bigint, + ws_ship_customer_sk bigint, + ws_ship_cdemo_sk bigint, + ws_ship_hdemo_sk bigint, + ws_ship_addr_sk bigint, + ws_web_page_sk bigint, + ws_web_site_sk bigint, + ws_ship_mode_sk bigint, + ws_warehouse_sk bigint, + ws_promo_sk bigint, + ws_quantity integer, + ws_wholesale_cost decimal(7,2), + ws_list_price decimal(7,2), + ws_sales_price decimal(7,2), + ws_ext_discount_amt decimal(7,2), + ws_ext_sales_price decimal(7,2), + ws_ext_wholesale_cost decimal(7,2), + ws_ext_list_price decimal(7,2), + ws_ext_tax decimal(7,2), + ws_coupon_amt decimal(7,2), + ws_ext_ship_cost decimal(7,2), + ws_net_paid decimal(7,2), + ws_net_paid_inc_tax decimal(7,2), + ws_net_paid_inc_ship decimal(7,2), + ws_net_paid_inc_ship_tax decimal(7,2), + ws_net_profit decimal(7,2) +) +DUPLICATE KEY(ws_item_sk, ws_order_number) +DISTRIBUTED BY HASH(ws_item_sk, ws_order_number) BUCKETS 32 +PROPERTIES ( + "replication_num" = "1", + "colocate_with" = "web" +); +CREATE TABLE IF NOT EXISTS store ( + s_store_sk bigint not null, + s_store_id char(16) not null, + s_rec_start_date date, + s_rec_end_date date, + s_closed_date_sk bigint, + s_store_name varchar(50), + s_number_employees integer, + s_floor_space integer, + s_hours char(20), + s_manager varchar(40), + s_market_id integer, + s_geography_class varchar(100), + s_market_desc varchar(100), + s_market_manager varchar(40), + s_division_id integer, + s_division_name varchar(50), + s_company_id integer, + s_company_name varchar(50), + s_street_number varchar(10), + s_street_name varchar(60), + s_street_type char(15), + s_suite_number char(10), + s_city varchar(60), + s_county varchar(30), + s_state char(2), + s_zip char(10), + s_country varchar(20), + s_gmt_offset decimal(5,2), + s_tax_precentage decimal(5,2) +) +DUPLICATE KEY(s_store_sk) +DISTRIBUTED BY HASH(s_store_sk) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS time_dim ( + t_time_sk bigint not null, + t_time_id char(16) not null, + t_time integer, + t_hour integer, + t_minute integer, + t_second integer, + t_am_pm char(2), + t_shift char(20), + t_sub_shift char(20), + t_meal_time char(20) +) +DUPLICATE KEY(t_time_sk) +DISTRIBUTED BY HASH(t_time_sk) BUCKETS 12 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS web_page ( + wp_web_page_sk bigint not null, + wp_web_page_id char(16) not null, + wp_rec_start_date date, + wp_rec_end_date date, + wp_creation_date_sk bigint, + wp_access_date_sk bigint, + wp_autogen_flag char(1), + wp_customer_sk bigint, + wp_url varchar(100), + wp_type char(50), + wp_char_count integer, + wp_link_count integer, + wp_image_count integer, + wp_max_ad_count integer +) +DUPLICATE KEY(wp_web_page_sk) +DISTRIBUTED BY HASH(wp_web_page_sk) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS store_returns ( + sr_item_sk bigint not null, + sr_ticket_number bigint not null, + sr_returned_date_sk bigint, + sr_return_time_sk bigint, + sr_customer_sk bigint, + sr_cdemo_sk bigint, + sr_hdemo_sk bigint, + sr_addr_sk bigint, + sr_store_sk bigint, + sr_reason_sk bigint, + sr_return_quantity integer, + sr_return_amt decimal(7,2), + sr_return_tax decimal(7,2), + sr_return_amt_inc_tax decimal(7,2), + sr_fee decimal(7,2), + sr_return_ship_cost decimal(7,2), + sr_refunded_cash decimal(7,2), + sr_reversed_charge decimal(7,2), + sr_store_credit decimal(7,2), + sr_net_loss decimal(7,2) +) +duplicate key(sr_item_sk, sr_ticket_number) +distributed by hash (sr_item_sk, sr_ticket_number) buckets 32 +properties ( + "replication_num" = "1", + "colocate_with" = "store" +); +CREATE TABLE IF NOT EXISTS store_sales ( + ss_item_sk bigint not null, + ss_ticket_number bigint not null, + ss_sold_date_sk bigint, + ss_sold_time_sk bigint, + ss_customer_sk bigint, + ss_cdemo_sk bigint, + ss_hdemo_sk bigint, + ss_addr_sk bigint, + ss_store_sk bigint, + ss_promo_sk bigint, + ss_quantity integer, + ss_wholesale_cost decimal(7,2), + ss_list_price decimal(7,2), + ss_sales_price decimal(7,2), + ss_ext_discount_amt decimal(7,2), + ss_ext_sales_price decimal(7,2), + ss_ext_wholesale_cost decimal(7,2), + ss_ext_list_price decimal(7,2), + ss_ext_tax decimal(7,2), + ss_coupon_amt decimal(7,2), + ss_net_paid decimal(7,2), + ss_net_paid_inc_tax decimal(7,2), + ss_net_profit decimal(7,2) +) +DUPLICATE KEY(ss_item_sk, ss_ticket_number) +DISTRIBUTED BY HASH(ss_item_sk, ss_ticket_number) BUCKETS 32 +PROPERTIES ( + "replication_num" = "1", + "colocate_with" = "store" +); +CREATE TABLE IF NOT EXISTS ship_mode ( + sm_ship_mode_sk bigint not null, + sm_ship_mode_id char(16) not null, + sm_type char(30), + sm_code char(10), + sm_carrier char(20), + sm_contract char(20) +) +DUPLICATE KEY(sm_ship_mode_sk) +DISTRIBUTED BY HASH(sm_ship_mode_sk) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS customer ( + c_customer_sk bigint not null, + c_customer_id char(16) not null, + c_current_cdemo_sk bigint, + c_current_hdemo_sk bigint, + c_current_addr_sk bigint, + c_first_shipto_date_sk bigint, + c_first_sales_date_sk bigint, + c_salutation char(10), + c_first_name char(20), + c_last_name char(30), + c_preferred_cust_flag char(1), + c_birth_day integer, + c_birth_month integer, + c_birth_year integer, + c_birth_country varchar(20), + c_login char(13), + c_email_address char(50), + c_last_review_date_sk bigint +) +DUPLICATE KEY(c_customer_sk) +DISTRIBUTED BY HASH(c_customer_id) BUCKETS 12 +PROPERTIES ( + "replication_num" = "1" +); +CREATE TABLE IF NOT EXISTS dbgen_version +( + dv_version varchar(16) , + dv_create_date date , + dv_create_time datetime , + dv_cmdline_args varchar(200) +) +DUPLICATE KEY(dv_version) +DISTRIBUTED BY HASH(dv_version) BUCKETS 1 +PROPERTIES ( + "replication_num" = "1" +);