diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
index a4b27bdd6c..45d9bbf359 100644
--- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
+++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
@@ -23,24 +23,25 @@
 start-worker.sh spark://doris--spark-iceberg:7077
 start-history-server.sh
 start-thriftserver.sh --driver-java-options "-Dderby.system.home=/tmp/derby"
 
+# The creation of a Spark SQL client is time-consuming,
+# and reopening a new client for each SQL file execution leads to significant overhead.
+# To reduce the time spent on creating clients,
+# we group these files together and execute them using a single client.
+# This approach can reduce the time from 150s to 40s.
+START_TIME1=$(date +%s)
+find /mnt/scripts/create_preinstalled_scripts/iceberg -name '*.sql' | sort | sed 's|^|source |' | sed 's|$|;|' > iceberg_total.sql
+spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions -f iceberg_total.sql
+END_TIME1=$(date +%s)
+EXECUTION_TIME1=$((END_TIME1 - START_TIME1))
+echo "Script iceberg total executed in $EXECUTION_TIME1 seconds"
-ls /mnt/scripts/create_preinstalled_scripts/iceberg/*.sql | xargs -n 1 -I {} bash -c '
-    START_TIME=$(date +%s)
-    spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions -f {}
-    END_TIME=$(date +%s)
-    EXECUTION_TIME=$((END_TIME - START_TIME))
-    echo "Script: {} executed in $EXECUTION_TIME seconds"
-'
-
-ls /mnt/scripts/create_preinstalled_scripts/paimon/*.sql | xargs -n 1 -I {} bash -c '
-    START_TIME=$(date +%s)
-    spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions -f {}
-    END_TIME=$(date +%s)
-    EXECUTION_TIME=$((END_TIME - START_TIME))
-    echo "Script: {} executed in $EXECUTION_TIME seconds"
-'
-
+START_TIME2=$(date +%s)
+find /mnt/scripts/create_preinstalled_scripts/paimon -name '*.sql' | sort | sed 's|^|source |' | sed 's|$|;|' > paimon_total.sql
+spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions -f paimon_total.sql
+END_TIME2=$(date +%s)
+EXECUTION_TIME2=$((END_TIME2 - START_TIME2))
+echo "Script paimon total executed in $EXECUTION_TIME2 seconds"
 
 touch /mnt/SUCCESS;
 
 
diff --git a/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl b/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
index 38491f645a..fa4f8d1cca 100644
--- a/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
+++ b/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
@@ -18,6 +18,7 @@
 version: "3"
 
 services:
+
   spark-iceberg:
     image: tabulario/spark-iceberg
     container_name: doris--spark-iceberg
@@ -47,6 +48,19 @@ services:
       interval: 5s
       timeout: 120s
      retries: 120
+
+  postgres:
+    image: postgis/postgis:14-3.3
+    container_name: doris--postgres
+    environment:
+      POSTGRES_PASSWORD: 123456
+      POSTGRES_USER: root
+      POSTGRES_DB: iceberg
+    volumes:
+      - ./data/input/pgdata:/var/lib/postgresql/data
+    networks:
+      - doris--iceberg
+
   rest:
     image: tabulario/iceberg-rest
     container_name: doris--iceberg-rest
@@ -54,6 +68,8 @@ services:
       - ${REST_CATALOG_PORT}:8181
     volumes:
       - ./data:/mnt/data
+    depends_on:
+      - postgres
     environment:
       - AWS_ACCESS_KEY_ID=admin
       - AWS_SECRET_ACCESS_KEY=password
@@ -61,6 +77,9 @@
       - CATALOG_WAREHOUSE=s3a://warehouse/wh/
       - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
       - CATALOG_S3_ENDPOINT=http://minio:9000
+      - CATALOG_URI=jdbc:postgresql://postgres:5432/iceberg
+      - CATALOG_JDBC_USER=root
+      - CATALOG_JDBC_PASSWORD=123456
     networks:
       - doris--iceberg
     entrypoint: /bin/bash /mnt/data/input/script/rest_init.sh