branch-2.1: [opt](iceberg docker) Use PostgreSQL as the backend for the Iceberg REST server. #46289 (#46576)

Cherry-picked from #46289

Co-authored-by: wuwenchi <wuwenchi@selectdb.com>
Author: github-actions[bot]
Date: 2025-01-09 22:30:03 +08:00
Committed by: GitHub
Parent: 26bdd23e05
Commit: 72cdedc47f
2 changed files with 36 additions and 16 deletions

View File

@@ -23,24 +23,25 @@ start-worker.sh spark://doris--spark-iceberg:7077
start-history-server.sh
start-thriftserver.sh --driver-java-options "-Dderby.system.home=/tmp/derby"
# The creation of a Spark SQL client is time-consuming,
# and starting a new client for each SQL file leads to significant overhead.
# To reduce the time spent on creating clients,
# we group these files together and execute them using a single client.
# This approach can reduce the time from 150s to 40s.
START_TIME1=$(date +%s)
find /mnt/scripts/create_preinstalled_scripts/iceberg -name '*.sql' | sed 's|^|source |' | sed 's|$|;|'> iceberg_total.sql
spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions -f iceberg_total.sql
END_TIME1=$(date +%s)
EXECUTION_TIME1=$((END_TIME1 - START_TIME1))
echo "Script iceberg total: {} executed in $EXECUTION_TIME1 seconds"
ls /mnt/scripts/create_preinstalled_scripts/iceberg/*.sql | xargs -n 1 -I {} bash -c '
START_TIME=$(date +%s)
spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions -f {}
END_TIME=$(date +%s)
EXECUTION_TIME=$((END_TIME - START_TIME))
echo "Script: {} executed in $EXECUTION_TIME seconds"
'
ls /mnt/scripts/create_preinstalled_scripts/paimon/*.sql | xargs -n 1 -I {} bash -c '
START_TIME=$(date +%s)
spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions -f {}
END_TIME=$(date +%s)
EXECUTION_TIME=$((END_TIME - START_TIME))
echo "Script: {} executed in $EXECUTION_TIME seconds"
'
START_TIME2=$(date +%s)
find /mnt/scripts/create_preinstalled_scripts/paimon -name '*.sql' | sed 's|^|source |' | sed 's|$|;|'> paimon_total.sql
spark-sql --master spark://doris--spark-iceberg:7077 --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions -f paimon_total.sql
END_TIME2=$(date +%s)
EXECUTION_TIME2=$((END_TIME2 - START_TIME2))
echo "Script paimon total: {} executed in $EXECUTION_TIME2 seconds"
touch /mnt/SUCCESS;
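
A note on the grouping trick above: every separate spark-sql invocation pays the full client startup cost, so the script rewrites each *.sql path into a Spark SQL `source` statement and feeds the concatenated list to a single client. A minimal sketch of what the pipeline produces, with purely illustrative file names:

    # Suppose the directory holds two scripts (names are placeholders, not the real preinstalled scripts):
    #   /mnt/scripts/create_preinstalled_scripts/iceberg/create_db.sql
    #   /mnt/scripts/create_preinstalled_scripts/iceberg/insert_data.sql
    # find lists the paths; the two sed calls wrap each one as `source <path>;`,
    # so iceberg_total.sql ends up containing:
    #   source /mnt/scripts/create_preinstalled_scripts/iceberg/create_db.sql;
    #   source /mnt/scripts/create_preinstalled_scripts/iceberg/insert_data.sql;
    # One spark-sql session then runs every script in order:
    spark-sql --master spark://doris--spark-iceberg:7077 \
        --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
        -f iceberg_total.sql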

View File

@@ -18,6 +18,7 @@
version: "3"
services:
  spark-iceberg:
    image: tabulario/spark-iceberg
    container_name: doris--spark-iceberg
@@ -47,6 +48,19 @@ services:
      interval: 5s
      timeout: 120s
      retries: 120
  postgres:
    image: postgis/postgis:14-3.3
    container_name: doris--postgres
    environment:
      POSTGRES_PASSWORD: 123456
      POSTGRES_USER: root
      POSTGRES_DB: iceberg
    volumes:
      - ./data/input/pgdata:/var/lib/postgresql/data
    networks:
      - doris--iceberg
  rest:
    image: tabulario/iceberg-rest
    container_name: doris--iceberg-rest
@@ -54,6 +68,8 @@ services:
      - ${REST_CATALOG_PORT}:8181
    volumes:
      - ./data:/mnt/data
    depends_on:
      - postgres
    environment:
      - AWS_ACCESS_KEY_ID=admin
      - AWS_SECRET_ACCESS_KEY=password
@@ -61,6 +77,9 @@ services:
      - CATALOG_WAREHOUSE=s3a://warehouse/wh/
      - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
      - CATALOG_S3_ENDPOINT=http://minio:9000
      - CATALOG_URI=jdbc:postgresql://postgres:5432/iceberg
      - CATALOG_JDBC_USER=root
      - CATALOG_JDBC_PASSWORD=123456
    networks:
      - doris--iceberg
    entrypoint: /bin/bash /mnt/data/input/script/rest_init.sh
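
With this change the tabulario/iceberg-rest image keeps its catalog metadata in PostgreSQL rather than in its default embedded store: CATALOG_URI, CATALOG_JDBC_USER and CATALOG_JDBC_PASSWORD are handed to the JDBC-backed catalog behind the REST server, while depends_on only orders container startup (it does not wait for PostgreSQL to become ready). A quick way to confirm that metadata is really landing in PostgreSQL, assuming the registry table used by Iceberg's JDBC catalog (iceberg_tables) is unchanged in the version being run:

    # Connect to the backing database declared above (user root, db iceberg)
    # over the container's local socket and list the registered tables.
    # iceberg_tables is assumed to be the JDBC catalog's registry table.
    docker exec -it doris--postgres \
        psql -U root -d iceberg \
        -c "SELECT table_namespace, table_name, metadata_location FROM iceberg_tables LIMIT 10;"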