Files
doris/fe/be-java-extensions/hudi-scanner/pom.xml
Ashin Gau 4158253799 [feature](hudi) support hudi time travel in external table (#21739)
Support hudi time travel in external table:
```
select * from hudi_table for time as of '20230712221248';
```
PR(https://github.com/apache/doris/pull/15418) supports to take timestamp or version as the snapshot ID in iceberg, but hudi only has timestamp as the snapshot ID. Therefore, when querying hudi table with `for version as of`, error will be thrown like:
```
ERROR 1105 (HY000): errCode = 2, detailMessage = Hudi table only supports timestamp as snapshot ID
```
The supported formats of timestamp in hudi are: 'yyyy-MM-dd HH:mm:ss[.SSS]' or 'yyyy-MM-dd' or 'yyyyMMddHHmmss[SSS]', which is consistent with the [time-travel-query.](https://hudi.apache.org/docs/quick-start-guide#time-travel-query)

## Partitioning Strategies
Before this PR, hudi's partitions need to be synchronized to hive through [hive-sync-tool](https://hudi.apache.org/docs/syncing_metastore/#hive-sync-tool), or by setting very complex synchronization parameters in [spark conf](https://hudi.apache.org/docs/syncing_metastore/#sync-template). These processes are exceptionally complex and unnecessary, unless you want to query hudi data through hive.

In addition, partitions are changed in time travel. We cannot guarantee the correctness of time travel through partition synchronization.

So this PR directly obtain partitions by reading hudi meta information. Caching and updating table partition information through hudi instant timestamp, and reusing Doris' partition pruning.
2023-07-13 22:30:07 +08:00

269 lines
10 KiB
XML

<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>be-java-extensions</artifactId>
<groupId>org.apache.doris</groupId>
<version>${revision}</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hudi-scanner</artifactId>
<properties>
<doris.home>${basedir}/../../</doris.home>
<fe_ut_parallel>1</fe_ut_parallel>
<scala.version>2.12.15</scala.version>
<scala.binary.version>2.12</scala.binary.version>
<spark.version>3.2.0</spark.version>
<sparkbundle.version>3.2</sparkbundle.version>
<hudi.version>0.13.0</hudi.version>
<janino.version>3.0.16</janino.version>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
<version>${hudi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark-client</artifactId>
<version>${hudi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark3-common</artifactId>
<version>${hudi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark3.2.x_${scala.binary.version}</artifactId>
<version>${hudi.version}</version>
<exclusions>
<exclusion>
<artifactId>json4s-ast_2.11</artifactId>
<groupId>org.json4s</groupId>
</exclusion>
<exclusion>
<artifactId>json4s-core_2.11</artifactId>
<groupId>org.json4s</groupId>
</exclusion>
<exclusion>
<artifactId>json4s-jackson_2.11</artifactId>
<groupId>org.json4s</groupId>
</exclusion>
<exclusion>
<artifactId>json4s-scalap_2.11</artifactId>
<groupId>org.json4s</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<version>1.10.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<artifactId>jackson-module-scala_2.12</artifactId>
<groupId>com.fasterxml.jackson.module</groupId>
</exclusion>
<exclusion>
<artifactId>hadoop-client-api</artifactId>
<groupId>org.apache.hadoop</groupId>
</exclusion>
<exclusion>
<artifactId>hadoop-client-runtime</artifactId>
<groupId>org.apache.hadoop</groupId>
</exclusion>
</exclusions>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
<exclusions>
<exclusion>
<groupId>org.codehaus.janino</groupId>
<artifactId>janino</artifactId>
</exclusion>
<exclusion>
<groupId>org.codehaus.janino</groupId>
<artifactId>commons-compiler</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-launcher_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<!-- version of spark's janino is error -->
<groupId>org.codehaus.janino</groupId>
<artifactId>janino</artifactId>
<version>${janino.version}</version>
<exclusions>
<exclusion>
<groupId>org.codehaus.janino</groupId>
<artifactId>commons-compiler</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.codehaus.janino</groupId>
<artifactId>commons-compiler</artifactId>
<version>${janino.version}</version>
</dependency>
<dependency>
<!-- version of spark's jackson module is error -->
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
<version>${jackson.version}</version>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.doris</groupId>
<artifactId>java-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
</dependencies>
<build>
<finalName>hudi-scanner</finalName>
<sourceDirectory>src/main/java</sourceDirectory>
<testSourceDirectory>src/test/java</testSourceDirectory>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
<testResources>
<testResource>
<directory>src/test/resources</directory>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.7.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-unchecked</arg>
<arg>-deprecation</arg>
<arg>-feature</arg>
</args>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<executions>
<execution>
<id>default-compile</id>
<phase>none</phase>
</execution>
<execution>
<id>default-testCompile</id>
<phase>none</phase>
</execution>
<execution>
<id>java-compile</id>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<phase>compile</phase>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/resources/package.xml</descriptor>
</descriptors>
<archive>
<manifest>
<mainClass></mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>