From 73de61ed8439e856c576500c9023d8a70ddd57a5 Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Tue, 19 Mar 2024 14:26:10 +0800 Subject: [PATCH] [opt](hive) skip hidden file and dir (#32412) When querying a Hive table, we should skip all hidden directories and files, e.g.: ``` /visible/.hidden/path /visible/.hidden.txt ``` --- .../.test_hidden_dir/wrong_file | 1 + .../test_hidden_file/.hidden_file | 1 + .../datasource/hive/HiveMetaStoreCache.java | 24 ++++++++-- .../doris/datasource/PathVisibleTest.java | 47 +++++++++++++++++++ 4 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_all_types/.test_hidden_dir/wrong_file create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_all_types/test_hidden_file/.hidden_file create mode 100644 fe/fe-core/src/test/java/org/apache/doris/datasource/PathVisibleTest.java diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_all_types/.test_hidden_dir/wrong_file b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_all_types/.test_hidden_dir/wrong_file new file mode 100644 index 0000000000..d37af4bbc5 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_all_types/.test_hidden_dir/wrong_file @@ -0,0 +1 @@ +wrong file diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_all_types/test_hidden_file/.hidden_file b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_all_types/test_hidden_file/.hidden_file new file mode 100644 index 0000000000..136c05e0d0 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_all_types/test_hidden_file/.hidden_file @@ -0,0 +1 @@ +hidden diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java index f2a7019709..23c9de3be1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java @@ -50,6 +50,7 @@ import org.apache.doris.planner.ColumnBound; import org.apache.doris.planner.ListPartitionPrunerV2; import org.apache.doris.planner.PartitionPrunerV2Base.UniqueId; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import com.google.common.cache.CacheBuilder; @@ -63,7 +64,6 @@ import com.google.common.collect.RangeMap; import com.google.common.collect.Streams; import com.google.common.collect.TreeRangeMap; import lombok.Data; -import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; @@ -1035,14 +1035,16 @@ public class HiveMetaStoreCache { this.acidInfo = acidInfo; } - private boolean isFileVisible(Path path) { - if (path == null || StringUtils.isEmpty(path.toString())) { + @VisibleForTesting + public static boolean isFileVisible(Path path) { + if (path == null) { return false; } - if (path.getName().startsWith(".") || path.getName().startsWith("_")) { + String pathStr = path.toUri().toString(); + if (containsHiddenPath(pathStr) || path.getName().startsWith("_")) { return false; } - for (String name : path.toString().split("/")) { + for (String name : pathStr.split("/")) { if (isGeneratedPath(name)) { return false; } @@ -1050,6 +1052,18 @@ public class HiveMetaStoreCache { return true; } + private static boolean containsHiddenPath(String path) { + if (path.startsWith(".")) { + return true; + } + for (int i = 0; i < path.length() - 1; i++) { + if (path.charAt(i) 
== '/' && path.charAt(i + 1) == '.') { + return true; + } + } + return false; + } + private static boolean isGeneratedPath(String name) { return "_temporary".equals(name) // generated by spark || "_imapala_insert_staging".equals(name) // generated by impala diff --git a/fe/fe-core/src/test/java/org/apache/doris/datasource/PathVisibleTest.java b/fe/fe-core/src/test/java/org/apache/doris/datasource/PathVisibleTest.java new file mode 100644 index 0000000000..0937bbc3cc --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/datasource/PathVisibleTest.java @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.datasource; + +import org.apache.doris.datasource.hive.HiveMetaStoreCache.FileCacheValue; + +import org.apache.hadoop.fs.Path; +import org.junit.Assert; +import org.junit.Test; + +public class PathVisibleTest { + @Test + public void shouldReturnFalseWhenPathIsNull() { + Assert.assertFalse(FileCacheValue.isFileVisible(null)); + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("s3://visible/.hidden/path"))); + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("/visible/.hidden/path"))); + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("hdfs://visible/path/.file"))); + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("/visible/path/_temporary_xx"))); + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("/visible/path/_imapala_insert_staging"))); + + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("/visible//.hidden/path"))); + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("s3://visible/.hidden/path"))); + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("///visible/path/.file"))); + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("/visible/path///_temporary_xx"))); + Assert.assertFalse(FileCacheValue.isFileVisible(new Path("hdfs://visible//path/_imapala_insert_staging"))); + + Assert.assertTrue(FileCacheValue.isFileVisible(new Path("s3://visible/path"))); + Assert.assertTrue(FileCacheValue.isFileVisible(new Path("path"))); + Assert.assertTrue(FileCacheValue.isFileVisible(new Path("hdfs://visible/path./1.txt"))); + Assert.assertTrue(FileCacheValue.isFileVisible(new Path("/1.txt"))); + } +}