[fix](Outfile) fix bug that the fileSize is not correct when outfile is completed (#22951)
@@ -467,7 +467,11 @@ Status VFileResultWriter::_create_new_file_if_exceed_size() {
 Status VFileResultWriter::_close_file_writer(bool done) {
     if (_vfile_writer) {
         _vfile_writer->close();
-        COUNTER_UPDATE(_written_data_bytes, _current_written_bytes);
+        // We cannot use _current_written_bytes to update _written_data_bytes here,
+        // because `_vfile_writer->close()` internally calls the orc/parquet writer's
+        // `write()` again, so the real written length keeps growing during close()
+        // and _current_written_bytes ends up smaller than _vfile_writer->written_len().
+        COUNTER_UPDATE(_written_data_bytes, _vfile_writer->written_len());
         _vfile_writer.reset(nullptr);
     } else if (_file_writer_impl) {
         _file_writer_impl->close();
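The reasoning in the new comment can be illustrated with a small, standalone sketch: a buffered writer still flushes data inside `close()`, so a byte counter snapshotted before `close()` under-reports the final file size. The `BufferedFileWriter` class below is hypothetical and only mirrors the behavior described above; it is not the actual `VFileWriter` interface.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical buffered writer: close() flushes pending bytes, so the
// authoritative size is only known after close() has run, similar to the
// orc/parquet writers used by VFileResultWriter.
class BufferedFileWriter {
public:
    void append(size_t n) { _buffer.insert(_buffer.end(), n, uint8_t(0)); }

    void close() {
        _written_len += _buffer.size(); // flushing grows the real written length
        _buffer.clear();
    }

    int64_t written_len() const { return _written_len; }

private:
    std::vector<uint8_t> _buffer;
    int64_t _written_len = 0;
};

int main() {
    BufferedFileWriter writer;
    writer.append(1024);

    int64_t before_close = writer.written_len(); // 0: the data is still buffered
    writer.close();
    int64_t after_close = writer.written_len();  // 1024: the buffer was flushed

    // A "written bytes" counter must therefore be updated with the post-close value.
    std::cout << before_close << " vs " << after_close << std::endl;
    return 0;
}
```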
@@ -77,7 +77,7 @@ illustrate:
    File related properties
        column_separator: column separator, only for the CSV format. <version since="1.2.0">Supports multi-byte separators, such as: "\\x01", "abc"</version>
        line_delimiter: line delimiter, only for the CSV format. <version since="1.2.0">Supports multi-byte separators, such as: "\\x01", "abc"</version>
-       max_file_size: the size limit of a single file; if the result exceeds this value, it will be split into multiple files.
+       max_file_size: the size limit of a single file; if the result exceeds this value, it will be split into multiple files. The value range of max_file_size is [5MB, 2GB], and the default is 1GB. (When the file format is ORC, the actual split size is a multiple of 64MB; for example, with max_file_size = 5MB the files are actually split at 64MB, and with max_file_size = 65MB they are split at 128MB. See the rounding sketch after this excerpt.)
        delete_existing_files: default `false`. If set to true, all files under the directory specified by file_path are deleted first, and then the data is exported to that directory. For example, with "file_path" = "/user/tmp", all files and directories under "/user/" are deleted; with "file_path" = "/user/tmp/", all files and directories under "/user/tmp/" are deleted.

    Broker related properties need to be prefixed with `broker.`:
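For the ORC case mentioned in the max_file_size property above, the documented behavior (5MB treated as 64MB, 65MB treated as 128MB) amounts to rounding the requested size up to the next multiple of 64MB. The helper below is illustrative only; its name and constant are assumptions drawn from those examples, not the actual Doris implementation.

```cpp
#include <cassert>
#include <cstdint>

// Illustrative only: the documented ORC behavior rounds max_file_size up to
// the next multiple of 64MB.
constexpr int64_t kOrcSplitUnit = 64LL * 1024 * 1024; // 64MB

int64_t orc_effective_split_size(int64_t max_file_size) {
    return ((max_file_size + kOrcSplitUnit - 1) / kOrcSplitUnit) * kOrcSplitUnit;
}

int main() {
    const int64_t mb = 1024 * 1024;
    assert(orc_effective_split_size(5 * mb) == 64 * mb);   // 5MB  -> 64MB
    assert(orc_effective_split_size(65 * mb) == 128 * mb); // 65MB -> 128MB
    return 0;
}
```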
@@ -82,7 +82,7 @@ INTO OUTFILE "file_path"
    File related properties
        column_separator: column separator, only supported for the CSV format. <version since="1.2.0">Supports multi-byte separators, such as: "\\x01", "abc"</version>
        line_delimiter: line delimiter, only supported for the CSV format. <version since="1.2.0">Supports multi-byte separators, such as: "\\x01", "abc"</version>
-       max_file_size: the size limit of a single file; if the result exceeds this value, it will be split into multiple files.
+       max_file_size: the size limit of a single file; if the result exceeds this value, it will be split into multiple files. The value range of max_file_size is [5MB, 2GB], and the default is 1GB. (When exporting in the ORC file format, the actual split size is a multiple of 64MB; for example, with max_file_size = 5MB the files are actually split at 64MB, and with max_file_size = 65MB at 128MB.)
        delete_existing_files: default false. If set to true, all files under the directory specified by file_path are deleted first, and then the data is exported to that directory. For example, with "file_path" = "/user/tmp", all files and directories under "/user/" are deleted; with "file_path" = "/user/tmp/", all files and directories under "/user/tmp/" are deleted.

    Broker related properties need to be prefixed with `broker.`:
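The [5MB, 2GB] bound on max_file_size is what the new regression test below exercises through its expected error message. A minimal sketch of such a bound check follows; the function and constant names are assumptions, and the error text is taken from the test rather than from the actual Doris sources.

```cpp
#include <cstdint>
#include <stdexcept>

// Bounds taken from the documentation above: max_file_size must be in [5MB, 2GB].
constexpr int64_t kMinMaxFileSize = 5LL * 1024 * 1024;
constexpr int64_t kMaxMaxFileSize = 2LL * 1024 * 1024 * 1024;

// Hypothetical validation; the real check lives elsewhere in Doris.
void validate_max_file_size(int64_t max_file_size) {
    if (max_file_size < kMinMaxFileSize || max_file_size > kMaxMaxFileSize) {
        throw std::invalid_argument("max file size should between 5MB and 2GB");
    }
}
```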
@@ -0,0 +1,115 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

suite("test_outfile_orc_max_file_size", "p2") {
    String nameNodeHost = context.config.otherConfigs.get("extHiveHmsHost")
    String hdfsPort = context.config.otherConfigs.get("extHdfsPort")
    String fs = "hdfs://${nameNodeHost}:${hdfsPort}"
    String user_name = context.config.otherConfigs.get("extHiveHmsUser")

    // the path used to load data
    def load_data_path = "/user/export_test/test_orc_max_file_size.orc"
    // the path used to export data
    def outFilePath = """/user/export_test/test_max_file_size/test_orc/exp_"""

    def create_table = {table_name ->
        sql """ DROP TABLE IF EXISTS ${table_name} """
        sql """
        CREATE TABLE IF NOT EXISTS ${table_name} (
            `user_id` LARGEINT NOT NULL COMMENT "user id",
            `date` DATE NOT NULL COMMENT "data import date and time",
            `datetime` DATETIME NOT NULL COMMENT "data import date and time",
            `city` VARCHAR(20) COMMENT "user's city",
            `age` INT COMMENT "user age",
            `sex` INT COMMENT "user gender",
            `bool_col` boolean COMMENT "",
            `int_col` int COMMENT "",
            `bigint_col` bigint COMMENT "",
            `largeint_col` largeint COMMENT "",
            `float_col` float COMMENT "",
            `double_col` double COMMENT "",
            `char_col` CHAR(10) COMMENT "",
            `decimal_col` decimal COMMENT ""
        )
        DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1");
        """
    }

    def table_export_name = "test_export_max_file_size"

    create_table(table_export_name)

    // load data
    sql """
        insert into ${table_export_name}
        select * from hdfs(
            "uri" = "hdfs://${nameNodeHost}:${hdfsPort}${load_data_path}",
            "fs.defaultFS" = "${fs}",
            "hadoop.username" = "${user_name}",
            "format" = "orc");
    """

    def test_outfile_orc_success = {maxFileSize, isDelete, fileNumber, totalRows ->
        def table = sql """
            select * from ${table_export_name}
            into outfile "${fs}${outFilePath}"
            FORMAT AS ORC
            PROPERTIES(
                "fs.defaultFS"="${fs}",
                "hadoop.username" = "${user_name}",
                "max_file_size" = "${maxFileSize}",
                "delete_existing_files"="${isDelete}"
            );
        """
        // the outfile statement returns a single summary row with 4 columns
        // (file number, total rows, file size, url)
        assertTrue(table.size() == 1)
        assertTrue(table[0].size == 4)
        log.info("outfile result = " + table[0])
        assertEquals(table[0][0], fileNumber)
        assertEquals(table[0][1], totalRows)
    }

    def test_outfile_orc_fail = {maxFileSize, isDelete ->
        test {
            sql """
                select * from ${table_export_name}
                into outfile "${fs}${outFilePath}"
                FORMAT AS ORC
                PROPERTIES(
                    "fs.defaultFS"="${fs}",
                    "hadoop.username" = "${user_name}",
                    "max_file_size" = "${maxFileSize}",
                    "delete_existing_files"="${isDelete}"
                );
            """

            // this assertion will not take effect on its own because a `check`
            // callback is declared below
            exception "max file size should between 5MB and 2GB"

            // callback: just verify that the statement failed with an exception
            check { result, exception, startTime, endTime ->
                assertTrue(exception != null)
            }
        }
    }

    // 3MB and 2.1GB are outside the documented [5MB, 2GB] range and must fail
    test_outfile_orc_fail('3MB', true)
    test_outfile_orc_fail('2.1GB', true)
    test_outfile_orc_success('5MB', true, 3, 2000000)
    test_outfile_orc_success('63MB', true, 3, 2000000)
    test_outfile_orc_success('64MB', true, 3, 2000000)
    test_outfile_orc_success('80MB', true, 2, 2000000)
}