[enhance](hive) Add regression-test cases for hive text ddl and hive text insert and fix reading null string bug #42200 (#42273)

Cherry-picked from #42200.

Co-authored-by: Socrates <suxiaogang223@icloud.com>

Author: Rayner Chen
Date: 2024-10-22 23:56:57 +08:00
Committed by: GitHub
Parent commit: d7c3369ce7
Commit: 157d67e7ca

4 changed files with 200 additions and 54 deletions


@@ -624,7 +624,7 @@ template <bool from_json>
 Status CsvReader::deserialize_nullable_string(IColumn& column, Slice& slice) {
     auto& null_column = assert_cast<ColumnNullable&>(column);
     if (!(from_json && _options.converted_from_string && slice.trim_double_quotes())) {
-        if (slice.size == 2 && slice[0] == '\\' && slice[1] == 'N') {
+        if (slice.compare(Slice(_options.null_format, _options.null_len)) == 0) {
             null_column.insert_data(nullptr, 0);
             return Status::OK();
         }
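
This one-line change is the heart of the bug fix: the reader used to treat only the literal two-byte sequence \N as NULL, so a Hive text table declared with, say, 'serialization.null.format'='null' read its null markers back as ordinary strings. The new code compares each field against the configured marker instead. Below is a minimal standalone sketch of the before/after logic, using a simplified stand-in for Doris' Slice; in the real code the comparison goes through Slice::compare, and the marker arrives via _options.null_format / _options.null_len, populated from the table's serialization.null.format property.

#include <cassert>
#include <cstddef>
#include <cstring>
#include <string>

// Simplified stand-in for Doris' Slice: a pointer/length view over one text field.
struct Slice {
    const char* data;
    size_t size;
};

// Old behavior: only the literal two-byte sequence \N marked a NULL.
bool is_null_old(const Slice& s) {
    return s.size == 2 && s.data[0] == '\\' && s.data[1] == 'N';
}

// New behavior: a field is NULL iff it matches the configured marker,
// whatever that string is ("\N", "null", "", ...).
bool is_null_new(const Slice& s, const std::string& null_format) {
    return s.size == null_format.size() &&
           std::memcmp(s.data, null_format.data(), s.size) == 0;
}

int main() {
    Slice field{"null", 4};
    assert(!is_null_old(field));         // old code read "null" back as the string "null"
    assert(is_null_new(field, "null"));  // new code honors the table property
    return 0;
}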


@@ -560,7 +560,14 @@ CREATE TABLE `all_types_text`(
   `t_array_string_all_nulls` array<string>,
   `dt` int)
 stored as textfile
-TBLPROPERTIES("line.delim"="\n", "field.delim"="\1");
+TBLPROPERTIES(
+  'field.delim'='\t',
+  'line.delim'='\n',
+  'collection.delim'=',',
+  'mapkey.delim'=':',
+  'escape.delim'='|',
+  'serialization.null.format'='null'
+);
 
 CREATE TABLE all_types_par_text(
   `boolean_col` boolean,
@@ -628,4 +635,11 @@ CREATE TABLE all_types_par_text(
 PARTITIONED BY (
   `dt` int)
 stored as textfile
-TBLPROPERTIES("line.delim"="\n", "field.delim"="\1");
+TBLPROPERTIES(
+  'field.delim'='\t',
+  'line.delim'='\n',
+  'collection.delim'=',',
+  'mapkey.delim'=':',
+  'escape.delim'='|',
+  'serialization.null.format'='null'
+);
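
Both regression tables now pin every LazySimpleSerDe property instead of only line.delim and field.delim, so reads exercise a non-default delimiter set and, crucially, a non-default null marker. The following is a hypothetical sketch (plain C++, not Doris code) of how one such row decomposes under these properties: fields split on field.delim ('\t'), map entries on collection.delim (','), keys from values on mapkey.delim (':'), and the literal token "null" (serialization.null.format) is the marker the fixed reader must recognize as NULL.

#include <iostream>
#include <string>
#include <vector>

// Split a string on a single-byte delimiter, keeping empty parts.
static std::vector<std::string> split(const std::string& s, char delim) {
    std::vector<std::string> parts;
    size_t start = 0, pos;
    while ((pos = s.find(delim, start)) != std::string::npos) {
        parts.push_back(s.substr(start, pos - start));
        start = pos + 1;
    }
    parts.push_back(s.substr(start));
    return parts;
}

int main() {
    // One text row shaped like the DDL above: id, name, a map column, a NULL.
    const std::string row = "1\tAlice\tk1:v1,k2:v2\tnull";
    for (const std::string& field : split(row, '\t')) {        // field.delim
        if (field == "null") {                                 // serialization.null.format
            std::cout << "NULL" << std::endl;
        } else if (field.find(':') != std::string::npos) {     // mapkey.delim present
            for (const std::string& kv : split(field, ',')) {  // collection.delim
                std::cout << "  " << kv << std::endl;          // one key:value entry
            }
        } else {
            std::cout << field << std::endl;
        }
    }
    return 0;
}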


@@ -0,0 +1,57 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !default_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+-- !hive_docker_default_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+-- !standard_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+-- !hive_docker_standard_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+-- !different_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+-- !hive_docker_different_properties --
+1	Alice	["tag1,tag2"]	{"key1":"value1,key2:value2\\u00042"}
+-- !default_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+-- !hive_docker_default_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+-- !standard_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+-- !hive_docker_standard_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2"}
+2	Bob	["tagA","tagB"]	{"keyA":"valueA","keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC","keyD":"valueD"}
+-- !different_properties --
+1	Alice	["tag1", "tag2"]	{"key1":"value1", "key2":"value2"}
+2	Bob	["tagA", "tagB"]	{"keyA":"valueA", "keyB":"valueB"}
+3	Charlie	\N	{"keyC":"valueC", "keyD":"valueD"}
+-- !hive_docker_different_properties --
+1	Alice	["tag1","tag2"]	{"key1":"value1","key2":"value2\\u00042"}


@@ -17,62 +17,137 @@
 suite("test_hive_ddl_text_format", "p0,external,hive,external_docker,external_docker_hive") {
     String enabled = context.config.otherConfigs.get("enableHiveTest")
-    if (enabled != null && enabled.equalsIgnoreCase("true")) {
-        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
-        String hms_port = context.config.otherConfigs.get("hive3HmsPort")
-        String hdfs_port = context.config.otherConfigs.get("hive3HdfsPort")
-        String catalog_name = "test_hive_ddl_text_format"
-        String table_name = "table_with_pars";
-        sql """drop catalog if exists ${catalog_name};"""
-        sql """
-            create catalog if not exists ${catalog_name} properties (
-                'type'='hms',
-                'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}',
-                'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfs_port}',
-                'use_meta_cache' = 'true'
-            );
-        """
-        logger.info("catalog " + catalog_name + " created")
-        sql """switch ${catalog_name};"""
-        logger.info("switched to catalog " + catalog_name)
-        sql """use `default`;"""
-        sql """ drop table if exists tb_text """
-        sql """
-            create table tb_text (
-                id int,
-                `name` string
-            ) PROPERTIES (
-                'compression'='gzip',
-                'file_format'='text',
-                'field.delim'='\t',
-                'line.delim'='\n',
-                'collection.delim'=';',
-                'mapkey.delim'=':',
-                'serialization.null.format'='\\N'
-            );
-        """
-        String serde = "'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'"
-        String input_format = "'org.apache.hadoop.mapred.TextInputFormat'"
-        String output_format = "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'"
-        String doris_fileformat = "'doris.file_format'='text'"
-        String filed_delim = "'field.delim'"
-        String line_delim = "'line.delim'"
-        String mapkey_delim = "'mapkey.delim'"
-        def create_tbl_res = sql """ show create table tb_text """
-        String res = create_tbl_res.toString()
-        logger.info("${res}")
-        assertTrue(res.containsIgnoreCase("${serde}"))
-        assertTrue(res.containsIgnoreCase("${input_format}"))
-        assertTrue(res.containsIgnoreCase("${output_format}"))
-        assertTrue(res.containsIgnoreCase("${doris_fileformat}"))
-        assertTrue(res.containsIgnoreCase("${filed_delim}"))
-        assertTrue(res.containsIgnoreCase("${filed_delim}"))
-        assertTrue(res.containsIgnoreCase("${line_delim}"))
-        assertTrue(res.containsIgnoreCase("${mapkey_delim}"))
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return;
+    }
+    for (String hivePrefix : ["hive2", "hive3"]) {
+        setHivePrefix(hivePrefix)
+        try {
+            String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+            String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+            String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort")
+            String catalog_name = "test_hive_ddl_text_format"
+            String table_name = "table_with_pars";
+            sql """drop catalog if exists ${catalog_name};"""
+            sql """
+                create catalog if not exists ${catalog_name} properties (
+                    'type'='hms',
+                    'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}',
+                    'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfs_port}',
+                    'use_meta_cache' = 'true'
+                );
+            """
+            logger.info("catalog " + catalog_name + " created")
+            sql """switch ${catalog_name};"""
+            logger.info("switched to catalog " + catalog_name)
+            sql """use `default`;"""
+            sql """ drop table if exists text_table_default_properties """
+            sql """
+                create table text_table_default_properties (
+                    id int,
+                    `name` string,
+                    tags array<string>,
+                    attributes map<string, string>
+                ) PROPERTIES (
+                    'file_format'='text'
+                );
+            """
+            sql """
+                INSERT INTO text_table_default_properties VALUES
+                (1, 'Alice', array('tag1', 'tag2'), map('key1', 'value1', 'key2', 'value2')),
+                (2, 'Bob', array('tagA', 'tagB'), map('keyA', 'valueA', 'keyB', 'valueB')),
+                (3, 'Charlie', NULL, map('keyC', 'valueC', 'keyD', 'valueD'));
+            """
+            order_qt_default_properties """ select * from text_table_default_properties """
+            order_qt_hive_docker_default_properties """ select * from text_table_default_properties """
+            sql """ drop table if exists text_table_standard_properties """
+            // Escape characters need to be considered in groovy scripts
+            sql """
+                create table text_table_standard_properties (
+                    id int,
+                    `name` string,
+                    tags array<string>,
+                    attributes map<string, string>
+                ) PROPERTIES (
+                    'compression'='plain',
+                    'file_format'='text',
+                    'field.delim'='\\1',
+                    'line.delim'='\\n',
+                    'collection.delim'='\\2',
+                    'mapkey.delim'='\\3',
+                    'escape.delim'= '\\\\',
+                    'serialization.null.format'='\\\\N'
+                );
+            """
+            sql """
+                INSERT INTO text_table_standard_properties VALUES
+                (1, 'Alice', array('tag1', 'tag2'), map('key1', 'value1', 'key2', 'value2')),
+                (2, 'Bob', array('tagA', 'tagB'), map('keyA', 'valueA', 'keyB', 'valueB')),
+                (3, 'Charlie', NULL, map('keyC', 'valueC', 'keyD', 'valueD'));
+            """
+            order_qt_standard_properties """ select * from text_table_standard_properties """
+            order_qt_hive_docker_standard_properties """ select * from text_table_standard_properties order by id; """
+            sql """ drop table if exists text_table_different_properties """
+            sql """
+                create table text_table_different_properties (
+                    id int,
+                    `name` string,
+                    tags array<string>,
+                    attributes map<string, string>
+                ) PROPERTIES (
+                    'compression'='gzip',
+                    'file_format'='text',
+                    'field.delim'='A',
+                    'line.delim'='\\4',
+                    'collection.delim'=',',
+                    'mapkey.delim'=':',
+                    'escape.delim'='|',
+                    'serialization.null.format'='null'
+                );
+            """
+            sql """
+                INSERT INTO text_table_different_properties VALUES
+                (1, 'Alice', array('tag1', 'tag2'), map('key1', 'value1', 'key2', 'value2')),
+                (2, 'Bob', array('tagA', 'tagB'), map('keyA', 'valueA', 'keyB', 'valueB')),
+                (3, 'Charlie', NULL, map('keyC', 'valueC', 'keyD', 'valueD'));
+            """
+            order_qt_different_properties """ select * from text_table_different_properties """
+            order_qt_hive_docker_different_properties """ select * from text_table_different_properties order by id; """
+            String serde = "'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'"
+            String input_format = "'org.apache.hadoop.mapred.TextInputFormat'"
+            String output_format = "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'"
+            String doris_fileformat = "'doris.file_format'='text'"
+            String filed_delim = "'field.delim'"
+            String line_delim = "'line.delim'"
+            String mapkey_delim = "'mapkey.delim'"
+            String collection_delim = "'collection.delim'"
+            String escape_delim = "'escape.delim'"
+            String serialization_null_format = "'serialization.null.format'"
+            def create_tbl_res = sql """ show create table text_table_standard_properties """
+            String res = create_tbl_res.toString()
+            logger.info("${res}")
+            assertTrue(res.containsIgnoreCase("${serde}"))
+            assertTrue(res.containsIgnoreCase("${input_format}"))
+            assertTrue(res.containsIgnoreCase("${output_format}"))
+            assertTrue(res.containsIgnoreCase("${doris_fileformat}"))
+            assertTrue(res.containsIgnoreCase("${filed_delim}"))
+            assertTrue(res.containsIgnoreCase("${filed_delim}"))
+            assertTrue(res.containsIgnoreCase("${line_delim}"))
+            assertTrue(res.containsIgnoreCase("${mapkey_delim}"))
+            assertTrue(res.containsIgnoreCase("${collection_delim}"))
+            assertTrue(res.containsIgnoreCase("${escape_delim}"))
+            assertTrue(res.containsIgnoreCase("${serialization_null_format}"))
+        } finally {
+        }
     }
 }
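
The script's comment about escape characters is worth unpacking: the delimiter literals pass through two layers of unescaping. Groovy's triple-quoted string first collapses '\\1' to '\1' in the SQL text; the further assumption here (suggested by the table name text_table_standard_properties) is that the DDL layer then resolves those escapes to Hive's default text-format control bytes. A small illustrative program of that assumed mapping:

#include <cstdio>

int main() {
    // Assumed resolution of the delimiter literals in
    // text_table_standard_properties after both unescaping passes
    // (Groovy string, then SQL/DDL escape handling).
    struct Delim { const char* script_literal; unsigned char byte; const char* role; };
    const Delim delims[] = {
        {"'\\\\1'", 0x01, "field.delim      (Hive text default, ^A)"},
        {"'\\\\2'", 0x02, "collection.delim (Hive text default, ^B)"},
        {"'\\\\3'", 0x03, "mapkey.delim     (Hive text default, ^C)"},
        {"'\\\\n'", 0x0a, "line.delim       (newline)"},
    };
    for (const Delim& d : delims) {
        std::printf("%-8s -> 0x%02x  %s\n", d.script_literal, d.byte, d.role);
    }
    return 0;
}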