[enhance](hive) support reading hive table with OpenCSVSerde #42257 (#42940)

cherry pick from #42257

Co-authored-by: Socrates <suxiaogang223@icloud.com>
This commit is contained in:
Mingyu Chen (Rayner)
2024-10-31 11:12:07 +08:00
committed by GitHub
parent d67007c72d
commit cdd32d9582
5 changed files with 148 additions and 37 deletions

View File

@@ -106,3 +106,67 @@ insert into serde_test4 values(1, "abc"),(2, "def");
insert into serde_test5 values(1, "abc"),(2, "def");
insert into serde_test6 values(1, "abc"),(2, "def");
insert into serde_test7 values(1, null),(2, "|||"),(3, "aaa"),(4, "\"null\"");
CREATE TABLE test_open_csv_default_prop (
id INT,
name STRING,
age INT,
salary DOUBLE,
is_active BOOLEAN,
hire_date DATE,
last_login TIMESTAMP,
rating FLOAT,
description STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
STORED AS TEXTFILE;
CREATE TABLE test_open_csv_standard_prop (
id INT,
name STRING,
age INT,
salary DOUBLE,
is_active BOOLEAN,
hire_date DATE,
last_login TIMESTAMP,
rating FLOAT,
description STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = ",",
"quoteChar" = "\"",
"escapeChar" = "\\"
)
STORED AS TEXTFILE;
CREATE TABLE test_open_csv_custom_prop (
id INT,
name STRING,
age INT,
salary DOUBLE,
is_active BOOLEAN,
hire_date DATE,
last_login TIMESTAMP,
rating FLOAT,
description STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = "\t",
"quoteChar" = "\'",
"escapeChar" = "|"
)
STORED AS TEXTFILE;
INSERT INTO TABLE test_open_csv_default_prop VALUES
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');
INSERT INTO TABLE test_open_csv_standard_prop VALUES
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');
INSERT INTO TABLE test_open_csv_custom_prop VALUES
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');

View File

@@ -19,6 +19,7 @@ package org.apache.doris.datasource.hive;
import com.google.common.collect.ImmutableSet;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde2.OpenCSVSerde;
import java.util.HashMap;
import java.util.Map;
@@ -27,15 +28,12 @@ import java.util.Set;
public class HiveProperties {
public static final String PROP_FIELD_DELIMITER = "field.delim";
public static final String PROP_SEPARATOR_CHAR = "separatorChar";
public static final String PROP_SERIALIZATION_FORMAT = "serialization.format";
public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01"
public static final String PROP_LINE_DELIMITER = "line.delim";
public static final String DEFAULT_LINE_DELIMITER = "\n";
public static final String PROP_QUOTE_CHAR = "quoteChar";
public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim";
public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim";
public static final String DEFAULT_COLLECTION_DELIMITER = "\2";
@@ -49,6 +47,14 @@ public class HiveProperties {
public static final String PROP_NULL_FORMAT = "serialization.null.format";
public static final String DEFAULT_NULL_FORMAT = "\\N";
// The following properties are used for OpenCSVSerde.
public static final String PROP_SEPARATOR_CHAR = OpenCSVSerde.SEPARATORCHAR;
public static final String DEFAULT_SEPARATOR_CHAR = ",";
public static final String PROP_QUOTE_CHAR = OpenCSVSerde.QUOTECHAR;
public static final String DEFAULT_QUOTE_CHAR = "\"";
public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR;
public static final String DEFAULT_ESCAPE_CHAR = "\\";
public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of(
PROP_FIELD_DELIMITER,
PROP_COLLECTION_DELIMITER_HIVE2,
@@ -59,37 +65,33 @@ public class HiveProperties {
PROP_QUOTE_CHAR,
PROP_MAP_KV_DELIMITER,
PROP_ESCAPE_DELIMITER,
PROP_NULL_FORMAT
);
PROP_ESCAPE_CHAR,
PROP_NULL_FORMAT);
public static String getFieldDelimiter(Table table) {
// This method is used for text format.
// If you need compatibility with csv format, please use `getColumnSeparator`.
Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat));
}
public static String getColumnSeparator(Table table) {
Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
Optional<String> columnSeparator = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator, serFormat));
public static String getSeparatorChar(Table table) {
Optional<String> separatorChar = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
return HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_SEPARATOR_CHAR, separatorChar);
}
public static String getLineDelimiter(Table table) {
Optional<String> lineDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_LINE_DELIMITER);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_LINE_DELIMITER, lineDelim));
DEFAULT_LINE_DELIMITER, lineDelim));
}
public static String getMapKvDelimiter(Table table) {
Optional<String> mapkvDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_MAP_KV_DELIMITER);
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
}
public static String getCollectionDelimiter(Table table) {
@@ -101,14 +103,6 @@ public class HiveProperties {
DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3));
}
public static Optional<String> getQuoteChar(Table table) {
Map<String, String> serdeParams = table.getSd().getSerdeInfo().getParameters();
if (serdeParams.containsKey(PROP_QUOTE_CHAR)) {
return Optional.of(serdeParams.get(PROP_QUOTE_CHAR));
}
return Optional.empty();
}
public static Optional<String> getEscapeDelimiter(Table table) {
Optional<String> escapeDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_DELIMITER);
if (escapeDelim.isPresent()) {
@@ -127,6 +121,16 @@ public class HiveProperties {
return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_NULL_FORMAT, nullFormat);
}
public static String getQuoteChar(Table table) {
Optional<String> quoteChar = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_QUOTE_CHAR);
return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_QUOTE_CHAR, quoteChar);
}
public static String getEscapeChar(Table table) {
Optional<String> escapeChar = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_CHAR);
return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_ESCAPE_CHAR, escapeChar);
}
// Set properties to table
public static void setTableProperties(Table table, Map<String, String> properties) {
HashMap<String, String> serdeProps = new HashMap<>();

View File

@@ -385,20 +385,36 @@ public class HiveScanNode extends FileQueryScanNode {
protected TFileAttributes getFileAttributes() throws UserException {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
Table table = hmsTable.getRemoteTable();
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getColumnSeparator(table));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set mapkv delimiter
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
// 4. set collection delimiter
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
// 5. set quote char
HiveProperties.getQuoteChar(table).ifPresent(d -> textParams.setEnclose(d.getBytes()[0]));
// 6. set escape delimiter
HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
// 7. set null format
textParams.setNullFormat(HiveProperties.getNullFormat(table));
// TODO: separate hive text table and OpenCsv table
String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
// set properties of LazySimpleSerDe
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set mapkv delimiter
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
// 4. set collection delimiter
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
// 5. set escape delimiter
HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
// 6. set null format
textParams.setNullFormat(HiveProperties.getNullFormat(table));
} else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
// set properties of OpenCSVSerde
// 1. set column separator
textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set enclose char
textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]);
// 4. set escape char
textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]);
} else {
throw new UserException(
"unsupported hive table serde: " + serDeLib);
}
TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);

View File

@@ -39,6 +39,18 @@ b 2.2
3 aaa
4 "null"
-- !test_open_csv_default_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 ""Project Manager""
-- !test_open_csv_standard_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"
-- !test_open_csv_custom_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"
-- !1 --
a 1.1
b 2.2
@@ -79,3 +91,14 @@ b 2.2
3 aaa
4 "null"
-- !test_open_csv_default_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 ""Project Manager""
-- !test_open_csv_standard_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"
-- !test_open_csv_custom_prop --
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
2 Jane,Smith FALSE 2020-05-20 "Project Manager"

View File

@@ -51,6 +51,10 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte
hive_docker """truncate table regression.serde_test8;"""
sql """insert into ${catalog_name}.regression.serde_test8 select * from ${catalog_name}.regression.serde_test7;"""
qt_9 """select * from ${catalog_name}.regression.serde_test8 order by id;"""
qt_test_open_csv_default_prop """select * from ${catalog_name}.regression.test_open_csv_default_prop order by id;"""
qt_test_open_csv_standard_prop """select * from ${catalog_name}.regression.test_open_csv_standard_prop order by id;"""
qt_test_open_csv_custom_prop """select * from ${catalog_name}.regression.test_open_csv_custom_prop order by id;"""
}
}