cherry pick from #42257 Co-authored-by: Socrates <suxiaogang223@icloud.com>
This commit is contained in:
committed by
GitHub
parent
d67007c72d
commit
cdd32d9582
@ -106,3 +106,67 @@ insert into serde_test4 values(1, "abc"),(2, "def");
|
||||
insert into serde_test5 values(1, "abc"),(2, "def");
|
||||
insert into serde_test6 values(1, "abc"),(2, "def");
|
||||
insert into serde_test7 values(1, null),(2, "|||"),(3, "aaa"),(4, "\"null\"");
|
||||
|
||||
CREATE TABLE test_open_csv_default_prop (
|
||||
id INT,
|
||||
name STRING,
|
||||
age INT,
|
||||
salary DOUBLE,
|
||||
is_active BOOLEAN,
|
||||
hire_date DATE,
|
||||
last_login TIMESTAMP,
|
||||
rating FLOAT,
|
||||
description STRING
|
||||
)
|
||||
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
|
||||
STORED AS TEXTFILE;
|
||||
|
||||
CREATE TABLE test_open_csv_standard_prop (
|
||||
id INT,
|
||||
name STRING,
|
||||
age INT,
|
||||
salary DOUBLE,
|
||||
is_active BOOLEAN,
|
||||
hire_date DATE,
|
||||
last_login TIMESTAMP,
|
||||
rating FLOAT,
|
||||
description STRING
|
||||
)
|
||||
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
|
||||
WITH SERDEPROPERTIES (
|
||||
"separatorChar" = ",",
|
||||
"quoteChar" = "\"",
|
||||
"escapeChar" = "\\"
|
||||
)
|
||||
STORED AS TEXTFILE;
|
||||
|
||||
CREATE TABLE test_open_csv_custom_prop (
|
||||
id INT,
|
||||
name STRING,
|
||||
age INT,
|
||||
salary DOUBLE,
|
||||
is_active BOOLEAN,
|
||||
hire_date DATE,
|
||||
last_login TIMESTAMP,
|
||||
rating FLOAT,
|
||||
description STRING
|
||||
)
|
||||
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
|
||||
WITH SERDEPROPERTIES (
|
||||
"separatorChar" = "\t",
|
||||
"quoteChar" = "\'",
|
||||
"escapeChar" = "|"
|
||||
)
|
||||
STORED AS TEXTFILE;
|
||||
|
||||
INSERT INTO TABLE test_open_csv_default_prop VALUES
|
||||
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
|
||||
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');
|
||||
|
||||
INSERT INTO TABLE test_open_csv_standard_prop VALUES
|
||||
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
|
||||
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');
|
||||
|
||||
INSERT INTO TABLE test_open_csv_custom_prop VALUES
|
||||
(1, 'John Doe', 28, 50000.75, true, '2022-01-15', '2023-10-21 14:30:00', 4.5, 'Senior Developer'),
|
||||
(2, 'Jane,Smith', NULL, NULL, false, '2020-05-20', NULL, NULL, '\"Project Manager\"');
|
||||
|
||||
@ -19,6 +19,7 @@ package org.apache.doris.datasource.hive;
|
||||
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import org.apache.hadoop.hive.metastore.api.Table;
|
||||
import org.apache.hadoop.hive.serde2.OpenCSVSerde;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
@ -27,15 +28,12 @@ import java.util.Set;
|
||||
|
||||
public class HiveProperties {
|
||||
public static final String PROP_FIELD_DELIMITER = "field.delim";
|
||||
public static final String PROP_SEPARATOR_CHAR = "separatorChar";
|
||||
public static final String PROP_SERIALIZATION_FORMAT = "serialization.format";
|
||||
public static final String DEFAULT_FIELD_DELIMITER = "\1"; // "\x01"
|
||||
|
||||
public static final String PROP_LINE_DELIMITER = "line.delim";
|
||||
public static final String DEFAULT_LINE_DELIMITER = "\n";
|
||||
|
||||
public static final String PROP_QUOTE_CHAR = "quoteChar";
|
||||
|
||||
public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim";
|
||||
public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim";
|
||||
public static final String DEFAULT_COLLECTION_DELIMITER = "\2";
|
||||
@ -49,6 +47,14 @@ public class HiveProperties {
|
||||
public static final String PROP_NULL_FORMAT = "serialization.null.format";
|
||||
public static final String DEFAULT_NULL_FORMAT = "\\N";
|
||||
|
||||
// The following properties are used for OpenCsvSerde.
|
||||
public static final String PROP_SEPARATOR_CHAR = OpenCSVSerde.SEPARATORCHAR;
|
||||
public static final String DEFAULT_SEPARATOR_CHAR = ",";
|
||||
public static final String PROP_QUOTE_CHAR = OpenCSVSerde.QUOTECHAR;
|
||||
public static final String DEFAULT_QUOTE_CHAR = "\"";
|
||||
public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR;
|
||||
public static final String DEFAULT_ESCAPE_CHAR = "\\";
|
||||
|
||||
public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of(
|
||||
PROP_FIELD_DELIMITER,
|
||||
PROP_COLLECTION_DELIMITER_HIVE2,
|
||||
@ -59,37 +65,33 @@ public class HiveProperties {
|
||||
PROP_QUOTE_CHAR,
|
||||
PROP_MAP_KV_DELIMITER,
|
||||
PROP_ESCAPE_DELIMITER,
|
||||
PROP_NULL_FORMAT
|
||||
);
|
||||
PROP_ESCAPE_CHAR,
|
||||
PROP_NULL_FORMAT);
|
||||
|
||||
public static String getFieldDelimiter(Table table) {
|
||||
// This method is used for text format.
|
||||
// If you need compatibility with csv format, please use `getColumnSeparator`.
|
||||
Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
|
||||
Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
|
||||
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
|
||||
DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat));
|
||||
}
|
||||
|
||||
public static String getColumnSeparator(Table table) {
|
||||
Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
|
||||
Optional<String> columnSeparator = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
|
||||
Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
|
||||
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
|
||||
DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator, serFormat));
|
||||
public static String getSeparatorChar(Table table) {
|
||||
Optional<String> separatorChar = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SEPARATOR_CHAR);
|
||||
return HiveMetaStoreClientHelper.firstPresentOrDefault(
|
||||
DEFAULT_SEPARATOR_CHAR, separatorChar);
|
||||
}
|
||||
|
||||
|
||||
public static String getLineDelimiter(Table table) {
|
||||
Optional<String> lineDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_LINE_DELIMITER);
|
||||
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
|
||||
DEFAULT_LINE_DELIMITER, lineDelim));
|
||||
DEFAULT_LINE_DELIMITER, lineDelim));
|
||||
}
|
||||
|
||||
public static String getMapKvDelimiter(Table table) {
|
||||
Optional<String> mapkvDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_MAP_KV_DELIMITER);
|
||||
return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
|
||||
DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
|
||||
DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
|
||||
}
|
||||
|
||||
public static String getCollectionDelimiter(Table table) {
|
||||
@ -101,14 +103,6 @@ public class HiveProperties {
|
||||
DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3));
|
||||
}
|
||||
|
||||
public static Optional<String> getQuoteChar(Table table) {
|
||||
Map<String, String> serdeParams = table.getSd().getSerdeInfo().getParameters();
|
||||
if (serdeParams.containsKey(PROP_QUOTE_CHAR)) {
|
||||
return Optional.of(serdeParams.get(PROP_QUOTE_CHAR));
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
public static Optional<String> getEscapeDelimiter(Table table) {
|
||||
Optional<String> escapeDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_DELIMITER);
|
||||
if (escapeDelim.isPresent()) {
|
||||
@ -127,6 +121,16 @@ public class HiveProperties {
|
||||
return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_NULL_FORMAT, nullFormat);
|
||||
}
|
||||
|
||||
public static String getQuoteChar(Table table) {
|
||||
Optional<String> quoteChar = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_QUOTE_CHAR);
|
||||
return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_QUOTE_CHAR, quoteChar);
|
||||
}
|
||||
|
||||
public static String getEscapeChar(Table table) {
|
||||
Optional<String> escapeChar = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_ESCAPE_CHAR);
|
||||
return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_ESCAPE_CHAR, escapeChar);
|
||||
}
|
||||
|
||||
// Set properties to table
|
||||
public static void setTableProperties(Table table, Map<String, String> properties) {
|
||||
HashMap<String, String> serdeProps = new HashMap<>();
|
||||
|
||||
@ -385,20 +385,36 @@ public class HiveScanNode extends FileQueryScanNode {
|
||||
protected TFileAttributes getFileAttributes() throws UserException {
|
||||
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
|
||||
Table table = hmsTable.getRemoteTable();
|
||||
// 1. set column separator
|
||||
textParams.setColumnSeparator(HiveProperties.getColumnSeparator(table));
|
||||
// 2. set line delimiter
|
||||
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
|
||||
// 3. set mapkv delimiter
|
||||
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
|
||||
// 4. set collection delimiter
|
||||
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
|
||||
// 5. set quote char
|
||||
HiveProperties.getQuoteChar(table).ifPresent(d -> textParams.setEnclose(d.getBytes()[0]));
|
||||
// 6. set escape delimiter
|
||||
HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
|
||||
// 7. set null format
|
||||
textParams.setNullFormat(HiveProperties.getNullFormat(table));
|
||||
// TODO: separate hive text table and OpenCsv table
|
||||
String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
|
||||
if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
|
||||
// set properties of LazySimpleSerDe
|
||||
// 1. set column separator
|
||||
textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
|
||||
// 2. set line delimiter
|
||||
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
|
||||
// 3. set mapkv delimiter
|
||||
textParams.setMapkvDelimiter(HiveProperties.getMapKvDelimiter(table));
|
||||
// 4. set collection delimiter
|
||||
textParams.setCollectionDelimiter(HiveProperties.getCollectionDelimiter(table));
|
||||
// 5. set escape delimiter
|
||||
HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
|
||||
// 6. set null format
|
||||
textParams.setNullFormat(HiveProperties.getNullFormat(table));
|
||||
} else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
|
||||
// set set properties of OpenCSVSerde
|
||||
// 1. set column separator
|
||||
textParams.setColumnSeparator(HiveProperties.getSeparatorChar(table));
|
||||
// 2. set line delimiter
|
||||
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
|
||||
// 3. set enclose char
|
||||
textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]);
|
||||
// 4. set escape char
|
||||
textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]);
|
||||
} else {
|
||||
throw new UserException(
|
||||
"unsupported hive table serde: " + serDeLib);
|
||||
}
|
||||
|
||||
TFileAttributes fileAttributes = new TFileAttributes();
|
||||
fileAttributes.setTextParams(textParams);
|
||||
|
||||
@ -39,6 +39,18 @@ b 2.2
|
||||
3 aaa
|
||||
4 "null"
|
||||
|
||||
-- !test_open_csv_default_prop --
|
||||
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
|
||||
2 Jane,Smith FALSE 2020-05-20 ""Project Manager""
|
||||
|
||||
-- !test_open_csv_standard_prop --
|
||||
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
|
||||
2 Jane,Smith FALSE 2020-05-20 "Project Manager"
|
||||
|
||||
-- !test_open_csv_custom_prop --
|
||||
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
|
||||
2 Jane,Smith FALSE 2020-05-20 "Project Manager"
|
||||
|
||||
-- !1 --
|
||||
a 1.1
|
||||
b 2.2
|
||||
@ -79,3 +91,14 @@ b 2.2
|
||||
3 aaa
|
||||
4 "null"
|
||||
|
||||
-- !test_open_csv_default_prop --
|
||||
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
|
||||
2 Jane,Smith FALSE 2020-05-20 ""Project Manager""
|
||||
|
||||
-- !test_open_csv_standard_prop --
|
||||
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
|
||||
2 Jane,Smith FALSE 2020-05-20 "Project Manager"
|
||||
|
||||
-- !test_open_csv_custom_prop --
|
||||
1 John Doe 28 50000.75 TRUE 2022-01-15 2023-10-21 14:30:00 4.5 Senior Developer
|
||||
2 Jane,Smith FALSE 2020-05-20 "Project Manager"
|
||||
|
||||
@ -51,6 +51,10 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte
|
||||
hive_docker """truncate table regression.serde_test8;"""
|
||||
sql """insert into ${catalog_name}.regression.serde_test8 select * from ${catalog_name}.regression.serde_test7;"""
|
||||
qt_9 """select * from ${catalog_name}.regression.serde_test8 order by id;"""
|
||||
|
||||
qt_test_open_csv_default_prop """select * from ${catalog_name}.regression.test_open_csv_default_prop order by id;"""
|
||||
qt_test_open_csv_standard_prop """select * from ${catalog_name}.regression.test_open_csv_standard_prop order by id;"""
|
||||
qt_test_open_csv_custom_prop """select * from ${catalog_name}.regression.test_open_csv_custom_prop order by id;"""
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user