[Improvement](outfile) Support ORC format in outfile (#13019)

This commit is contained in:
Gabriel
2022-10-08 20:56:32 +08:00
committed by GitHub
parent 344377beb7
commit 869fe2bc5d
10 changed files with 838 additions and 32 deletions

View File

@ -20,11 +20,13 @@ package org.apache.doris.analysis;
import org.apache.doris.backup.HdfsStorage;
import org.apache.doris.backup.S3Storage;
import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.FeNameFormat;
import org.apache.doris.common.Pair;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.BrokerUtil;
import org.apache.doris.common.util.ParseUtil;
@ -64,6 +66,7 @@ public class OutFileClause {
public static final Map<String, TParquetDataType> PARQUET_DATA_TYPE_MAP = Maps.newHashMap();
public static final Map<String, TParquetCompressionType> PARQUET_COMPRESSION_TYPE_MAP = Maps.newHashMap();
public static final Map<String, TParquetVersion> PARQUET_VERSION_MAP = Maps.newHashMap();
public static final Set<String> ORC_DATA_TYPE = Sets.newHashSet();
static {
RESULT_COL_NAMES.add("FileNumber");
@ -100,6 +103,15 @@ public class OutFileClause {
PARQUET_VERSION_MAP.put("v1", TParquetVersion.PARQUET_1_0);
PARQUET_VERSION_MAP.put("latest", TParquetVersion.PARQUET_2_LATEST);
ORC_DATA_TYPE.add("bigint");
ORC_DATA_TYPE.add("boolean");
ORC_DATA_TYPE.add("double");
ORC_DATA_TYPE.add("float");
ORC_DATA_TYPE.add("int");
ORC_DATA_TYPE.add("smallint");
ORC_DATA_TYPE.add("string");
ORC_DATA_TYPE.add("tinyint");
}
public static final String LOCAL_FILE_PREFIX = "file:///";
@ -114,6 +126,7 @@ public class OutFileClause {
private static final String PROP_MAX_FILE_SIZE = "max_file_size";
private static final String PROP_SUCCESS_FILE_NAME = "success_file_name";
private static final String PARQUET_PROP_PREFIX = "parquet.";
private static final String ORC_PROP_PREFIX = "orc.";
private static final String SCHEMA = "schema";
private static final long DEFAULT_MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 * 1024; // 1GB
@ -138,6 +151,8 @@ public class OutFileClause {
private List<TParquetSchema> parquetSchemas = new ArrayList<>();
private List<Pair<String, String>> orcSchemas = new ArrayList<>();
private boolean isAnalyzed = false;
private String headerType = "";
@ -207,6 +222,9 @@ public class OutFileClause {
case "parquet":
fileFormatType = TFileFormatType.FORMAT_PARQUET;
break;
case "orc":
fileFormatType = TFileFormatType.FORMAT_ORC;
break;
case "csv_with_names":
headerType = FeConstants.csv_with_names;
fileFormatType = TFileFormatType.FORMAT_CSV_PLAIN;
@ -230,6 +248,146 @@ public class OutFileClause {
if (isParquetFormat()) {
analyzeForParquetFormat(resultExprs, colLabels);
} else if (isOrcFormat()) {
analyzeForOrcFormat(resultExprs, colLabels);
}
}
/**
 * Generates the ORC output schema from the select list: one (columnLabel, orcType)
 * pair per result expression, appended to {@code orcSchemas}.
 * Must only be called when no user-supplied schema exists (the caller checks
 * {@code orcSchemas.isEmpty()} first).
 *
 * @param resultExprs expressions of the select list, in output order
 * @param colLabels   column labels aligned index-for-index with {@code resultExprs}
 * @throws AnalysisException on wildcard decimals or unsupported column types
 */
private void genOrcSchema(List<Expr> resultExprs, List<String> colLabels) throws AnalysisException {
    // Fix: the original checked parquetSchemas here (copy-paste from the parquet path);
    // this method fills orcSchemas, so that is the list that must still be empty.
    Preconditions.checkState(this.orcSchemas.isEmpty());
    for (int i = 0; i < resultExprs.size(); ++i) {
        Expr expr = resultExprs.get(i);
        String type = "";
        switch (expr.getType().getPrimitiveType()) {
            case BOOLEAN:
            case TINYINT:
            case SMALLINT:
            case INT:
            case BIGINT:
            case FLOAT:
            case DOUBLE:
            case STRING:
                // These primitive names match ORC type names directly.
                type = expr.getType().getPrimitiveType().toString().toLowerCase();
                break;
            case HLL:
            case BITMAP:
                if (!(ConnectContext.get() != null && ConnectContext.get()
                        .getSessionVariable().isReturnObjectDataAsBinary())) {
                    // Leaves type empty; analyzeForOrcFormat rejects it later with a
                    // clear "does not support column type" error.
                    break;
                }
                type = "string";
                break;
            case DATE:
            case DATETIME:
            case DATETIMEV2:
            case DATEV2:
            case CHAR:
            case VARCHAR:
                // Date/time and character types are exported as ORC strings.
                type = "string";
                break;
            case DECIMALV2:
                if (!expr.getType().isWildcardDecimal()) {
                    // DECIMALV2 has a fixed max precision; only the scale is per-column.
                    type = String.format("decimal(%d, %d)", ScalarType.MAX_DECIMAL128_PRECISION,
                            ((ScalarType) expr.getType()).decimalScale());
                } else {
                    throw new AnalysisException("currently ORC writer do not support WildcardDecimal!");
                }
                break;
            case DECIMAL32:
            case DECIMAL64:
            case DECIMAL128:
                if (!expr.getType().isWildcardDecimal()) {
                    type = String.format("decimal(%d, %d)", ((ScalarType) expr.getType()).getPrecision(),
                            ((ScalarType) expr.getType()).decimalScale());
                } else {
                    throw new AnalysisException("currently ORC writer do not support WildcardDecimal!");
                }
                break;
            default:
                throw new AnalysisException("currently orc do not support column type: "
                        + expr.getType().getPrimitiveType());
        }
        orcSchemas.add(Pair.of(colLabels.get(i), type));
    }
}
/**
 * Serializes the collected (name, type) pairs into an ORC struct type string,
 * e.g. {@code struct<c0:int,c1:string>}; an empty schema yields {@code struct<>}.
 */
private String serializeOrcSchema() {
    StringBuilder builder = new StringBuilder("struct<");
    String separator = "";
    for (Pair<String, String> field : this.orcSchemas) {
        builder.append(separator).append(field.first).append(":").append(field.second);
        separator = ",";
    }
    return builder.append(">").toString();
}
/**
 * Validates the ORC schema against the select list. If the user did not supply
 * a schema via properties, one is generated from the result expressions first.
 * Then checks that the schema has one entry per select item and that each entry's
 * type is compatible with the corresponding expression's type.
 *
 * @param resultExprs expressions of the select list, in output order
 * @param colLabels   column labels aligned index-for-index with {@code resultExprs}
 * @throws AnalysisException on count mismatch or incompatible column type
 */
private void analyzeForOrcFormat(List<Expr> resultExprs, List<String> colLabels) throws AnalysisException {
    if (this.orcSchemas.isEmpty()) {
        genOrcSchema(resultExprs, colLabels);
    }
    // check schema number
    if (resultExprs.size() != this.orcSchemas.size()) {
        throw new AnalysisException("Orc schema number does not equal to select item number");
    }
    // check type
    for (int i = 0; i < this.orcSchemas.size(); ++i) {
        Pair<String, String> schema = this.orcSchemas.get(i);
        Type resultType = resultExprs.get(i).getType();
        switch (resultType.getPrimitiveType()) {
            case BOOLEAN:
            case TINYINT:
            case SMALLINT:
            case INT:
            case BIGINT:
            case FLOAT:
            case DOUBLE:
            case STRING:
                // Schema type must literally match the lower-cased primitive name.
                if (!schema.second.equals(resultType.getPrimitiveType().toString().toLowerCase())) {
                    throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
                            + ", should use " + resultType.getPrimitiveType().toString() + ","
                            + " but the type of column " + i + " is " + schema.second);
                }
                break;
            case DATE:
            case DATETIME:
            case DATETIMEV2:
            case DATEV2:
            case CHAR:
            case VARCHAR:
                // All date/time and character types are written as ORC strings.
                if (!schema.second.equals("string")) {
                    throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
                            + ", should use string, but the definition type of column " + i + " is "
                            + schema.second);
                }
                break;
            case DECIMAL32:
            case DECIMAL64:
            case DECIMAL128:
            case DECIMALV2:
                if (!schema.second.startsWith("decimal")) {
                    // Fix: the original message said "should use string" (copied from the
                    // string branch); decimal columns require a decimal schema type.
                    throw new AnalysisException("project field type is " + resultType.getPrimitiveType().toString()
                            + ", should use decimal, but the definition type of column " + i + " is "
                            + schema.second);
                }
                break;
            case HLL:
            case BITMAP:
                // HLL/BITMAP are only exportable when the session returns objects as binary.
                if (ConnectContext.get() != null && ConnectContext.get()
                        .getSessionVariable().isReturnObjectDataAsBinary()) {
                    if (!schema.second.equals("string")) {
                        throw new AnalysisException("project field type is HLL/BITMAP, should use string, "
                                + "but the definition type of column " + i + " is " + schema.second);
                    }
                } else {
                    throw new AnalysisException("Orc format does not support column type: "
                            + resultType.getPrimitiveType());
                }
                break;
            default:
                throw new AnalysisException("Orc format does not support column type: "
                        + resultType.getPrimitiveType());
        }
    }
}
@ -438,6 +596,10 @@ public class OutFileClause {
getParquetProperties(processedPropKeys);
}
if (this.fileFormatType == TFileFormatType.FORMAT_ORC) {
getOrcProperties(processedPropKeys);
}
if (processedPropKeys.size() != properties.size()) {
LOG.debug("{} vs {}", processedPropKeys, properties);
throw new AnalysisException("Unknown properties: " + properties.keySet().stream()
@ -575,6 +737,41 @@ public class OutFileClause {
processedPropKeys.add(SCHEMA);
}
/**
 * Parses and validates the optional user-supplied ORC schema property.
 * Expected format (whitespace-insensitive, case-insensitive):
 * {@code "name1,type1;name2,type2;..."}, where a type is either one of
 * {@code ORC_DATA_TYPE} or {@code decimal(p,s)}.
 * If the property is absent the schema will be generated from the select items.
 *
 * @param processedPropKeys keys consumed here are added so validation of leftover
 *                          properties can detect unknown ones
 * @throws AnalysisException on an empty or malformed schema definition
 */
private void getOrcProperties(Set<String> processedPropKeys) throws AnalysisException {
    // check schema. if schema is not set, Doris will gen schema by select items
    String schema = properties.get(SCHEMA);
    if (schema == null) {
        return;
    }
    if (schema.isEmpty()) {
        throw new AnalysisException("Orc schema property should not be empty");
    }
    schema = schema.replace(" ", "").toLowerCase();
    String[] schemas = schema.split(";");
    for (String item : schemas) {
        // Fix: split with limit 2 so the comma inside "decimal(p,s)" stays in the
        // type token; the original unlimited split made every decimal entry fail
        // the length check below. Also renamed to avoid shadowing the properties map.
        String[] nameAndType = item.split(",", 2);
        if (nameAndType.length != 2) {
            throw new AnalysisException("must only contains type and column name");
        }
        String type = nameAndType[1];
        if (!ORC_DATA_TYPE.contains(type) && !type.startsWith("decimal")) {
            throw new AnalysisException("data type is not supported:" + type);
        } else if (!ORC_DATA_TYPE.contains(type) && type.startsWith("decimal")) {
            String errorMsg = "Format of decimal type must be decimal(%d,%d)";
            // Fix: the original used substring(0, "decimal".length()), which yields the
            // literal word "decimal" and can never start with "(", so every decimal
            // schema was rejected. We need the remainder AFTER the "decimal" prefix.
            String precisionAndScale = type.substring("decimal".length()).trim();
            if (!precisionAndScale.startsWith("(") || !precisionAndScale.endsWith(")")) {
                throw new AnalysisException(errorMsg);
            }
            String[] str = precisionAndScale.substring(1, precisionAndScale.length() - 1).split(",");
            if (str.length != 2) {
                throw new AnalysisException(errorMsg);
            }
        }
        orcSchemas.add(Pair.of(nameAndType[0], type));
    }
    processedPropKeys.add(SCHEMA);
}
private boolean isCsvFormat() {
return fileFormatType == TFileFormatType.FORMAT_CSV_BZ2
|| fileFormatType == TFileFormatType.FORMAT_CSV_DEFLATE
@ -589,6 +786,10 @@ public class OutFileClause {
return fileFormatType == TFileFormatType.FORMAT_PARQUET;
}
// True when the requested output file format is ORC.
private boolean isOrcFormat() {
    return fileFormatType == TFileFormatType.FORMAT_ORC;
}
@Override
public OutFileClause clone() {
return new OutFileClause(this);
@ -635,6 +836,9 @@ public class OutFileClause {
sinkOptions.setParquetVersion(parquetVersion);
sinkOptions.setParquetSchemas(parquetSchemas);
}
if (isOrcFormat()) {
sinkOptions.setOrcSchema(serializeOrcSchema());
}
return sinkOptions;
}
}