[Enhancement](Outfile/Export) Export data to csv file format with BOM (#30533)

UTF-8 encoded files on Windows systems commonly begin with a BOM (byte order mark).

We add a new user property, `with_bom`, to `Outfile/Export`. When exporting Doris data, users can therefore choose whether to write a BOM at the beginning of the CSV file.

**Usage:**
```sql
-- outfile:
select * from demo.student
into outfile "file:///xxx/export/exp_"
format as csv
properties(
    "column_separator" = ",",
    "with_bom" = "true"
);

-- Export:
EXPORT TABLE student TO "file:///xx/tmpdata/export/exp_"
PROPERTIES(
    "format" = "csv",
    "with_bom" = "true"
);
```
This commit is contained in:
Tiewei Fang
2024-02-09 17:15:55 +08:00
committed by yiguolei
parent ce892d04e5
commit f65844fae4
17 changed files with 406 additions and 9 deletions

View File

@ -103,6 +103,7 @@ public class ExportStmt extends StatementBase {
private String maxFileSize;
private String deleteExistingFiles;
private String withBom;
private SessionVariable sessionVariables;
private String qualifiedUser;
@ -228,6 +229,7 @@ public class ExportStmt extends StatementBase {
exportJob.setParallelism(this.parallelism);
exportJob.setMaxFileSize(this.maxFileSize);
exportJob.setDeleteExistingFiles(this.deleteExistingFiles);
exportJob.setWithBom(this.withBom);
if (columns != null) {
Splitter split = Splitter.on(',').trimResults().omitEmptyStrings();
@ -354,6 +356,9 @@ public class ExportStmt extends StatementBase {
// generate a random label
this.label = "export_" + UUID.randomUUID();
}
// with bom
this.withBom = properties.getOrDefault(OutFileClause.PROP_WITH_BOM, "false");
}
private void checkColumns() throws DdlException {

View File

@ -136,6 +136,7 @@ public class OutFileClause {
private static final String PROP_SUCCESS_FILE_NAME = "success_file_name";
public static final String PROP_DELETE_EXISTING_FILES = "delete_existing_files";
public static final String PROP_FILE_SUFFIX = "file_suffix";
public static final String PROP_WITH_BOM = "with_bom";
private static final String PARQUET_PROP_PREFIX = "parquet.";
private static final String SCHEMA = "schema";
@ -155,6 +156,7 @@ public class OutFileClause {
private long maxFileSizeBytes = DEFAULT_MAX_FILE_SIZE_BYTES;
private boolean deleteExistingFiles = false;
private String fileSuffix = "";
private boolean withBom = false;
private BrokerDesc brokerDesc = null;
// True if result is written to local disk.
// If set to true, the brokerDesc must be null.
@ -566,6 +568,11 @@ public class OutFileClause {
processedPropKeys.add(PROP_FILE_SUFFIX);
}
if (properties.containsKey(PROP_WITH_BOM)) {
withBom = Boolean.valueOf(properties.get(PROP_WITH_BOM)).booleanValue();
processedPropKeys.add(PROP_WITH_BOM);
}
if (properties.containsKey(PROP_SUCCESS_FILE_NAME)) {
successFileName = properties.get(PROP_SUCCESS_FILE_NAME);
FeNameFormat.checkOutfileSuccessFileName("file name", successFileName);
@ -805,6 +812,7 @@ public class OutFileClause {
sinkOptions.setMaxFileSizeBytes(maxFileSizeBytes);
sinkOptions.setDeleteExistingFiles(deleteExistingFiles);
sinkOptions.setFileSuffix(fileSuffix);
sinkOptions.setWithBom(withBom);
if (brokerDesc != null) {
sinkOptions.setBrokerProperties(brokerDesc.getProperties());

View File

@ -166,6 +166,8 @@ public class ExportJob implements Writable {
@SerializedName("tabletsNum")
private Integer tabletsNum;
@SerializedName("withBom")
private String withBom;
private TableRef tableRef;
@ -219,6 +221,7 @@ public class ExportJob implements Writable {
this.columnSeparator = "\t";
this.lineDelimiter = "\n";
this.columns = "";
this.withBom = "false";
}
public ExportJob(long jobId) {
@ -554,6 +557,7 @@ public class ExportJob implements Writable {
if (!deleteExistingFiles.isEmpty()) {
outfileProperties.put(OutFileClause.PROP_DELETE_EXISTING_FILES, deleteExistingFiles);
}
outfileProperties.put(OutFileClause.PROP_WITH_BOM, withBom);
// broker properties
// outfile clause's broker properties need 'broker.' prefix

View File

@ -349,6 +349,7 @@ public class ExportMgr {
infoMap.put("broker", job.getBrokerDesc().getName());
infoMap.put("column_separator", job.getColumnSeparator());
infoMap.put("format", job.getFormat());
infoMap.put("with_bom", job.getWithBom());
infoMap.put("line_delimiter", job.getLineDelimiter());
infoMap.put("columns", job.getColumns());
infoMap.put("tablet_num", job.getTabletsNum());

View File

@ -40,7 +40,6 @@ import org.apache.doris.scheduler.executor.TransientTaskExecutor;
import org.apache.doris.thrift.TUniqueId;
import com.google.common.collect.Lists;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import java.util.List;
@ -56,7 +55,6 @@ public class ExportTaskExecutor implements TransientTaskExecutor {
ExportJob exportJob;
@Setter
Long taskId;
private StmtExecutor stmtExecutor;
@ -205,4 +203,8 @@ public class ExportTaskExecutor implements TransientTaskExecutor {
}
return Optional.empty();
}
/**
 * Stores the given task id on this executor.
 *
 * @param taskId the id to assign to this executor's {@code taskId} field
 */
public void setTaskId(Long taskId) {
this.taskId = taskId;
}
}

View File

@ -88,6 +88,7 @@ public class ExportCommand extends Command implements ForwardWithSync {
.add(PropertyAnalyzer.PROPERTIES_LINE_DELIMITER)
.add(PropertyAnalyzer.PROPERTIES_TIMEOUT)
.add("format")
.add(OutFileClause.PROP_WITH_BOM)
.build();
private final List<String> nameParts;
@ -267,6 +268,9 @@ public class ExportCommand extends Command implements ForwardWithSync {
exportJob.setFormat(fileProperties.getOrDefault(LoadStmt.KEY_IN_PARAM_FORMAT_TYPE, "csv")
.toLowerCase());
// set withBom
exportJob.setWithBom(fileProperties.getOrDefault(OutFileClause.PROP_WITH_BOM, "false"));
// set parallelism
int parallelism;
try {