[Feature](avro) Support Apache Avro file format (#19990)

Support reading Avro files through the hdfs() or s3() table-valued functions.
```sql
select * from s3(
         "uri" = "http://127.0.0.1:9312/test2/person.avro",
         "ACCESS_KEY" = "ak",
         "SECRET_KEY" = "sk",
         "FORMAT" = "avro");
+--------+--------------+-------------+-----------------+
| name   | boolean_type | double_type | long_type       |
+--------+--------------+-------------+-----------------+
| Alyssa |            1 |     10.0012 | 100000000221133 |
| Ben    |            0 |    5555.999 |      4009990000 |
| lisi   |            0 | 5992225.999 |      9099933330 |
+--------+--------------+-------------+-----------------+

select * from hdfs(
                "uri" = "hdfs://127.0.0.1:9000/input/person2.avro",
                "fs.defaultFS" = "hdfs://127.0.0.1:9000",
                "hadoop.username" = "doris",
                "format" = "avro");
+--------+--------------+-------------+-----------+
| name   | boolean_type | double_type | long_type |
+--------+--------------+-------------+-----------+
| Alyssa |            1 |  8888.99999 |  89898989 |
+--------+--------------+-------------+-----------+
```

The current Avro reader supports only common data types; complex data types will be supported later.
This commit is contained in:
DongLiang-0
2023-06-28 21:15:35 +08:00
committed by GitHub
parent 4e082a803f
commit a6b51ec19a
31 changed files with 1286 additions and 30 deletions

View File

@ -55,6 +55,10 @@ under the License.
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -21,6 +21,7 @@ package org.apache.doris.common.jni;
import org.apache.doris.common.jni.vec.ColumnType;
import org.apache.doris.common.jni.vec.ColumnValue;
import org.apache.doris.common.jni.vec.ScanPredicate;
import org.apache.doris.common.jni.vec.TableSchema;
import org.apache.doris.common.jni.vec.VectorTable;
import java.io.IOException;
@ -43,6 +44,9 @@ public abstract class JniScanner {
// Scan data and save as vector table
protected abstract int getNext() throws IOException;
// Parse the table schema for this scanner.
// The returned TableSchema is what getTableSchema() turns into a string.
// NOTE(review): presumably implementations that cannot describe a schema are
// expected to throw UnsupportedOperationException - confirm against the scanners.
protected abstract TableSchema parseTableSchema() throws UnsupportedOperationException;
protected void initTableInfo(ColumnType[] requiredTypes, String[] requiredFields, ScanPredicate[] predicates,
int batchSize) {
this.types = requiredTypes;
@ -63,6 +67,11 @@ public abstract class JniScanner {
return vectorTable;
}
/**
 * Returns the string form of the schema produced by {@link #parseTableSchema()}.
 *
 * @return the result of {@code TableSchema.getTableSchema()} for the parsed schema
 * @throws IOException if the scanner provides no schema, or if rendering the schema fails
 */
public String getTableSchema() throws IOException {
    TableSchema tableSchema = parseTableSchema();
    if (tableSchema == null) {
        // A scanner may return null from parseTableSchema(); report that with a
        // descriptive error instead of a bare NullPointerException.
        throw new IOException(getClass().getName() + " did not provide a table schema");
    }
    return tableSchema.getTableSchema();
}
public long getNextBatchMeta() throws IOException {
if (vectorTable == null) {
vectorTable = new VectorTable(types, fields, predicates, batchSize);
@ -95,7 +104,7 @@ public abstract class JniScanner {
return vectorTable.getMetaAddress();
}
protected void resetTable() {
public void resetTable() {
if (vectorTable != null) {
vectorTable.reset();
}
@ -105,7 +114,7 @@ public abstract class JniScanner {
vectorTable.releaseColumn(fieldId);
}
protected void releaseTable() {
public void releaseTable() {
if (vectorTable != null) {
vectorTable.close();
}

View File

@ -21,6 +21,7 @@ package org.apache.doris.common.jni;
import org.apache.doris.common.jni.vec.ColumnType;
import org.apache.doris.common.jni.vec.ColumnValue;
import org.apache.doris.common.jni.vec.ScanPredicate;
import org.apache.doris.common.jni.vec.TableSchema;
import org.apache.log4j.Logger;
@ -143,7 +144,7 @@ public class MockJniScanner extends JniScanner {
private static final Logger LOG = Logger.getLogger(MockJniScanner.class);
private final int mockRows;
private int mockRows;
private int readRows = 0;
private final MockColumnValue columnValue = new MockColumnValue();
@ -195,4 +196,9 @@ public class MockJniScanner extends JniScanner {
readRows += rows;
return rows;
}
// Always returns null: this mock scanner does not build a TableSchema.
@Override
protected TableSchema parseTableSchema() throws UnsupportedOperationException {
return null;
}
}

View File

@ -0,0 +1,83 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.common.jni.vec;
import org.apache.doris.thrift.TPrimitiveType;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.util.List;
/**
 * Holds the column layout of a file read through a table-valued function
 * (for example an Avro file) and renders that layout as a JSON string.
 */
public class TableSchema {
    /**
     * Shared JSON serializer. It is created once and never reconfigured, so every
     * {@code TableSchema} reuses it instead of building a new mapper per instance.
     */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private final List<SchemaColumn> schemaColumns;

    /**
     * Creates a schema over the given columns.
     *
     * @param schemaColumns the columns to serialize; the list is stored as-is, not copied
     */
    public TableSchema(List<SchemaColumn> schemaColumns) {
        this.schemaColumns = schemaColumns;
    }

    /**
     * Serializes the column list to JSON.
     *
     * @return the JSON text produced by Jackson for the column list
     * @throws IOException wrapping the {@link JsonProcessingException} if serialization fails
     */
    public String getTableSchema() throws IOException {
        try {
            return OBJECT_MAPPER.writeValueAsString(schemaColumns);
        } catch (JsonProcessingException e) {
            throw new IOException(e);
        }
    }

    /**
     * One column of the schema: a name, a type code and an optional child column.
     * NOTE(review): the child column is presumably meant for nested/complex types - confirm.
     */
    public static class SchemaColumn {
        private String name;
        // Integer value of the TPrimitiveType passed to setType().
        private int type;
        private SchemaColumn childColumn;

        public SchemaColumn() {
        }

        /** Returns the column name. */
        public String getName() {
            return name;
        }

        /** Returns the child column, or {@code null} if none has been set. */
        public SchemaColumn getChildColumn() {
            return childColumn;
        }

        /** Returns the integer type code stored by {@link #setType(TPrimitiveType)}. */
        public int getType() {
            return type;
        }

        /** Sets the column name. */
        public void setName(String name) {
            this.name = name;
        }

        /**
         * Stores the type as its integer value ({@code type.getValue()}).
         *
         * @param type the primitive type of this column; must not be {@code null}
         */
        public void setType(TPrimitiveType type) {
            this.type = type.getValue();
        }

        /**
         * Sets the child column. Despite the name, this replaces any child set earlier;
         * a column holds at most one child.
         */
        public void addChildColumn(SchemaColumn childColumn) {
            this.childColumn = childColumn;
        }
    }
}