[feature-wip](multi-catalog)(step2) support reading MaxCompute data by JNI (#19819)

Issue Number: #19679
This commit is contained in:
slothever
2023-06-05 22:10:08 +08:00
committed by GitHub
parent e576d533b2
commit b7fc17da68
28 changed files with 931 additions and 78 deletions

View File

@ -41,6 +41,11 @@ public class HudiColumnValue implements ColumnValue {
return ((PrimitiveObjectInspector) fieldInspector).getPrimitiveJavaObject(fieldData);
}
@Override
public boolean isNull() {
return false;
}
@Override
public boolean getBoolean() {
return (boolean) inspectObject();

View File

@ -72,6 +72,7 @@ public abstract class JniScanner {
throw e;
}
if (numRows == 0) {
releaseTable();
return 0;
}
return getMetaAddress(numRows);
@ -83,7 +84,9 @@ public abstract class JniScanner {
}
protected void resetTable() {
    // Guard against a null table: resetTable() may be invoked before the
    // vector table has been created (e.g. when open() returned early), so
    // an unconditional vectorTable.reset() would throw NPE.
    if (vectorTable != null) {
        vectorTable.reset();
    }
}
protected void releaseColumn(int fieldId) {

View File

@ -0,0 +1,251 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.jni;
import org.apache.doris.jni.vec.ColumnType;
import org.apache.doris.jni.vec.MaxComputeColumnValue;
import org.apache.doris.jni.vec.ScanPredicate;
import com.aliyun.odps.Column;
import com.aliyun.odps.Odps;
import com.aliyun.odps.OdpsType;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.data.ArrowRecordReader;
import com.aliyun.odps.tunnel.TableTunnel;
import com.aliyun.odps.type.TypeInfo;
import com.aliyun.odps.type.TypeInfoFactory;
import com.google.common.base.Strings;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
 * MaxCompute JniScanner. BE will read data from the scanner object.
 */
public class MaxComputeJniScanner extends JniScanner {
    private static final Logger LOG = Logger.getLogger(MaxComputeJniScanner.class);
    // "{}" is substituted with the MaxCompute region at construction time.
    private static final String odpsUrlTemplate = "http://service.{}.maxcompute.aliyun.com/api";
    private static final String tunnelUrlTemplate = "http://dt.{}.maxcompute.aliyun-inc.com";
    // Keys of the BE-supplied parameter map.
    private static final String REGION = "region";
    private static final String PROJECT = "project";
    private static final String TABLE = "table";
    private static final String ACCESS_KEY = "access_key";
    private static final String SECRET_KEY = "secret_key";
    private static final String START_OFFSET = "start_offset";
    private static final String SPLIT_SIZE = "split_size";
    private static final String PUBLIC_ACCESS = "public_access";
    private Odps odps;
    private TableTunnel tunnel;
    private final String project;
    private final String table;
    private MaxComputeColumnValue columnValue;
    // Rows of the current split that have not been handed to BE yet.
    private long remainBatchRows = 0;
    private long totalRows = 0;
    private TableTunnel.DownloadSession session;
    private ArrowRecordReader curReader;
    private List<Column> columns;
    // Maps a column name to its slot index in the output vector table.
    private Map<String, Integer> readColumnsId;
    // -1 means "no explicit split": read the whole table from row 0.
    private long startOffset = -1L;
    private long splitSize = -1L;

    /**
     * Builds a scanner from BE-supplied parameters.
     *
     * @param batchSize max rows returned per getNext() call
     * @param params    required: region/project/table/access_key/secret_key,
     *                  required_fields, columns_types; optional: start_offset,
     *                  split_size, public_access, push_down_predicates
     */
    public MaxComputeJniScanner(int batchSize, Map<String, String> params) {
        String region = Objects.requireNonNull(params.get(REGION), "required property '" + REGION + "'.");
        project = Objects.requireNonNull(params.get(PROJECT), "required property '" + PROJECT + "'.");
        table = Objects.requireNonNull(params.get(TABLE), "required property '" + TABLE + "'.");
        // Offset and size only take effect when both are present.
        if (!Strings.isNullOrEmpty(params.get(START_OFFSET))
                && !Strings.isNullOrEmpty(params.get(SPLIT_SIZE))) {
            startOffset = Long.parseLong(params.get(START_OFFSET));
            splitSize = Long.parseLong(params.get(SPLIT_SIZE));
        }
        String accessKey = Objects.requireNonNull(params.get(ACCESS_KEY), "required property '" + ACCESS_KEY + "'.");
        String secretKey = Objects.requireNonNull(params.get(SECRET_KEY), "required property '" + SECRET_KEY + "'.");
        odps = new Odps(new AliyunAccount(accessKey, secretKey));
        odps.setEndpoint(odpsUrlTemplate.replace("{}", region));
        odps.setDefaultProject(project);
        tunnel = new TableTunnel(odps);
        String tunnelUrl = tunnelUrlTemplate.replace("{}", region);
        boolean enablePublicAccess = Boolean.parseBoolean(params.getOrDefault(PUBLIC_ACCESS, "false"));
        if (enablePublicAccess) {
            // Public endpoint differs from the internal one only by the "-inc" suffix.
            // Derive it from the already region-substituted URL; rebuilding from the raw
            // template would leave the "{}" placeholder in the endpoint.
            tunnelUrl = tunnelUrl.replace("-inc", "");
        }
        tunnel.setEndpoint(tunnelUrl);
        String[] requiredFields = params.get("required_fields").split(",");
        String[] types = params.get("columns_types").split("#");
        ColumnType[] columnTypes = new ColumnType[types.length];
        for (int i = 0; i < types.length; i++) {
            columnTypes[i] = ColumnType.parseType(requiredFields[i], types[i]);
        }
        ScanPredicate[] predicates = new ScanPredicate[0];
        if (params.containsKey("push_down_predicates")) {
            long predicatesAddress = Long.parseLong(params.get("push_down_predicates"));
            if (predicatesAddress != 0) {
                predicates = ScanPredicate.parseScanPredicates(predicatesAddress, columnTypes);
                LOG.info("MaxComputeJniScanner gets pushed-down predicates: " + ScanPredicate.dump(predicates));
            }
        }
        initTableInfo(columnTypes, requiredFields, predicates, batchSize);
    }

    @Override
    protected void initTableInfo(ColumnType[] requiredTypes, String[] requiredFields, ScanPredicate[] predicates,
                                 int batchSize) {
        super.initTableInfo(requiredTypes, requiredFields, predicates, batchSize);
        columns = new ArrayList<>();
        readColumnsId = new HashMap<>();
        for (int i = 0; i < fields.length; i++) {
            if (!Strings.isNullOrEmpty(fields[i])) {
                columns.add(createOdpsColumn(i, types[i]));
                readColumnsId.put(fields[i], i);
            }
        }
        // reorder columns
        List<Column> columnList = odps.tables().get(table).getSchema().getColumns();
        Map<String, Integer> columnRank = new HashMap<>();
        for (int i = 0; i < columnList.size(); i++) {
            columnRank.put(columnList.get(i).getName(), i);
        }
        // Downloading columns data from Max compute only supports the order of table metadata.
        // We might get an error message if no sort here: Column reorder is not supported in legacy arrow mode.
        columns.sort((Comparator.comparing(o -> columnRank.get(o.getName()))));
    }

    /**
     * Opens a tunnel download session and positions the Arrow reader on this
     * scanner's split ([startOffset, startOffset + totalRows)).
     *
     * @throws IOException wrapping any tunnel/session failure
     */
    @Override
    public void open() throws IOException {
        if (columns.isEmpty()) {
            // Nothing to project; getNext() will return 0 since curReader stays null.
            return;
        }
        try {
            session = tunnel.createDownloadSession(project, table);
            if (splitSize > 0) {
                // Never read past the end of the table even if the split says so.
                totalRows = Math.min(splitSize, session.getRecordCount());
            } else {
                totalRows = session.getRecordCount();
            }
            long start = startOffset == -1L ? 0 : startOffset;
            curReader = session.openArrowRecordReader(start, totalRows, columns);
        } catch (Exception e) {
            throw new IOException(e);
        }
        remainBatchRows = totalRows;
    }

    /** Maps a Doris column type to the corresponding ODPS column definition. */
    private Column createOdpsColumn(int colIdx, ColumnType dorisType) {
        TypeInfo odpsType;
        switch (dorisType.getType()) {
            case BOOLEAN:
                odpsType = TypeInfoFactory.BOOLEAN;
                break;
            case TINYINT:
                odpsType = TypeInfoFactory.TINYINT;
                break;
            case SMALLINT:
                odpsType = TypeInfoFactory.SMALLINT;
                break;
            case INT:
                odpsType = TypeInfoFactory.INT;
                break;
            case BIGINT:
                odpsType = TypeInfoFactory.BIGINT;
                break;
            case DECIMAL32:
            case DECIMAL64:
            case DECIMAL128:
            case DECIMALV2:
                odpsType = TypeInfoFactory.getDecimalTypeInfo(dorisType.getPrecision(), dorisType.getScale());
                break;
            case FLOAT:
                odpsType = TypeInfoFactory.FLOAT;
                break;
            case DOUBLE:
                odpsType = TypeInfoFactory.DOUBLE;
                break;
            case DATETIMEV2:
                odpsType = TypeInfoFactory.DATETIME;
                break;
            case DATEV2:
                odpsType = TypeInfoFactory.DATE;
                break;
            case CHAR:
                odpsType = TypeInfoFactory.getCharTypeInfo(dorisType.getLength());
                break;
            case VARCHAR:
                odpsType = TypeInfoFactory.getVarcharTypeInfo(dorisType.getLength());
                break;
            case STRING:
                odpsType = TypeInfoFactory.getPrimitiveTypeInfo(OdpsType.STRING);
                break;
            default:
                throw new RuntimeException("Unsupported transform for column type: " + dorisType.getType());
        }
        return new Column(fields[colIdx], odpsType);
    }

    /** Resets split bookkeeping and closes the Arrow reader (idempotent). */
    @Override
    public void close() throws IOException {
        remainBatchRows = 0;
        totalRows = 0;
        startOffset = -1;
        splitSize = -1;
        if (curReader != null) {
            curReader.close();
            // Drop the reference so a repeated close() (or a late getNext())
            // does not touch an already-closed reader.
            curReader = null;
        }
    }

    /**
     * Fills the vector table with the next batch.
     *
     * @return number of rows appended; 0 means end of the split
     */
    @Override
    protected int getNext() throws IOException {
        if (curReader == null) {
            return 0;
        }
        columnValue = new MaxComputeColumnValue();
        int expectedRows = (int) Math.min(batchSize, remainBatchRows);
        int realRows = readVectors(expectedRows);
        if (remainBatchRows <= 0) {
            // Split exhausted before this call; nothing left to report.
            return 0;
        }
        remainBatchRows -= realRows;
        return realRows;
    }

    /** Reads Arrow batches until expectedRows rows (or end of data) are appended. */
    private int readVectors(int expectedRows) throws IOException {
        VectorSchemaRoot batch;
        int curReadRows = 0;
        while (curReadRows < expectedRows && (batch = curReader.read()) != null) {
            List<FieldVector> fieldVectors = batch.getFieldVectors();
            int batchRows = 0;
            for (FieldVector column : fieldVectors) {
                columnValue.reset(column);
                // All vectors in one Arrow batch carry the same row count,
                // so taking the last column's valueCount below is safe.
                batchRows = column.getValueCount();
                for (int j = 0; j < batchRows; j++) {
                    appendData(readColumnsId.get(column.getName()), columnValue);
                }
            }
            curReadRows += batchRows;
        }
        return curReadRows;
    }
}

View File

@ -49,6 +49,11 @@ public class MockJniScanner extends JniScanner {
this.j = j;
}
@Override
public boolean isNull() {
return false;
}
@Override
public boolean getBoolean() {
return (i + j) % 2 == 0;

View File

@ -27,38 +27,40 @@ import java.util.List;
* Column value in vector column
*/
public interface ColumnValue {
    // The diff rendering duplicated each declaration (old "public"-modified
    // form plus the new form); an interface cannot declare a method twice,
    // so only the new modifier-free declarations are kept. Interface members
    // are implicitly public.

    /** Whether the current value is SQL NULL. */
    boolean isNull();

    boolean getBoolean();

    // tinyint
    byte getByte();

    // smallint
    short getShort();

    int getInt();

    float getFloat();

    // bigint
    long getLong();

    double getDouble();

    BigInteger getBigInteger();

    BigDecimal getDecimal();

    String getString();

    LocalDate getDate();

    LocalDateTime getDateTime();

    byte[] getBytes();

    void unpackArray(List<ColumnValue> values);

    void unpackMap(List<ColumnValue> keys, List<ColumnValue> values);

    void unpackStruct(List<Integer> structFieldIndex, List<ColumnValue> values);
}

View File

@ -0,0 +1,185 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.jni.vec;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.DateDayVector;
import org.apache.arrow.vector.DateMilliVector;
import org.apache.arrow.vector.DecimalVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.Float4Vector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.SmallIntVector;
import org.apache.arrow.vector.TinyIntVector;
import org.apache.arrow.vector.VarBinaryVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.util.DecimalUtility;
import org.apache.log4j.Logger;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.List;
/**
 * MaxCompute Column value in vector column.
 *
 * <p>Wraps one Arrow {@link FieldVector} and walks it with an internal cursor:
 * each getter returns the value at the cursor and advances it by one. The
 * getters are typed by the caller according to the column's declared type.
 * Not thread-safe.
 */
public class MaxComputeColumnValue implements ColumnValue {
    private static final Logger LOG = Logger.getLogger(MaxComputeColumnValue.class);
    // Cursor into the current vector; advanced by every getter.
    private int idx;
    private FieldVector column;

    public MaxComputeColumnValue() {
        idx = 0;
    }

    /** Points this wrapper at a new vector and rewinds the cursor. */
    public void reset(FieldVector column) {
        this.column = column;
        this.idx = 0;
    }

    @Override
    public boolean isNull() {
        return column.isNull(idx);
    }

    private void skippedIfNull() {
        // null has been processed by appendValue with isNull(), so the typed
        // getter must step over it instead of decoding it.
        try {
            if (column.isNull(idx)) {
                idx++;
            }
        } catch (IndexOutOfBoundsException e) {
            // skip left rows
            idx++;
        }
    }

    @Override
    public boolean getBoolean() {
        skippedIfNull();
        // NOTE(review): booleans appear to arrive as tinyint vectors here —
        // confirm against the tunnel's Arrow schema.
        TinyIntVector tinyIntCol = (TinyIntVector) column;
        return tinyIntCol.get(idx++) > 0;
    }

    @Override
    public byte getByte() {
        skippedIfNull();
        TinyIntVector tinyIntCol = (TinyIntVector) column;
        return tinyIntCol.get(idx++);
    }

    @Override
    public short getShort() {
        skippedIfNull();
        SmallIntVector smallIntCol = (SmallIntVector) column;
        return smallIntCol.get(idx++);
    }

    @Override
    public int getInt() {
        skippedIfNull();
        IntVector intCol = (IntVector) column;
        return intCol.get(idx++);
    }

    @Override
    public float getFloat() {
        skippedIfNull();
        Float4Vector floatCol = (Float4Vector) column;
        return floatCol.get(idx++);
    }

    @Override
    public long getLong() {
        skippedIfNull();
        BigIntVector longCol = (BigIntVector) column;
        return longCol.get(idx++);
    }

    @Override
    public double getDouble() {
        skippedIfNull();
        Float8Vector doubleCol = (Float8Vector) column;
        return doubleCol.get(idx++);
    }

    @Override
    public BigInteger getBigInteger() {
        skippedIfNull();
        BigIntVector longCol = (BigIntVector) column;
        return BigInteger.valueOf(longCol.get(idx++));
    }

    @Override
    public BigDecimal getDecimal() {
        skippedIfNull();
        DecimalVector decimalCol = (DecimalVector) column;
        return DecimalUtility.getBigDecimalFromArrowBuf(column.getDataBuffer(), idx++,
                decimalCol.getScale(), DecimalVector.TYPE_WIDTH);
    }

    @Override
    public String getString() {
        skippedIfNull();
        VarCharVector varcharCol = (VarCharVector) column;
        // getObject() may return null for a null cell; check before calling
        // toString(), otherwise this throws NPE and the null check is dead code.
        Object v = varcharCol.getObject(idx++);
        return v == null ? "" : v.toString();
    }

    @Override
    public LocalDate getDate() {
        skippedIfNull();
        DateDayVector dateCol = (DateDayVector) column;
        // DateDayVector stores days since the Unix epoch.
        Integer intVal = dateCol.getObject(idx++);
        return LocalDate.ofEpochDay(intVal == null ? 0 : intVal);
    }

    @Override
    public LocalDateTime getDateTime() {
        skippedIfNull();
        DateMilliVector datetimeCol = (DateMilliVector) column;
        LocalDateTime v = datetimeCol.getObject(idx++);
        return v == null ? LocalDateTime.MIN : v;
    }

    @Override
    public byte[] getBytes() {
        skippedIfNull();
        VarBinaryVector binaryCol = (VarBinaryVector) column;
        byte[] v = binaryCol.getObject(idx++);
        return v == null ? new byte[0] : v;
    }

    @Override
    public void unpackArray(List<ColumnValue> values) {
        // Complex types are not supported by the MaxCompute scanner yet.
    }

    @Override
    public void unpackMap(List<ColumnValue> keys, List<ColumnValue> values) {
        // Complex types are not supported by the MaxCompute scanner yet.
    }

    @Override
    public void unpackStruct(List<Integer> structFieldIndex, List<ColumnValue> values) {
        // Complex types are not supported by the MaxCompute scanner yet.
    }
}

View File

@ -122,6 +122,11 @@ public class ScanPredicate {
return inspectObject().toString();
}
@Override
public boolean isNull() {
return false;
}
@Override
public boolean getBoolean() {
return (boolean) inspectObject();

View File

@ -551,7 +551,7 @@ public class VectorColumn {
public void appendValue(ColumnValue o) {
ColumnType.Type typeValue = columnType.getType();
if (o == null) {
if (o == null || o.isNull()) {
appendNull(typeValue);
return;
}