[feature-wip](multi-catalog)(step2)support read max compute data by JNI (#19819)
Issue Number: #19679
This commit is contained in:
@ -41,6 +41,11 @@ public class HudiColumnValue implements ColumnValue {
|
||||
return ((PrimitiveObjectInspector) fieldInspector).getPrimitiveJavaObject(fieldData);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNull() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean getBoolean() {
|
||||
return (boolean) inspectObject();
|
||||
|
||||
@ -72,6 +72,7 @@ public abstract class JniScanner {
|
||||
throw e;
|
||||
}
|
||||
if (numRows == 0) {
|
||||
releaseTable();
|
||||
return 0;
|
||||
}
|
||||
return getMetaAddress(numRows);
|
||||
@ -83,7 +84,9 @@ public abstract class JniScanner {
|
||||
}
|
||||
|
||||
protected void resetTable() {
|
||||
vectorTable.reset();
|
||||
if (vectorTable != null) {
|
||||
vectorTable.reset();
|
||||
}
|
||||
}
|
||||
|
||||
protected void releaseColumn(int fieldId) {
|
||||
|
||||
@ -0,0 +1,251 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package org.apache.doris.jni;
|
||||
|
||||
import org.apache.doris.jni.vec.ColumnType;
|
||||
import org.apache.doris.jni.vec.MaxComputeColumnValue;
|
||||
import org.apache.doris.jni.vec.ScanPredicate;
|
||||
|
||||
import com.aliyun.odps.Column;
|
||||
import com.aliyun.odps.Odps;
|
||||
import com.aliyun.odps.OdpsType;
|
||||
import com.aliyun.odps.account.AliyunAccount;
|
||||
import com.aliyun.odps.data.ArrowRecordReader;
|
||||
import com.aliyun.odps.tunnel.TableTunnel;
|
||||
import com.aliyun.odps.type.TypeInfo;
|
||||
import com.aliyun.odps.type.TypeInfoFactory;
|
||||
import com.google.common.base.Strings;
|
||||
import org.apache.arrow.vector.FieldVector;
|
||||
import org.apache.arrow.vector.VectorSchemaRoot;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* MaxComputeJ JniScanner. BE will read data from the scanner object.
|
||||
*/
|
||||
public class MaxComputeJniScanner extends JniScanner {
|
||||
private Odps odps;
|
||||
private TableTunnel tunnel;
|
||||
|
||||
private static final Logger LOG = Logger.getLogger(MaxComputeJniScanner.class);
|
||||
private static final String odpsUrlTemplate = "http://service.{}.maxcompute.aliyun.com/api";
|
||||
private static final String tunnelUrlTemplate = "http://dt.{}.maxcompute.aliyun-inc.com";
|
||||
private static final String REGION = "region";
|
||||
private static final String PROJECT = "project";
|
||||
private static final String TABLE = "table";
|
||||
private static final String ACCESS_KEY = "access_key";
|
||||
private static final String SECRET_KEY = "secret_key";
|
||||
private static final String START_OFFSET = "start_offset";
|
||||
private static final String SPLIT_SIZE = "split_size";
|
||||
private static final String PUBLIC_ACCESS = "public_access";
|
||||
private final String project;
|
||||
private final String table;
|
||||
private MaxComputeColumnValue columnValue;
|
||||
private long remainBatchRows = 0;
|
||||
private long totalRows = 0;
|
||||
private TableTunnel.DownloadSession session;
|
||||
private ArrowRecordReader curReader;
|
||||
private List<Column> columns;
|
||||
private Map<String, Integer> readColumnsId;
|
||||
private long startOffset = -1L;
|
||||
private long splitSize = -1L;
|
||||
|
||||
public MaxComputeJniScanner(int batchSize, Map<String, String> params) {
|
||||
String region = Objects.requireNonNull(params.get(REGION), "required property '" + REGION + "'.");
|
||||
project = Objects.requireNonNull(params.get(PROJECT), "required property '" + PROJECT + "'.");
|
||||
table = Objects.requireNonNull(params.get(TABLE), "required property '" + TABLE + "'.");
|
||||
if (!Strings.isNullOrEmpty(params.get(START_OFFSET))
|
||||
&& !Strings.isNullOrEmpty(params.get(SPLIT_SIZE))) {
|
||||
startOffset = Long.parseLong(params.get(START_OFFSET));
|
||||
splitSize = Long.parseLong(params.get(SPLIT_SIZE));
|
||||
}
|
||||
String accessKey = Objects.requireNonNull(params.get(ACCESS_KEY), "required property '" + ACCESS_KEY + "'.");
|
||||
String secretKey = Objects.requireNonNull(params.get(SECRET_KEY), "required property '" + SECRET_KEY + "'.");
|
||||
odps = new Odps(new AliyunAccount(accessKey, secretKey));
|
||||
odps.setEndpoint(odpsUrlTemplate.replace("{}", region));
|
||||
odps.setDefaultProject(project);
|
||||
tunnel = new TableTunnel(odps);
|
||||
String tunnelUrl = tunnelUrlTemplate.replace("{}", region);
|
||||
boolean enablePublicAccess = Boolean.parseBoolean(params.getOrDefault(PUBLIC_ACCESS, "false"));
|
||||
if (enablePublicAccess) {
|
||||
tunnelUrl = tunnelUrlTemplate.replace("-inc", "");
|
||||
}
|
||||
tunnel.setEndpoint(tunnelUrl);
|
||||
String[] requiredFields = params.get("required_fields").split(",");
|
||||
String[] types = params.get("columns_types").split("#");
|
||||
ColumnType[] columnTypes = new ColumnType[types.length];
|
||||
for (int i = 0; i < types.length; i++) {
|
||||
columnTypes[i] = ColumnType.parseType(requiredFields[i], types[i]);
|
||||
}
|
||||
ScanPredicate[] predicates = new ScanPredicate[0];
|
||||
if (params.containsKey("push_down_predicates")) {
|
||||
long predicatesAddress = Long.parseLong(params.get("push_down_predicates"));
|
||||
if (predicatesAddress != 0) {
|
||||
predicates = ScanPredicate.parseScanPredicates(predicatesAddress, columnTypes);
|
||||
LOG.info("MaxComputeJniScanner gets pushed-down predicates: " + ScanPredicate.dump(predicates));
|
||||
}
|
||||
}
|
||||
initTableInfo(columnTypes, requiredFields, predicates, batchSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void initTableInfo(ColumnType[] requiredTypes, String[] requiredFields, ScanPredicate[] predicates,
|
||||
int batchSize) {
|
||||
super.initTableInfo(requiredTypes, requiredFields, predicates, batchSize);
|
||||
columns = new ArrayList<>();
|
||||
readColumnsId = new HashMap<>();
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
if (!Strings.isNullOrEmpty(fields[i])) {
|
||||
columns.add(createOdpsColumn(i, types[i]));
|
||||
readColumnsId.put(fields[i], i);
|
||||
}
|
||||
}
|
||||
// reorder columns
|
||||
List<Column> columnList = odps.tables().get(table).getSchema().getColumns();
|
||||
Map<String, Integer> columnRank = new HashMap<>();
|
||||
for (int i = 0; i < columnList.size(); i++) {
|
||||
columnRank.put(columnList.get(i).getName(), i);
|
||||
}
|
||||
// Downloading columns data from Max compute only supports the order of table metadata.
|
||||
// We might get an error message if no sort here: Column reorder is not supported in legacy arrow mode.
|
||||
columns.sort((Comparator.comparing(o -> columnRank.get(o.getName()))));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void open() throws IOException {
|
||||
if (columns.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
session = tunnel.createDownloadSession(project, table);
|
||||
if (splitSize > 0) {
|
||||
totalRows = Math.min(splitSize, session.getRecordCount());
|
||||
} else {
|
||||
totalRows = session.getRecordCount();
|
||||
}
|
||||
long start = startOffset == -1L ? 0 : startOffset;
|
||||
curReader = session.openArrowRecordReader(start, totalRows, columns);
|
||||
} catch (Exception e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
remainBatchRows = totalRows;
|
||||
}
|
||||
|
||||
private Column createOdpsColumn(int colIdx, ColumnType dorisType) {
|
||||
TypeInfo odpsType;
|
||||
switch (dorisType.getType()) {
|
||||
case BOOLEAN:
|
||||
odpsType = TypeInfoFactory.BOOLEAN;
|
||||
break;
|
||||
case TINYINT:
|
||||
odpsType = TypeInfoFactory.TINYINT;
|
||||
break;
|
||||
case SMALLINT:
|
||||
odpsType = TypeInfoFactory.SMALLINT;
|
||||
break;
|
||||
case INT:
|
||||
odpsType = TypeInfoFactory.INT;
|
||||
break;
|
||||
case BIGINT:
|
||||
odpsType = TypeInfoFactory.BIGINT;
|
||||
break;
|
||||
case DECIMAL32:
|
||||
case DECIMAL64:
|
||||
case DECIMAL128:
|
||||
case DECIMALV2:
|
||||
odpsType = TypeInfoFactory.getDecimalTypeInfo(dorisType.getPrecision(), dorisType.getScale());
|
||||
break;
|
||||
case FLOAT:
|
||||
odpsType = TypeInfoFactory.FLOAT;
|
||||
break;
|
||||
case DOUBLE:
|
||||
odpsType = TypeInfoFactory.DOUBLE;
|
||||
break;
|
||||
case DATETIMEV2:
|
||||
odpsType = TypeInfoFactory.DATETIME;
|
||||
break;
|
||||
case DATEV2:
|
||||
odpsType = TypeInfoFactory.DATE;
|
||||
break;
|
||||
case CHAR:
|
||||
odpsType = TypeInfoFactory.getCharTypeInfo(dorisType.getLength());
|
||||
break;
|
||||
case VARCHAR:
|
||||
odpsType = TypeInfoFactory.getVarcharTypeInfo(dorisType.getLength());
|
||||
break;
|
||||
case STRING:
|
||||
odpsType = TypeInfoFactory.getPrimitiveTypeInfo(OdpsType.STRING);
|
||||
break;
|
||||
default:
|
||||
throw new RuntimeException("Unsupported transform for column type: " + dorisType.getType());
|
||||
}
|
||||
return new Column(fields[colIdx], odpsType);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
remainBatchRows = 0;
|
||||
totalRows = 0;
|
||||
startOffset = -1;
|
||||
splitSize = -1;
|
||||
if (curReader != null) {
|
||||
curReader.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int getNext() throws IOException {
|
||||
if (curReader == null) {
|
||||
return 0;
|
||||
}
|
||||
columnValue = new MaxComputeColumnValue();
|
||||
int expectedRows = (int) Math.min(batchSize, remainBatchRows);
|
||||
int realRows = readVectors(expectedRows);
|
||||
if (remainBatchRows <= 0) {
|
||||
return 0;
|
||||
}
|
||||
remainBatchRows -= realRows;
|
||||
return realRows;
|
||||
}
|
||||
|
||||
private int readVectors(int expectedRows) throws IOException {
|
||||
VectorSchemaRoot batch;
|
||||
int curReadRows = 0;
|
||||
while (curReadRows < expectedRows && (batch = curReader.read()) != null) {
|
||||
List<FieldVector> fieldVectors = batch.getFieldVectors();
|
||||
int batchRows = 0;
|
||||
for (FieldVector column : fieldVectors) {
|
||||
columnValue.reset(column);
|
||||
// LOG.warn("MCJNI read getClass: " + column.getClass());
|
||||
batchRows = column.getValueCount();
|
||||
for (int j = 0; j < batchRows; j++) {
|
||||
appendData(readColumnsId.get(column.getName()), columnValue);
|
||||
}
|
||||
}
|
||||
curReadRows += batchRows;
|
||||
}
|
||||
return curReadRows;
|
||||
}
|
||||
}
|
||||
@ -49,6 +49,11 @@ public class MockJniScanner extends JniScanner {
|
||||
this.j = j;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNull() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean getBoolean() {
|
||||
return (i + j) % 2 == 0;
|
||||
|
||||
@ -27,38 +27,40 @@ import java.util.List;
|
||||
* Column value in vector column
|
||||
*/
|
||||
public interface ColumnValue {
    // The diff artifact left duplicate declarations (old `public boolean getBoolean();`
    // next to the new `boolean getBoolean();` etc.); this is the de-duplicated
    // interface. `public` is redundant on interface members and is omitted.

    // True when the value at the current position is SQL NULL.
    boolean isNull();

    boolean getBoolean();

    // tinyint
    byte getByte();

    // smallint
    short getShort();

    int getInt();

    float getFloat();

    // bigint
    long getLong();

    double getDouble();

    BigInteger getBigInteger();

    BigDecimal getDecimal();

    String getString();

    LocalDate getDate();

    LocalDateTime getDateTime();

    byte[] getBytes();

    // Explodes a complex value into per-element column values.
    void unpackArray(List<ColumnValue> values);

    void unpackMap(List<ColumnValue> keys, List<ColumnValue> values);

    void unpackStruct(List<Integer> structFieldIndex, List<ColumnValue> values);
}
|
||||
|
||||
@ -0,0 +1,185 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package org.apache.doris.jni.vec;
|
||||
|
||||
import org.apache.arrow.vector.BigIntVector;
|
||||
import org.apache.arrow.vector.DateDayVector;
|
||||
import org.apache.arrow.vector.DateMilliVector;
|
||||
import org.apache.arrow.vector.DecimalVector;
|
||||
import org.apache.arrow.vector.FieldVector;
|
||||
import org.apache.arrow.vector.Float4Vector;
|
||||
import org.apache.arrow.vector.Float8Vector;
|
||||
import org.apache.arrow.vector.IntVector;
|
||||
import org.apache.arrow.vector.SmallIntVector;
|
||||
import org.apache.arrow.vector.TinyIntVector;
|
||||
import org.apache.arrow.vector.VarBinaryVector;
|
||||
import org.apache.arrow.vector.VarCharVector;
|
||||
import org.apache.arrow.vector.util.DecimalUtility;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.math.BigInteger;
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* MaxCompute Column value in vector column
|
||||
*/
|
||||
public class MaxComputeColumnValue implements ColumnValue {
|
||||
private static final Logger LOG = Logger.getLogger(MaxComputeColumnValue.class);
|
||||
private int idx;
|
||||
private FieldVector column;
|
||||
|
||||
public MaxComputeColumnValue() {
|
||||
idx = 0;
|
||||
}
|
||||
|
||||
public void reset(FieldVector column) {
|
||||
this.column = column;
|
||||
this.idx = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNull() {
|
||||
return column.isNull(idx);
|
||||
}
|
||||
|
||||
private void skippedIfNull() {
|
||||
// null has been process by appendValue with isNull()
|
||||
try {
|
||||
if (column.isNull(idx)) {
|
||||
idx++;
|
||||
}
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
// skip left rows
|
||||
idx++;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean getBoolean() {
|
||||
skippedIfNull();
|
||||
TinyIntVector tinyIntCol = (TinyIntVector) column;
|
||||
return tinyIntCol.get(idx++) > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte getByte() {
|
||||
skippedIfNull();
|
||||
TinyIntVector tinyIntCol = (TinyIntVector) column;
|
||||
return tinyIntCol.get(idx++);
|
||||
}
|
||||
|
||||
@Override
|
||||
public short getShort() {
|
||||
skippedIfNull();
|
||||
SmallIntVector smallIntCol = (SmallIntVector) column;
|
||||
return smallIntCol.get(idx++);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getInt() {
|
||||
skippedIfNull();
|
||||
IntVector intCol = (IntVector) column;
|
||||
return intCol.get(idx++);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getFloat() {
|
||||
skippedIfNull();
|
||||
Float4Vector floatCol = (Float4Vector) column;
|
||||
return floatCol.get(idx++);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getLong() {
|
||||
skippedIfNull();
|
||||
BigIntVector longCol = (BigIntVector) column;
|
||||
return longCol.get(idx++);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getDouble() {
|
||||
skippedIfNull();
|
||||
Float8Vector doubleCol = (Float8Vector) column;
|
||||
return doubleCol.get(idx++);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BigInteger getBigInteger() {
|
||||
skippedIfNull();
|
||||
BigIntVector longCol = (BigIntVector) column;
|
||||
return BigInteger.valueOf(longCol.get(idx++));
|
||||
}
|
||||
|
||||
@Override
|
||||
public BigDecimal getDecimal() {
|
||||
skippedIfNull();
|
||||
DecimalVector decimalCol = (DecimalVector) column;
|
||||
return DecimalUtility.getBigDecimalFromArrowBuf(column.getDataBuffer(), idx++,
|
||||
decimalCol.getScale(), DecimalVector.TYPE_WIDTH);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getString() {
|
||||
skippedIfNull();
|
||||
VarCharVector varcharCol = (VarCharVector) column;
|
||||
String v = varcharCol.getObject(idx++).toString();
|
||||
return v == null ? new String(new byte[0]) : v;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LocalDate getDate() {
|
||||
skippedIfNull();
|
||||
DateDayVector dateCol = (DateDayVector) column;
|
||||
Integer intVal = dateCol.getObject(idx++);
|
||||
return LocalDate.ofEpochDay(intVal == null ? 0 : intVal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public LocalDateTime getDateTime() {
|
||||
skippedIfNull();
|
||||
DateMilliVector datetimeCol = (DateMilliVector) column;
|
||||
LocalDateTime v = datetimeCol.getObject(idx++);
|
||||
return v == null ? LocalDateTime.MIN : v;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getBytes() {
|
||||
skippedIfNull();
|
||||
VarBinaryVector binaryCol = (VarBinaryVector) column;
|
||||
byte[] v = binaryCol.getObject(idx++);
|
||||
return v == null ? new byte[0] : v;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void unpackArray(List<ColumnValue> values) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void unpackMap(List<ColumnValue> keys, List<ColumnValue> values) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void unpackStruct(List<Integer> structFieldIndex, List<ColumnValue> values) {
|
||||
|
||||
}
|
||||
}
|
||||
@ -122,6 +122,11 @@ public class ScanPredicate {
|
||||
return inspectObject().toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNull() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean getBoolean() {
|
||||
return (boolean) inspectObject();
|
||||
|
||||
@ -551,7 +551,7 @@ public class VectorColumn {
|
||||
|
||||
public void appendValue(ColumnValue o) {
|
||||
ColumnType.Type typeValue = columnType.getType();
|
||||
if (o == null) {
|
||||
if (o == null || o.isNull()) {
|
||||
appendNull(typeValue);
|
||||
return;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user