[feature-wip](parquet-vec) Support parquet scanner in vectorized engine (#9433)

@@ -35,6 +35,7 @@ class RuntimeState;
class ExprContext;

namespace vectorized {
class VExprContext;
class IColumn;
using MutableColumnPtr = IColumn::MutablePtr;
} // namespace vectorized

@@ -33,6 +33,7 @@
#include "util/thread.h"
#include "vec/exec/vbroker_scanner.h"
#include "vec/exec/vjson_scanner.h"
#include "vec/exec/vparquet_scanner.h"

namespace doris {

@@ -225,9 +226,15 @@ std::unique_ptr<BaseScanner> BrokerScanNode::create_scanner(const TBrokerScanRan
     BaseScanner* scan = nullptr;
     switch (scan_range.ranges[0].format_type) {
     case TFileFormatType::FORMAT_PARQUET:
-        scan = new ParquetScanner(_runtime_state, runtime_profile(), scan_range.params,
-                                  scan_range.ranges, scan_range.broker_addresses,
-                                  _pre_filter_texprs, counter);
+        if (_vectorized) {
+            scan = new vectorized::VParquetScanner(
+                    _runtime_state, runtime_profile(), scan_range.params, scan_range.ranges,
+                    scan_range.broker_addresses, _pre_filter_texprs, counter);
+        } else {
+            scan = new ParquetScanner(_runtime_state, runtime_profile(), scan_range.params,
+                                      scan_range.ranges, scan_range.broker_addresses,
+                                      _pre_filter_texprs, counter);
+        }
         break;
     case TFileFormatType::FORMAT_ORC:
         scan = new ORCScanner(_runtime_state, runtime_profile(), scan_range.params,
@@ -219,6 +219,22 @@ Status ParquetReaderWrap::read_record_batch(const std::vector<SlotDescriptor*>&
    return Status::OK();
}

Status ParquetReaderWrap::next_batch(std::shared_ptr<arrow::RecordBatch>* batch,
                                     const std::vector<SlotDescriptor*>& tuple_slot_descs,
                                     bool* eof) {
    if (_batch->num_rows() == 0 || _current_line_of_batch != 0 || _current_line_of_group != 0) {
        RETURN_IF_ERROR(read_record_batch(tuple_slot_descs, eof));
    }
    *batch = get_batch();
    return Status::OK();
}

const std::shared_ptr<arrow::RecordBatch>& ParquetReaderWrap::get_batch() {
    _current_line_of_batch += _batch->num_rows();
    _current_line_of_group += _batch->num_rows();
    return _batch;
}

Status ParquetReaderWrap::handle_timestamp(const std::shared_ptr<arrow::TimestampArray>& ts_array,
                                           uint8_t* buf, int32_t* wbytes) {
    const auto type = std::static_pointer_cast<arrow::TimestampType>(ts_array->type());
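For orientation, a minimal sketch of how a caller drives the new ParquetReaderWrap::next_batch; the reader, slot_descs, and consume names are illustrative assumptions, not part of this commit:

    std::shared_ptr<arrow::RecordBatch> batch;
    bool eof = false;
    while (!eof) {
        RETURN_IF_ERROR(reader->next_batch(&batch, slot_descs, &eof));
        if (eof) break;
        // get_batch() has already advanced _current_line_of_batch and
        // _current_line_of_group, so the next call fetches a fresh record
        // batch instead of handing back this one again.
        consume(batch);
    }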
@@ -79,6 +79,8 @@ public:
    Status size(int64_t* size);
    Status init_parquet_reader(const std::vector<SlotDescriptor*>& tuple_slot_descs,
                               const std::string& timezone);
    Status next_batch(std::shared_ptr<arrow::RecordBatch>* batch,
                      const std::vector<SlotDescriptor*>& tuple_slot_descs, bool* eof);

private:
    void fill_slot(Tuple* tuple, SlotDescriptor* slot_desc, MemPool* mem_pool, const uint8_t* value,
@@ -86,6 +88,7 @@ private:
    Status column_indices(const std::vector<SlotDescriptor*>& tuple_slot_descs);
    Status set_field_null(Tuple* tuple, const SlotDescriptor* slot_desc);
    Status read_record_batch(const std::vector<SlotDescriptor*>& tuple_slot_descs, bool* eof);
    const std::shared_ptr<arrow::RecordBatch>& get_batch();
    Status handle_timestamp(const std::shared_ptr<arrow::TimestampArray>& ts_array, uint8_t* buf,
                            int32_t* wbytes);

@@ -65,11 +65,11 @@ public:
     // Close this scanner
     virtual void close();

-private:
+protected:
     // Read next buffer from reader
     Status open_next_reader();

-private:
+protected:
     //const TBrokerScanRangeParams& _params;
     const std::vector<TBrokerRangeDesc>& _ranges;
     const std::vector<TNetworkAddress>& _broker_addresses;
@@ -78,27 +78,29 @@ static TFileFormatType::type parse_format(const std::string& format_str,
         return parse_format("CSV", compress_type);
     }
     TFileFormatType::type format_type = TFileFormatType::FORMAT_UNKNOWN;
-    if (boost::iequals(format_str, "CSV")) {
+    if (iequal(format_str, "CSV")) {
         if (compress_type.empty()) {
             format_type = TFileFormatType::FORMAT_CSV_PLAIN;
         }
-        if (boost::iequals(compress_type, "GZ")) {
+        if (iequal(compress_type, "GZ")) {
             format_type = TFileFormatType::FORMAT_CSV_GZ;
-        } else if (boost::iequals(compress_type, "LZO")) {
+        } else if (iequal(compress_type, "LZO")) {
             format_type = TFileFormatType::FORMAT_CSV_LZO;
-        } else if (boost::iequals(compress_type, "BZ2")) {
+        } else if (iequal(compress_type, "BZ2")) {
             format_type = TFileFormatType::FORMAT_CSV_BZ2;
-        } else if (boost::iequals(compress_type, "LZ4FRAME")) {
+        } else if (iequal(compress_type, "LZ4FRAME")) {
             format_type = TFileFormatType::FORMAT_CSV_LZ4FRAME;
-        } else if (boost::iequals(compress_type, "LZOP")) {
+        } else if (iequal(compress_type, "LZOP")) {
             format_type = TFileFormatType::FORMAT_CSV_LZOP;
-        } else if (boost::iequals(compress_type, "DEFLATE")) {
+        } else if (iequal(compress_type, "DEFLATE")) {
             format_type = TFileFormatType::FORMAT_CSV_DEFLATE;
         }
-    } else if (boost::iequals(format_str, "JSON")) {
+    } else if (iequal(format_str, "JSON")) {
         if (compress_type.empty()) {
             format_type = TFileFormatType::FORMAT_JSON;
         }
+    } else if (iequal(format_str, "PARQUET")) {
+        format_type = TFileFormatType::FORMAT_PARQUET;
     }
     return format_type;
 }
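In short, the new branch makes PARQUET a recognized format string (an illustrative note, not code from this commit):

    // parse_format("PARQUET", "") -> TFileFormatType::FORMAT_PARQUET
    // parse_format("CSV", "GZ")   -> TFileFormatType::FORMAT_CSV_GZ (unchanged)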
@@ -264,12 +266,12 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, StreamLoadContext* ct

     // get format of this put
     if (!http_req->header(HTTP_COMPRESS_TYPE).empty() &&
-        boost::iequals(http_req->header(HTTP_FORMAT_KEY), "JSON")) {
+        iequal(http_req->header(HTTP_FORMAT_KEY), "JSON")) {
         return Status::InternalError("compress data of JSON format is not supported.");
     }
     std::string format_str = http_req->header(HTTP_FORMAT_KEY);
-    if (boost::iequals(format_str, BeConsts::CSV_WITH_NAMES) ||
-        boost::iequals(format_str, BeConsts::CSV_WITH_NAMES_AND_TYPES)) {
+    if (iequal(format_str, BeConsts::CSV_WITH_NAMES) ||
+        iequal(format_str, BeConsts::CSV_WITH_NAMES_AND_TYPES)) {
         ctx->header_type = format_str;
         //treat as CSV
         format_str = BeConsts::CSV;
@@ -291,7 +293,7 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, StreamLoadContext* ct
     size_t json_max_body_bytes = config::streaming_load_json_max_mb * 1024 * 1024;
     bool read_json_by_line = false;
     if (!http_req->header(HTTP_READ_JSON_BY_LINE).empty()) {
-        if (boost::iequals(http_req->header(HTTP_READ_JSON_BY_LINE), "true")) {
+        if (iequal(http_req->header(HTTP_READ_JSON_BY_LINE), "true")) {
             read_json_by_line = true;
         }
     }
@@ -440,9 +442,9 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, StreamLoadContext*
         request.__set_negative(false);
     }
     if (!http_req->header(HTTP_STRICT_MODE).empty()) {
-        if (boost::iequals(http_req->header(HTTP_STRICT_MODE), "false")) {
+        if (iequal(http_req->header(HTTP_STRICT_MODE), "false")) {
             request.__set_strictMode(false);
-        } else if (boost::iequals(http_req->header(HTTP_STRICT_MODE), "true")) {
+        } else if (iequal(http_req->header(HTTP_STRICT_MODE), "true")) {
             request.__set_strictMode(true);
         } else {
             return Status::InvalidArgument("Invalid strict mode format. Must be bool type");
@@ -465,7 +467,7 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, StreamLoadContext*
         request.__set_json_root(http_req->header(HTTP_JSONROOT));
     }
     if (!http_req->header(HTTP_STRIP_OUTER_ARRAY).empty()) {
-        if (boost::iequals(http_req->header(HTTP_STRIP_OUTER_ARRAY), "true")) {
+        if (iequal(http_req->header(HTTP_STRIP_OUTER_ARRAY), "true")) {
             request.__set_strip_outer_array(true);
         } else {
             request.__set_strip_outer_array(false);
@@ -474,7 +476,7 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, StreamLoadContext*
         request.__set_strip_outer_array(false);
     }
     if (!http_req->header(HTTP_NUM_AS_STRING).empty()) {
-        if (boost::iequals(http_req->header(HTTP_NUM_AS_STRING), "true")) {
+        if (iequal(http_req->header(HTTP_NUM_AS_STRING), "true")) {
             request.__set_num_as_string(true);
         } else {
             request.__set_num_as_string(false);
@@ -483,7 +485,7 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, StreamLoadContext*
         request.__set_num_as_string(false);
     }
     if (!http_req->header(HTTP_FUZZY_PARSE).empty()) {
-        if (boost::iequals(http_req->header(HTTP_FUZZY_PARSE), "true")) {
+        if (iequal(http_req->header(HTTP_FUZZY_PARSE), "true")) {
             request.__set_fuzzy_parse(true);
         } else {
             request.__set_fuzzy_parse(false);
@@ -493,7 +495,7 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, StreamLoadContext*
     }

     if (!http_req->header(HTTP_READ_JSON_BY_LINE).empty()) {
-        if (boost::iequals(http_req->header(HTTP_READ_JSON_BY_LINE), "true")) {
+        if (iequal(http_req->header(HTTP_READ_JSON_BY_LINE), "true")) {
             request.__set_read_json_by_line(true);
         } else {
             request.__set_read_json_by_line(false);
@@ -517,7 +519,7 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, StreamLoadContext*
     }

     if (!http_req->header(HTTP_LOAD_TO_SINGLE_TABLET).empty()) {
-        if (boost::iequals(http_req->header(HTTP_LOAD_TO_SINGLE_TABLET), "true")) {
+        if (iequal(http_req->header(HTTP_LOAD_TO_SINGLE_TABLET), "true")) {
             request.__set_load_to_single_tablet(true);
         } else {
             request.__set_load_to_single_tablet(false);

@@ -104,6 +104,7 @@ public:

    void reset() override {
        _count = 0;
        _finished = false;
        _rle_encoder->Clear();
        _rle_encoder->Reserve(RLE_PAGE_HEADER_SIZE, 0);
    }

@@ -102,6 +102,7 @@ set(VEC_FILES
    exec/vbroker_scan_node.cpp
    exec/vbroker_scanner.cpp
    exec/vjson_scanner.cpp
    exec/vparquet_scanner.cpp
    exec/join/vhash_join_node.cpp
    exprs/vectorized_agg_fn.cpp
    exprs/vectorized_fn_call.cpp
@@ -191,6 +192,7 @@ set(VEC_FILES
    runtime/vdata_stream_recvr.cpp
    runtime/vdata_stream_mgr.cpp
    runtime/vpartition_info.cpp
    utils/arrow_column_to_doris_column.cpp
    runtime/vsorted_run_merger.cpp)

add_library(Vec STATIC
@@ -260,4 +260,67 @@ DataTypePtr DataTypeFactory::create_data_type(const PColumnMeta& pcolumn) {
    return nested;
}

DataTypePtr DataTypeFactory::create_data_type(const arrow::Type::type& type, bool is_nullable) {
    DataTypePtr nested = nullptr;
    switch (type) {
    case ::arrow::Type::BOOL:
        nested = std::make_shared<vectorized::DataTypeUInt8>();
        break;
    case ::arrow::Type::INT8:
        nested = std::make_shared<vectorized::DataTypeInt8>();
        break;
    case ::arrow::Type::UINT8:
        nested = std::make_shared<vectorized::DataTypeUInt8>();
        break;
    case ::arrow::Type::INT16:
        nested = std::make_shared<vectorized::DataTypeInt16>();
        break;
    case ::arrow::Type::UINT16:
        nested = std::make_shared<vectorized::DataTypeUInt16>();
        break;
    case ::arrow::Type::INT32:
        nested = std::make_shared<vectorized::DataTypeInt32>();
        break;
    case ::arrow::Type::UINT32:
        nested = std::make_shared<vectorized::DataTypeUInt32>();
        break;
    case ::arrow::Type::INT64:
        nested = std::make_shared<vectorized::DataTypeInt64>();
        break;
    case ::arrow::Type::UINT64:
        nested = std::make_shared<vectorized::DataTypeUInt64>();
        break;
    case ::arrow::Type::HALF_FLOAT:
    case ::arrow::Type::FLOAT:
        nested = std::make_shared<vectorized::DataTypeFloat32>();
        break;
    case ::arrow::Type::DOUBLE:
        nested = std::make_shared<vectorized::DataTypeFloat64>();
        break;
    case ::arrow::Type::DATE32:
        nested = std::make_shared<vectorized::DataTypeDate>();
        break;
    case ::arrow::Type::DATE64:
    case ::arrow::Type::TIMESTAMP:
        nested = std::make_shared<vectorized::DataTypeDateTime>();
        break;
    case ::arrow::Type::BINARY:
    case ::arrow::Type::FIXED_SIZE_BINARY:
    case ::arrow::Type::STRING:
        nested = std::make_shared<vectorized::DataTypeString>();
        break;
    case ::arrow::Type::DECIMAL:
        nested = std::make_shared<vectorized::DataTypeDecimal<vectorized::Decimal128>>(27, 9);
        break;
    default:
        DCHECK(false) << "invalid arrow type:" << (int)type;
        break;
    }

    if (nested && is_nullable) {
        return std::make_shared<vectorized::DataTypeNullable>(nested);
    }
    return nested;
}

} // namespace doris::vectorized

@@ -22,6 +22,7 @@
#include <mutex>
#include <string>

#include "arrow/type.h"
#include "gen_cpp/data.pb.h"
#include "olap/field.h"
#include "olap/tablet_schema.h"
@@ -87,6 +88,8 @@ public:

    DataTypePtr create_data_type(const PColumnMeta& pcolumn);

    DataTypePtr create_data_type(const arrow::Type::type& type, bool is_nullable);

private:
    DataTypePtr _create_primitive_data_type(const FieldType& type) const;

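As a quick illustration of the new factory overload, the snippet below maps an Arrow field to a Doris vectorized type; the variable names are illustrative only:

    std::shared_ptr<arrow::Field> field = arrow::field("c0", arrow::int16());
    // INT16 maps to DataTypeInt16; is_nullable=true wraps it in DataTypeNullable.
    DataTypePtr type =
            DataTypeFactory::instance().create_data_type(field->type()->id(), true);
    // Yields a ColumnNullable wrapping a ColumnVector<Int16>.
    MutableColumnPtr column = type->create_column();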
be/src/vec/exec/vparquet_scanner.cpp (new file, 311 lines)
@@ -0,0 +1,311 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "vec/exec/vparquet_scanner.h"

#include "exec/parquet_reader.h"
#include "exprs/expr.h"
#include "runtime/descriptors.h"
#include "runtime/exec_env.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/functions/simple_function_factory.h"
#include "vec/utils/arrow_column_to_doris_column.h"

namespace doris::vectorized {

VParquetScanner::VParquetScanner(RuntimeState* state, RuntimeProfile* profile,
                                 const TBrokerScanRangeParams& params,
                                 const std::vector<TBrokerRangeDesc>& ranges,
                                 const std::vector<TNetworkAddress>& broker_addresses,
                                 const std::vector<TExpr>& pre_filter_texprs,
                                 ScannerCounter* counter)
        : ParquetScanner(state, profile, params, ranges, broker_addresses, pre_filter_texprs,
                         counter),
          _batch(nullptr),
          _arrow_batch_cur_idx(0),
          _num_of_columns_from_file(0) {}

VParquetScanner::~VParquetScanner() {}

Status VParquetScanner::open() {
    RETURN_IF_ERROR(ParquetScanner::open());
    if (_ranges.empty()) {
        return Status::OK();
    }
    auto range = _ranges[0];
    _num_of_columns_from_file = range.__isset.num_of_columns_from_file
                                        ? implicit_cast<int>(range.num_of_columns_from_file)
                                        : implicit_cast<int>(_src_slot_descs.size());

    // check consistency
    if (range.__isset.num_of_columns_from_file) {
        int size = range.columns_from_path.size();
        for (const auto& r : _ranges) {
            if (r.columns_from_path.size() != size) {
                return Status::InternalError("ranges have different number of columns.");
            }
        }
    }
    return Status::OK();
}

// get next available arrow batch
Status VParquetScanner::_next_arrow_batch() {
    _arrow_batch_cur_idx = 0;
    // first, init file reader
    if (_cur_file_reader == nullptr || _cur_file_eof) {
        RETURN_IF_ERROR(open_next_reader());
        _cur_file_eof = false;
    }
    // second, loop until we find an available arrow batch or hit EOF
    while (!_scanner_eof) {
        RETURN_IF_ERROR(_cur_file_reader->next_batch(&_batch, _src_slot_descs, &_cur_file_eof));
        if (_cur_file_eof) {
            RETURN_IF_ERROR(open_next_reader());
            _cur_file_eof = false;
            continue;
        }
        if (_batch->num_rows() == 0) {
            continue;
        }
        return Status::OK();
    }
    return Status::EndOfFile("EOF");
}

Status VParquetScanner::_init_arrow_batch_if_necessary() {
    // 1. init batch if first time
    // 2. reset reader if end of file
    Status status;
    if (_scanner_eof) {
        return Status::EndOfFile("EOF");
    }
    if (_batch == nullptr || _arrow_batch_cur_idx >= _batch->num_rows()) {
        return _next_arrow_batch();
    }
    return status;
}

Status VParquetScanner::_init_src_block(Block* block) {
    size_t batch_pos = 0;
    block->clear();
    for (auto i = 0; i < _num_of_columns_from_file; ++i) {
        SlotDescriptor* slot_desc = _src_slot_descs[i];
        if (slot_desc == nullptr) {
            continue;
        }
        auto* array = _batch->column(batch_pos++).get();
        // let the src column be nullable to simplify the conversion
        // TODO: support non-nullable columns for more efficient execution
        auto is_nullable = true;
        DataTypePtr data_type =
                DataTypeFactory::instance().create_data_type(array->type()->id(), is_nullable);
        if (data_type == nullptr) {
            return Status::NotSupported(
                    fmt::format("Not support arrow type:{}", array->type()->name()));
        }
        MutableColumnPtr data_column = data_type->create_column();
        block->insert(
                ColumnWithTypeAndName(std::move(data_column), data_type, slot_desc->col_name()));
    }
    return Status::OK();
}

Status VParquetScanner::get_next(vectorized::Block* block, bool* eof) {
    // overview of the type conversion:
    // arrow type ==arrow_column_to_doris_column==> primitive type(PT0) ==cast_src_block==>
    // primitive type(PT1) ==materialize_block==> dest primitive type

    // first, we need to convert the arrow type to the corresponding internal type,
    // such as arrow::INT16 to TYPE_SMALLINT(PT0).
    // why is the first step needed? we cannot convert the arrow type directly to the
    // type in the src desc; that is too hard to achieve.

    // second, convert PT0 to the type in the src desc, such as TYPE_SMALLINT to TYPE_VARCHAR.(PT1)
    // why is the second step needed? the materialize step only accepts types specified in the src desc.

    // finally, through the materialize step, convert to the type in the dest desc, such as TYPE_DATETIME.
    SCOPED_TIMER(_read_timer);
    // init arrow batch
    {
        Status st = _init_arrow_batch_if_necessary();
        if (!st.ok()) {
            if (!st.is_end_of_file()) {
                return st;
            }
            *eof = true;
            return Status::OK();
        }
    }
    Block src_block;
    RETURN_IF_ERROR(_init_src_block(&src_block));
    // convert arrow batches to the block until reaching batch_size
    while (!_scanner_eof) {
        // cast arrow type to PT0 and append it to src block
        // for example: arrow::Type::INT16 => TYPE_SMALLINT
        RETURN_IF_ERROR(_append_batch_to_src_block(&src_block));
        // finalize the src block if full
        if (src_block.rows() >= _state->batch_size()) {
            break;
        }
        auto status = _next_arrow_batch();
        // if ok, append the batch to the src columns
        if (status.ok()) {
            continue;
        }
        // return error if not EOF
        if (!status.is_end_of_file()) {
            return status;
        }
        _cur_file_eof = true;
        break;
    }
    COUNTER_UPDATE(_rows_read_counter, src_block.rows());
    SCOPED_TIMER(_materialize_timer);
    // cast PT0 => PT1
    // for example: TYPE_SMALLINT => TYPE_VARCHAR
    RETURN_IF_ERROR(_cast_src_block(&src_block));
    // fill the columns derived from the file path of the current range
    _fill_columns_from_path(&src_block);
    RETURN_IF_ERROR(_eval_conjuncts(&src_block));
    // materialize, src block => dest columns
    RETURN_IF_ERROR(_materialize_block(&src_block, block));
    *eof = _scanner_eof;
    return Status::OK();
}

// evaluate pre-filter conjuncts, for example: t1 > 1
Status VParquetScanner::_eval_conjuncts(Block* block) {
    for (auto& vctx : _vpre_filter_ctxs) {
        size_t orig_rows = block->rows();
        RETURN_IF_ERROR(VExprContext::filter_block(vctx, block, block->columns()));
        _counter->num_rows_unselected += orig_rows - block->rows();
    }
    return Status::OK();
}

void VParquetScanner::_fill_columns_from_path(Block* block) {
    const TBrokerRangeDesc& range = _ranges.at(_next_range - 1);
    if (range.__isset.num_of_columns_from_file) {
        size_t start = range.num_of_columns_from_file;
        size_t rows = block->rows();
        for (size_t i = 0; i < range.columns_from_path.size(); ++i) {
            auto slot_desc = _src_slot_descs.at(i + start);
            if (slot_desc == nullptr) continue;
            auto is_nullable = slot_desc->is_nullable();
            DataTypePtr data_type =
                    DataTypeFactory::instance().create_data_type(TYPE_VARCHAR, is_nullable);
            MutableColumnPtr data_column = data_type->create_column();
            const std::string& column_from_path = range.columns_from_path[i];
            for (size_t i = 0; i < rows; ++i) {
                data_column->insert_data(const_cast<char*>(column_from_path.c_str()),
                                         column_from_path.size());
            }
            block->insert(ColumnWithTypeAndName(std::move(data_column), data_type,
                                                slot_desc->col_name()));
        }
    }
}

Status VParquetScanner::_materialize_block(Block* block, Block* dest_block) {
    int ctx_idx = 0;
    size_t orig_rows = block->rows();
    auto filter_column = ColumnUInt8::create(orig_rows, 1);
    for (auto slot_desc : _dest_tuple_desc->slots()) {
        if (!slot_desc->is_materialized()) {
            continue;
        }
        int dest_index = ctx_idx++;

        VExprContext* ctx = _dest_vexpr_ctx[dest_index];
        int result_column_id = 0;
        // PT1 => dest primitive type
        RETURN_IF_ERROR(ctx->execute(block, &result_column_id));
        ColumnPtr& ptr = block->safe_get_by_position(result_column_id).column;
        if (!slot_desc->is_nullable()) {
            if (auto* nullable_column = check_and_get_column<ColumnNullable>(*ptr)) {
                if (nullable_column->has_null()) {
                    // fill filter if src has null value and dest column is not nullable
                    IColumn::Filter& filter = assert_cast<ColumnUInt8&>(*filter_column).get_data();
                    const ColumnPtr& null_column_ptr = nullable_column->get_null_map_column_ptr();
                    const auto& column_data =
                            assert_cast<const ColumnUInt8&>(*null_column_ptr).get_data();
                    for (size_t i = 0; i < null_column_ptr->size(); ++i) {
                        filter[i] &= !column_data[i];
                    }
                }
                ptr = nullable_column->get_nested_column_ptr();
            }
        }
        dest_block->insert(vectorized::ColumnWithTypeAndName(
                std::move(ptr), slot_desc->get_data_type_ptr(), slot_desc->col_name()));
    }
    size_t dest_size = dest_block->columns();
    // do filter
    dest_block->insert(vectorized::ColumnWithTypeAndName(
            std::move(filter_column), std::make_shared<vectorized::DataTypeUInt8>(),
            "filter column"));
    RETURN_IF_ERROR(Block::filter_block(dest_block, dest_size, dest_size));
    _counter->num_rows_filtered += orig_rows - dest_block->rows();
    return Status::OK();
}

// arrow type ==arrow_column_to_doris_column==> primitive type(PT0) ==cast_src_block==>
// primitive type(PT1) ==materialize_block==> dest primitive type
Status VParquetScanner::_cast_src_block(Block* block) {
    // cast primitive type(PT0) to primitive type(PT1)
    for (size_t i = 0; i < _num_of_columns_from_file; ++i) {
        SlotDescriptor* slot_desc = _src_slot_descs[i];
        if (slot_desc == nullptr) {
            continue;
        }
        auto& arg = block->get_by_name(slot_desc->col_name());
        // remove nullable here; let get_function decide nullability
        auto return_type = slot_desc->get_data_type_ptr();
        ColumnsWithTypeAndName arguments {
                arg,
                {DataTypeString().create_column_const(
                         arg.column->size(), remove_nullable(return_type)->get_family_name()),
                 std::make_shared<DataTypeString>(), ""}};
        auto func_cast =
                SimpleFunctionFactory::instance().get_function("CAST", arguments, return_type);
        RETURN_IF_ERROR(func_cast->execute(nullptr, *block, {i}, i, arg.column->size()));
        block->get_by_position(i).type = std::move(return_type);
    }
    return Status::OK();
}

Status VParquetScanner::_append_batch_to_src_block(Block* block) {
    size_t num_elements = std::min<size_t>((_state->batch_size() - block->rows()),
                                           (_batch->num_rows() - _arrow_batch_cur_idx));
    size_t column_pos = 0;
    for (auto i = 0; i < _num_of_columns_from_file; ++i) {
        SlotDescriptor* slot_desc = _src_slot_descs[i];
        if (slot_desc == nullptr) {
            continue;
        }
        auto* array = _batch->column(column_pos++).get();
        auto& column_with_type_and_name = block->get_by_name(slot_desc->col_name());
        RETURN_IF_ERROR(arrow_column_to_doris_column(array, _arrow_batch_cur_idx,
                                                     column_with_type_and_name, num_elements,
                                                     _state->timezone()));
    }

    _arrow_batch_cur_idx += num_elements;
    return Status::OK();
}

} // namespace doris::vectorized
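Putting the pieces together, a scan node drives the new scanner roughly as follows (a sketch; the construction arguments and the consume step are assumptions, and error handling is elided):

    vectorized::VParquetScanner scanner(state, profile, params, ranges, broker_addresses,
                                        pre_filter_texprs, &counter);
    RETURN_IF_ERROR(scanner.open());
    bool eof = false;
    while (!eof) {
        vectorized::Block block;
        RETURN_IF_ERROR(scanner.get_next(&block, &eof));
        // block holds up to state->batch_size() rows, already cast to the
        // destination tuple's types and pre-filtered by the conjuncts.
        consume(block);
    }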
be/src/vec/exec/vparquet_scanner.h (new file, 70 lines)
@@ -0,0 +1,70 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <arrow/array.h>
#include <exec/parquet_scanner.h>

#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "common/status.h"
#include "gen_cpp/PlanNodes_types.h"
#include "gen_cpp/Types_types.h"
#include "runtime/mem_pool.h"
#include "util/runtime_profile.h"

namespace doris::vectorized {

// VParquetScanner converts the data read from Parquet into Doris's columns.
class VParquetScanner : public ParquetScanner {
public:
    VParquetScanner(RuntimeState* state, RuntimeProfile* profile,
                    const TBrokerScanRangeParams& params,
                    const std::vector<TBrokerRangeDesc>& ranges,
                    const std::vector<TNetworkAddress>& broker_addresses,
                    const std::vector<TExpr>& pre_filter_texprs, ScannerCounter* counter);

    virtual ~VParquetScanner();

    // Open this scanner; initializes the information needed for reading.
    Status open();

    Status get_next(Block* block, bool* eof);

private:
    Status _next_arrow_batch();
    Status _init_arrow_batch_if_necessary();
    Status _init_src_block(Block* block);
    Status _append_batch_to_src_block(Block* block);
    Status _cast_src_block(Block* block);
    Status _eval_conjuncts(Block* block);
    Status _materialize_block(Block* block, Block* dest_block);
    void _fill_columns_from_path(Block* block);

private:
    std::shared_ptr<arrow::RecordBatch> _batch;
    size_t _arrow_batch_cur_idx;
    int _num_of_columns_from_file;
};

} // namespace doris::vectorized
@@ -965,8 +965,9 @@ private:
                 !(check_and_get_data_type<DataTypeDateTime>(from_type.get()) ||
                   check_and_get_data_type<DataTypeDate>(from_type.get()))) {
                 function = FunctionConvertToTimeType<DataType, NameCast>::create();
-            } else
+            } else {
                 function = FunctionTo<DataType>::Type::create();
+            }

             /// Check conversion using underlying function
             { function->get_return_type(ColumnsWithTypeAndName(1, {nullptr, from_type, ""})); }

be/src/vec/utils/arrow_column_to_doris_column.cpp (new file, 289 lines)
@@ -0,0 +1,289 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "vec/utils/arrow_column_to_doris_column.h"

#include <arrow/array.h>
#include <arrow/record_batch.h>
#include <arrow/status.h>

#include "arrow/array/array_binary.h"
#include "arrow/array/array_nested.h"
#include "arrow/scalar.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "gutil/casts.h"
#include "vec/columns/column_nullable.h"
#include "vec/data_types/data_type_decimal.h"
#include "vec/runtime/vdatetime_value.h"

#define FOR_ARROW_TYPES(M)                            \
    M(::arrow::Type::BOOL, TYPE_BOOLEAN)              \
    M(::arrow::Type::INT8, TYPE_TINYINT)              \
    M(::arrow::Type::UINT8, TYPE_TINYINT)             \
    M(::arrow::Type::INT16, TYPE_SMALLINT)            \
    M(::arrow::Type::UINT16, TYPE_SMALLINT)           \
    M(::arrow::Type::INT32, TYPE_INT)                 \
    M(::arrow::Type::UINT32, TYPE_INT)                \
    M(::arrow::Type::INT64, TYPE_BIGINT)              \
    M(::arrow::Type::UINT64, TYPE_BIGINT)             \
    M(::arrow::Type::HALF_FLOAT, TYPE_FLOAT)          \
    M(::arrow::Type::FLOAT, TYPE_FLOAT)               \
    M(::arrow::Type::DOUBLE, TYPE_DOUBLE)             \
    M(::arrow::Type::BINARY, TYPE_VARCHAR)            \
    M(::arrow::Type::FIXED_SIZE_BINARY, TYPE_VARCHAR) \
    M(::arrow::Type::STRING, TYPE_VARCHAR)            \
    M(::arrow::Type::TIMESTAMP, TYPE_DATETIME)        \
    M(::arrow::Type::DATE32, TYPE_DATE)               \
    M(::arrow::Type::DATE64, TYPE_DATETIME)           \
    M(::arrow::Type::DECIMAL, TYPE_DECIMALV2)

#define FOR_ARROW_NUMERIC_TYPES(M)      \
    M(arrow::Type::UINT8, UInt8)        \
    M(arrow::Type::INT8, Int8)          \
    M(arrow::Type::INT16, Int16)        \
    M(arrow::Type::UINT16, UInt16)      \
    M(arrow::Type::INT32, Int32)        \
    M(arrow::Type::UINT32, UInt32)      \
    M(arrow::Type::UINT64, UInt64)      \
    M(arrow::Type::INT64, Int64)        \
    M(arrow::Type::HALF_FLOAT, Float32) \
    M(arrow::Type::FLOAT, Float32)      \
    M(arrow::Type::DOUBLE, Float64)

namespace doris::vectorized {

const PrimitiveType arrow_type_to_primitive_type(::arrow::Type::type type) {
    switch (type) {
#define DISPATCH(ARROW_TYPE, CPP_TYPE) \
    case ARROW_TYPE:                   \
        return CPP_TYPE;
        FOR_ARROW_TYPES(DISPATCH)
#undef DISPATCH
    default:
        break;
    }
    return INVALID_TYPE;
}

static size_t fill_nullable_column(const arrow::Array* array, size_t array_idx,
                                   vectorized::ColumnNullable* nullable_column,
                                   size_t num_elements) {
    size_t null_elements_count = 0;
    NullMap& map_data = nullable_column->get_null_map_data();
    for (size_t i = 0; i < num_elements; ++i) {
        auto is_null = array->IsNull(array_idx + i);
        map_data.emplace_back(is_null);
        null_elements_count += is_null;
    }
    return null_elements_count;
}

/// Inserts chars and offsets right into internal column data to reduce overhead.
/// Internal offsets are shifted by one to the right in comparison with Arrow ones. So the last offset should map to the end of all chars.
/// Also internal strings are null terminated.
static Status convert_column_with_string_data(const arrow::Array* array, size_t array_idx,
                                              MutableColumnPtr& data_column, size_t num_elements) {
    PaddedPODArray<UInt8>& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
    PaddedPODArray<UInt32>& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();

    auto concrete_array = down_cast<const arrow::BinaryArray*>(array);
    std::shared_ptr<arrow::Buffer> buffer = concrete_array->value_data();

    for (size_t offset_i = array_idx; offset_i < array_idx + num_elements; ++offset_i) {
        if (!concrete_array->IsNull(offset_i) && buffer) {
            const auto* raw_data = buffer->data() + concrete_array->value_offset(offset_i);
            column_chars_t.insert(raw_data, raw_data + concrete_array->value_length(offset_i));
        }
        column_chars_t.emplace_back('\0');

        column_offsets.emplace_back(column_chars_t.size());
    }
    return Status::OK();
}

static Status convert_column_with_fixed_size_data(const arrow::Array* array, size_t array_idx,
                                                  MutableColumnPtr& data_column,
                                                  size_t num_elements) {
    PaddedPODArray<UInt8>& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
    PaddedPODArray<UInt32>& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();

    auto concrete_array = down_cast<const arrow::FixedSizeBinaryArray*>(array);
    uint32_t width = concrete_array->byte_width();
    const auto* array_data = concrete_array->GetValue(array_idx);

    for (size_t offset_i = 0; offset_i < num_elements; ++offset_i) {
        if (!concrete_array->IsNull(offset_i)) {
            const auto* raw_data = array_data + (offset_i * width);
            column_chars_t.insert(raw_data, raw_data + width);
        }
        column_chars_t.emplace_back('\0');
        column_offsets.emplace_back(column_chars_t.size());
    }
    return Status::OK();
}

/// Inserts numeric data right into internal column data to reduce overhead
template <typename NumericType, typename VectorType = ColumnVector<NumericType>>
static Status convert_column_with_numeric_data(const arrow::Array* array, size_t array_idx,
                                               MutableColumnPtr& data_column, size_t num_elements) {
    auto& column_data = static_cast<VectorType&>(*data_column).get_data();
    /// buffers[0] is a null bitmap and buffers[1] are actual values
    std::shared_ptr<arrow::Buffer> buffer = array->data()->buffers[1];
    const auto* raw_data = reinterpret_cast<const NumericType*>(buffer->data()) + array_idx;
    column_data.insert(raw_data, raw_data + num_elements);
    return Status::OK();
}

static Status convert_column_with_boolean_data(const arrow::Array* array, size_t array_idx,
                                               MutableColumnPtr& data_column, size_t num_elements) {
    auto& column_data = static_cast<ColumnVector<UInt8>&>(*data_column).get_data();
    auto concrete_array = down_cast<const arrow::BooleanArray*>(array);
    for (size_t bool_i = array_idx; bool_i < array_idx + num_elements; ++bool_i) {
        column_data.emplace_back(concrete_array->Value(bool_i));
    }
    return Status::OK();
}

static int64_t time_unit_divisor(arrow::TimeUnit::type unit) {
    // Doris only supports seconds
    switch (unit) {
    case arrow::TimeUnit::type::SECOND: {
        return 1L;
    }
    case arrow::TimeUnit::type::MILLI: {
        return 1000L;
    }
    case arrow::TimeUnit::type::MICRO: {
        return 1000000L;
    }
    case arrow::TimeUnit::type::NANO: {
        return 1000000000L;
    }
    default:
        return 0L;
    }
}

template <typename ArrowType>
static Status convert_column_with_timestamp_data(const arrow::Array* array, size_t array_idx,
                                                 MutableColumnPtr& data_column, size_t num_elements,
                                                 const std::string& timezone) {
    auto& column_data = static_cast<ColumnVector<Int64>&>(*data_column).get_data();
    auto concrete_array = down_cast<const ArrowType*>(array);
    int64_t divisor = 1;
    int64_t multiplier = 1;
    if constexpr (std::is_same_v<ArrowType, arrow::TimestampArray>) {
        const auto type = std::static_pointer_cast<arrow::TimestampType>(array->type());
        divisor = time_unit_divisor(type->unit());
        if (divisor == 0L) {
            return Status::InternalError(fmt::format("Invalid Time Type:{}", type->name()));
        }
    } else if constexpr (std::is_same_v<ArrowType, arrow::Date32Array>) {
        multiplier = 24 * 60 * 60; // day => secs
    } else if constexpr (std::is_same_v<ArrowType, arrow::Date64Array>) {
        divisor = 1000; // ms => secs
    }

    for (size_t value_i = array_idx; value_i < array_idx + num_elements; ++value_i) {
        VecDateTimeValue v;
        v.from_unixtime(static_cast<Int64>(concrete_array->Value(value_i)) / divisor * multiplier,
                        timezone);
        if constexpr (std::is_same_v<ArrowType, arrow::Date32Array>) {
            v.cast_to_date();
        }
        column_data.emplace_back(binary_cast<VecDateTimeValue, Int64>(v));
    }
    return Status::OK();
}

static Status convert_column_with_decimal_data(const arrow::Array* array, size_t array_idx,
                                               MutableColumnPtr& data_column, size_t num_elements) {
    auto& column_data =
            static_cast<ColumnDecimal<vectorized::Decimal128>&>(*data_column).get_data();
    auto concrete_array = down_cast<const arrow::DecimalArray*>(array);
    const auto* arrow_decimal_type = static_cast<arrow::DecimalType*>(array->type().get());
    // TODO check precision
    //size_t precision = arrow_decimal_type->precision();
    const auto scale = arrow_decimal_type->scale();

    for (size_t value_i = array_idx; value_i < array_idx + num_elements; ++value_i) {
        auto value =
                *reinterpret_cast<const vectorized::Decimal128*>(concrete_array->Value(value_i));
        // convert scale to 9
        if (scale != 9) {
            value = convert_decimals<vectorized::DataTypeDecimal<vectorized::Decimal128>,
                                     vectorized::DataTypeDecimal<vectorized::Decimal128>>(value,
                                                                                          scale, 9);
        }
        column_data.emplace_back(value);
    }
    return Status::OK();
}

Status arrow_column_to_doris_column(const arrow::Array* arrow_column, size_t arrow_batch_cur_idx,
                                    ColumnWithTypeAndName& doris_column, size_t num_elements,
                                    const std::string& timezone) {
    // the src column is always nullable, to simplify the conversion
    assert(doris_column.column->is_nullable());
    MutableColumnPtr data_column = nullptr;
    if (doris_column.column->is_nullable()) {
        auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
                (*std::move(doris_column.column)).mutate().get());
        fill_nullable_column(arrow_column, arrow_batch_cur_idx, nullable_column, num_elements);
        data_column = nullable_column->get_nested_column_ptr();
    } else {
        data_column = (*std::move(doris_column.column)).mutate();
    }
    // process data
    switch (arrow_column->type()->id()) {
    case arrow::Type::STRING:
    case arrow::Type::BINARY:
        return convert_column_with_string_data(arrow_column, arrow_batch_cur_idx, data_column,
                                               num_elements);
    case arrow::Type::FIXED_SIZE_BINARY:
        return convert_column_with_fixed_size_data(arrow_column, arrow_batch_cur_idx, data_column,
                                                   num_elements);
#define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE)             \
    case ARROW_NUMERIC_TYPE:                                       \
        return convert_column_with_numeric_data<CPP_NUMERIC_TYPE>( \
                arrow_column, arrow_batch_cur_idx, data_column, num_elements);
        FOR_ARROW_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
    case arrow::Type::BOOL:
        return convert_column_with_boolean_data(arrow_column, arrow_batch_cur_idx, data_column,
                                                num_elements);
    case arrow::Type::DATE32:
        return convert_column_with_timestamp_data<arrow::Date32Array>(
                arrow_column, arrow_batch_cur_idx, data_column, num_elements, timezone);
    case arrow::Type::DATE64:
        return convert_column_with_timestamp_data<arrow::Date64Array>(
                arrow_column, arrow_batch_cur_idx, data_column, num_elements, timezone);
    case arrow::Type::TIMESTAMP:
        return convert_column_with_timestamp_data<arrow::TimestampArray>(
                arrow_column, arrow_batch_cur_idx, data_column, num_elements, timezone);
    case arrow::Type::DECIMAL:
        return convert_column_with_decimal_data(arrow_column, arrow_batch_cur_idx, data_column,
                                                num_elements);
    default:
        break;
    }
    return Status::NotSupported(
            fmt::format("Not support arrow type:{}", arrow_column->type()->name()));
}
} // namespace doris::vectorized
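To make the time-unit normalization in convert_column_with_timestamp_data concrete, a short worked example (the values are illustrative):

    // TimestampArray, unit MILLI: divisor = 1000, multiplier = 1
    //   1653004800000 / 1000 * 1 = 1653004800 unix seconds -> from_unixtime(.., timezone)
    // Date32Array (days since epoch): divisor = 1, multiplier = 86400
    //   19130 / 1 * 86400 = 1652832000 unix seconds -> then cast_to_date()
    // Date64Array (milliseconds since epoch): divisor = 1000, multiplier = 1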
be/src/vec/utils/arrow_column_to_doris_column.h (new file, 40 lines)
@@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <arrow/type.h>
#include <arrow/type_fwd.h>

#include <iostream>
#include <memory>

#include "common/status.h"
#include "runtime/primitive_type.h"
#include "vec/core/column_with_type_and_name.h"

// This file contains utilities to convert Apache Arrow data
// into Doris's internal format.
namespace doris::vectorized {

const PrimitiveType arrow_type_to_primitive_type(::arrow::Type::type type);

Status arrow_column_to_doris_column(const arrow::Array* arrow_column, size_t arrow_batch_cur_idx,
                                    ColumnWithTypeAndName& doris_column, size_t num_elements,
                                    const std::string& timezone);

} // namespace doris::vectorized
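A minimal end-to-end use of these two helpers, mirroring what the tests below do (the array variable is assumed to come from an arrow::RecordBatch):

    auto pt = arrow_type_to_primitive_type(array->type()->id());
    DCHECK(pt != INVALID_TYPE);
    DataTypePtr type = DataTypeFactory::instance().create_data_type(pt, true);
    ColumnWithTypeAndName column(type->create_column(), type, "c0");
    RETURN_IF_ERROR(arrow_column_to_doris_column(array.get(), /*arrow_batch_cur_idx=*/0, column,
                                                 array->length(), "UTC"));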
@@ -357,6 +357,7 @@ set(VEC_TEST_FILES
    vec/function/function_test_util.cpp
    vec/function/table_function_test.cpp
    vec/runtime/vdata_stream_test.cpp
    vec/utils/arrow_column_to_doris_column_test.cpp
)

add_executable(doris_be_test
be/test/vec/utils/arrow_column_to_doris_column_test.cpp (new file, 609 lines)
@@ -0,0 +1,609 @@
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/utils/arrow_column_to_doris_column.h"
|
||||
|
||||
#include <arrow/array.h>
|
||||
#include <arrow/builder.h>
|
||||
#include <arrow/memory_pool.h>
|
||||
#include <arrow/record_batch.h>
|
||||
#include <arrow/status.h>
|
||||
#include <arrow/testing/gtest_util.h>
|
||||
#include <arrow/testing/util.h>
|
||||
#include <arrow/util/bit_util.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array/array_binary.h"
|
||||
#include "arrow/array/array_nested.h"
|
||||
#include "arrow/array/builder_base.h"
|
||||
#include "arrow/scalar.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "gutil/casts.h"
|
||||
#include "vec/columns/column_nullable.h"
|
||||
#include "vec/data_types/data_type_decimal.h"
|
||||
#include "vec/data_types/data_type_factory.hpp"
|
||||
#include "vec/functions/simple_function_factory.h"
|
||||
#include "vec/runtime/vdatetime_value.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
template <typename ArrowType, typename ArrowCppType = typename arrow::TypeTraits<ArrowType>::CType>
|
||||
ArrowCppType string_to_arrow_datetime(std::shared_ptr<ArrowType> type, const std::string& value) {
|
||||
VecDateTimeValue tv;
|
||||
tv.from_date_str(value.c_str(), value.size());
|
||||
int64_t unix_seconds = 0;
|
||||
tv.unix_timestamp(&unix_seconds, "UTC");
|
||||
if constexpr (std::is_same_v<ArrowType, arrow::TimestampType>) {
|
||||
arrow::TimeUnit::type unit = type->unit();
|
||||
VecDateTimeValue vdtv;
|
||||
vdtv.from_unixtime(unix_seconds, "UTC");
|
||||
vdtv.unix_timestamp(&unix_seconds, type->timezone());
|
||||
switch (unit) {
|
||||
case arrow::TimeUnit::SECOND:
|
||||
return unix_seconds;
|
||||
case arrow::TimeUnit::MILLI:
|
||||
return unix_seconds * 1000L;
|
||||
case arrow::TimeUnit::MICRO:
|
||||
return unix_seconds * 1000'000L;
|
||||
case arrow::TimeUnit::NANO:
|
||||
return unix_seconds * 1000'000'000L;
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
} else if constexpr (std::is_same_v<ArrowType, arrow::Date32Type>) {
|
||||
return unix_seconds / (24 * 3600);
|
||||
} else if constexpr (std::is_same_v<ArrowType, arrow::Date64Type>) {
|
||||
return unix_seconds * 1000L;
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename ArrowType, bool is_nullable,
|
||||
typename ArrowCppType = typename arrow::TypeTraits<ArrowType>::CType>
|
||||
std::shared_ptr<arrow::Array> create_constant_numeric_array(size_t num_elements, ArrowCppType value,
|
||||
std::shared_ptr<arrow::DataType> type,
|
||||
size_t& counter) {
|
||||
std::vector<std::shared_ptr<arrow::Buffer>> buffers;
|
||||
buffers.resize(2);
|
||||
size_t null_bitmap_byte_size = (num_elements + 7) / 8;
|
||||
size_t data_byte_size = num_elements * sizeof(value);
|
||||
auto buffer0_res = arrow::AllocateBuffer(null_bitmap_byte_size);
|
||||
buffers[0] = std::move(buffer0_res.ValueOrDie());
|
||||
auto buffer1_res = arrow::AllocateBuffer(data_byte_size);
|
||||
buffers[1] = std::move(buffer1_res.ValueOrDie());
|
||||
auto* nulls = buffers[0]->mutable_data();
|
||||
auto* data = (ArrowCppType*)buffers[1]->mutable_data();
|
||||
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
if (is_nullable && (i % 2 == 0)) {
|
||||
arrow::bit_util::ClearBit(nulls, i);
|
||||
} else {
|
||||
arrow::bit_util::SetBit(nulls, i);
|
||||
}
|
||||
data[i] = value;
|
||||
}
|
||||
counter += num_elements;
|
||||
using ArrayType = typename arrow::TypeTraits<ArrowType>::ArrayType;
|
||||
auto array_data = std::make_shared<arrow::ArrayData>(type, num_elements, buffers);
|
||||
auto array = std::make_shared<ArrayType>(array_data);
|
||||
return std::static_pointer_cast<arrow::Array>(array);
|
||||
}
|
||||
|
||||
template <typename ArrowType, typename ColumnType, bool is_nullable,
|
||||
typename ArrowCppType = typename arrow::TypeTraits<ArrowType>::CType>
|
||||
void test_arrow_to_datetime_column(std::shared_ptr<ArrowType> type, ColumnWithTypeAndName& column,
|
||||
size_t num_elements, ArrowCppType arrow_datetime,
|
||||
VecDateTimeValue datetime, size_t& counter) {
|
||||
ASSERT_EQ(column.column->size(), counter);
|
||||
auto array = create_constant_numeric_array<ArrowType, is_nullable>(num_elements, arrow_datetime,
|
||||
type, counter);
|
||||
std::string time_zone = "UTC";
|
||||
if constexpr (std::is_same_v<ArrowType, arrow::TimestampType>) {
|
||||
time_zone = type->timezone();
|
||||
}
|
||||
auto ret = arrow_column_to_doris_column(array.get(), 0, column, num_elements, time_zone);
|
||||
ASSERT_EQ(ret.ok(), true);
|
||||
ASSERT_EQ(column.column->size(), counter);
|
||||
MutableColumnPtr data_column = nullptr;
|
||||
vectorized::ColumnNullable* nullable_column = nullptr;
|
||||
if (column.column->is_nullable()) {
|
||||
nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
|
||||
(*std::move(column.column)).mutate().get());
|
||||
data_column = nullable_column->get_nested_column_ptr();
|
||||
} else {
|
||||
data_column = (*std::move(column.column)).mutate();
|
||||
}
|
||||
auto& datetime_data = static_cast<ColumnType&>(*data_column).get_data();
|
||||
for (auto i = 0; i < num_elements; ++i) {
|
||||
auto idx = counter - num_elements + i;
|
||||
if (is_nullable) {
|
||||
ASSERT_NE(nullable_column, nullptr);
|
||||
NullMap& map_data = nullable_column->get_null_map_data();
|
||||
if (i % 2 == 0) {
|
||||
ASSERT_EQ(map_data[idx], true);
|
||||
} else {
|
||||
ASSERT_EQ(map_data[idx], false);
|
||||
auto val = binary_cast<VecDateTimeValue, Int64>(datetime);
|
||||
ASSERT_EQ(datetime_data[idx], val);
|
||||
}
|
||||
} else {
|
||||
auto val = binary_cast<VecDateTimeValue, Int64>(datetime);
|
||||
ASSERT_EQ(datetime_data[idx], val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ArrowType, typename ColumnType, bool is_nullable>
|
||||
void test_datetime(std::shared_ptr<ArrowType> type, const std::vector<std::string>& test_cases,
|
||||
size_t num_elements) {
|
||||
using ArrowCppType = typename arrow::TypeTraits<ArrowType>::CType;
|
||||
size_t counter = 0;
|
||||
auto pt = arrow_type_to_primitive_type(type->id());
|
||||
ASSERT_NE(pt, INVALID_TYPE);
|
||||
DataTypePtr data_type = DataTypeFactory::instance().create_data_type(pt, true);
|
||||
MutableColumnPtr data_column = data_type->create_column();
|
||||
ColumnWithTypeAndName column(std::move(data_column), data_type, "test_datatime_column");
|
||||
for (auto& value : test_cases) {
|
||||
ArrowCppType arrow_datetime = string_to_arrow_datetime<ArrowType>(type, value);
|
||||
VecDateTimeValue tv;
|
||||
tv.from_date_str(value.c_str(), value.size());
|
||||
test_arrow_to_datetime_column<ArrowType, ColumnType, is_nullable>(
|
||||
type, column, num_elements, arrow_datetime, tv, counter);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ArrowColumnToDorisColumnTest, test_date32_to_date) {
|
||||
auto type = std::make_shared<arrow::Date32Type>();
|
||||
std::vector<std::string> test_cases = {{"1970-01-01"}, {"2021-05-30"}, {"2022-05-08"}};
|
||||
test_datetime<arrow::Date32Type, ColumnVector<Int64>, false>(type, test_cases, 32);
|
||||
test_datetime<arrow::Date32Type, ColumnVector<Int64>, true>(type, test_cases, 32);
|
||||
}
|
||||
|
||||
TEST(ArrowColumnToDorisColumnTest, test_date64_to_datetime) {
|
||||
auto type = std::make_shared<arrow::Date64Type>();
|
||||
std::vector<std::string> test_cases = {
|
||||
{"1970-01-01 12:12:12"}, {"2021-05-30 22:22:22"}, {"2022-05-08 00:00:01"}};
|
||||
test_datetime<arrow::Date64Type, ColumnVector<Int64>, false>(type, test_cases, 64);
|
||||
test_datetime<arrow::Date64Type, ColumnVector<Int64>, true>(type, test_cases, 64);
|
||||
}
|
||||
|
||||
TEST(ArrowColumnToDorisColumnTest, test_timestamp_to_datetime) {
|
||||
auto type = std::make_shared<arrow::Date64Type>();
|
||||
std::vector<std::string> test_cases = {
|
||||
{"1970-01-01 12:12:12"}, {"2021-05-30 22:22:22"}, {"2022-05-08 00:00:01"}};
|
||||
std::vector<std::string> zones = {"UTC", "GMT", "CST", "+01:00",
|
||||
"-09:00", "Asia/Shanghai", "Europe/Zurich"};
|
||||
std::vector<arrow::TimeUnit::type> time_units = {arrow::TimeUnit::SECOND,
|
||||
arrow::TimeUnit::MICRO, arrow::TimeUnit::MILLI,
|
||||
arrow::TimeUnit::NANO};
|
||||
for (auto& unit : time_units) {
|
||||
for (auto& zone : zones) {
|
||||
auto type = std::make_shared<arrow::TimestampType>(unit, zone);
|
||||
test_datetime<arrow::TimestampType, ColumnVector<Int64>, false>(type, test_cases, 64);
|
||||
test_datetime<arrow::TimestampType, ColumnVector<Int64>, true>(type, test_cases, 64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ArrowType, typename CppType, bool is_nullable,
          typename ColumnType = ColumnVector<CppType>,
          typename ArrowCppType = typename arrow::TypeTraits<ArrowType>::CType>
void test_arrow_to_numeric_column(std::shared_ptr<ArrowType> type, ColumnWithTypeAndName& column,
                                  size_t num_elements, ArrowCppType arrow_numeric, CppType numeric,
                                  size_t& counter) {
    ASSERT_EQ(column.column->size(), counter);
    auto array = create_constant_numeric_array<ArrowType, is_nullable>(num_elements, arrow_numeric,
                                                                       type, counter);
    auto ret = arrow_column_to_doris_column(array.get(), 0, column, num_elements, "UTC");
    ASSERT_EQ(ret.ok(), true);
    ASSERT_EQ(column.column->size(), counter);
    MutableColumnPtr data_column = nullptr;
    vectorized::ColumnNullable* nullable_column = nullptr;
    if (column.column->is_nullable()) {
        nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
                (*std::move(column.column)).mutate().get());
        data_column = nullable_column->get_nested_column_ptr();
    } else {
        data_column = (*std::move(column.column)).mutate();
    }
    auto& numeric_data = static_cast<ColumnType&>(*data_column).get_data();
    for (auto i = 0; i < num_elements; ++i) {
        auto idx = counter - num_elements + i;
        if (is_nullable) {
            ASSERT_NE(nullable_column, nullptr);
            NullMap& map_data = nullable_column->get_null_map_data();
            if (i % 2 == 0) {
                ASSERT_EQ(map_data[idx], true);
            } else {
                ASSERT_EQ(map_data[idx], false);
                ASSERT_EQ(numeric_data[idx], numeric);
            }
        } else {
            ASSERT_EQ(numeric_data[idx], numeric);
        }
    }
}

template <typename ArrowType, typename CppType, bool is_nullable,
          typename ColumnType = ColumnVector<CppType>>
void test_numeric(std::shared_ptr<ArrowType> type, const std::vector<CppType>& test_cases,
                  size_t num_elements) {
    using ArrowCppType = typename arrow::TypeTraits<ArrowType>::CType;
    size_t counter = 0;
    auto pt = arrow_type_to_primitive_type(type->id());
    ASSERT_NE(pt, INVALID_TYPE);
    DataTypePtr data_type = DataTypeFactory::instance().create_data_type(pt, true);
    MutableColumnPtr data_column = data_type->create_column();
    ColumnWithTypeAndName column(std::move(data_column), data_type, "test_numeric_column");
    for (auto& value : test_cases) {
        test_arrow_to_numeric_column<ArrowType, CppType, is_nullable>(
                type, column, num_elements, ArrowCppType(value), value, counter);
    }
}

TEST(ArrowColumnToDorisColumnTest, test_int8) {
    auto type = std::make_shared<arrow::Int8Type>();
    std::vector<Int8> test_cases = {1, -1, -128, 127, int8_t(255)};
    test_numeric<arrow::Int8Type, Int8, false>(type, test_cases, 64);
    test_numeric<arrow::Int8Type, Int8, true>(type, test_cases, 64);
}

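// The unsigned cases below rely on C++ modular conversion, e.g. uint8_t(-1) == 255
// and uint16_t(-128) == 65408, to exercise values near the type boundaries.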
TEST(ArrowColumnToDorisColumnTest, test_uint8) {
    auto type = std::make_shared<arrow::UInt8Type>();
    std::vector<UInt8> test_cases = {uint8_t(-1), uint8_t(1), uint8_t(-128), uint8_t(127),
                                     uint8_t(255)};
    test_numeric<arrow::UInt8Type, UInt8, false>(type, test_cases, 64);
    test_numeric<arrow::UInt8Type, UInt8, true>(type, test_cases, 64);
}

TEST(ArrowColumnToDorisColumnTest, test_uint16) {
    auto type = std::make_shared<arrow::UInt16Type>();
    std::vector<UInt16> test_cases = {uint16_t(-1), uint16_t(1), uint16_t(-128), uint16_t(127),
                                      uint16_t(65535)};
    test_numeric<arrow::UInt16Type, UInt16, false>(type, test_cases, 64);
    test_numeric<arrow::UInt16Type, UInt16, true>(type, test_cases, 64);
}

TEST(ArrowColumnToDorisColumnTest, test_uint32) {
    auto type = std::make_shared<arrow::UInt32Type>();
    std::vector<UInt32> test_cases = {uint32_t(-1), uint32_t(1), uint32_t(-65535), uint32_t(65535),
                                      uint32_t(4294967295)};
    test_numeric<arrow::UInt32Type, UInt32, false>(type, test_cases, 64);
    test_numeric<arrow::UInt32Type, UInt32, true>(type, test_cases, 64);
}

TEST(ArrowColumnToDorisColumnTest, test_uint64) {
    auto type = std::make_shared<arrow::UInt64Type>();
    std::vector<UInt64> test_cases = {uint64_t(-1),
                                      uint64_t(1),
                                      uint64_t(-4294967295),
                                      uint64_t(4294967295),
                                      uint64_t(std::numeric_limits<uint64_t>::min()),
                                      uint64_t(std::numeric_limits<uint64_t>::max())};
    test_numeric<arrow::UInt64Type, UInt64, false>(type, test_cases, 64);
    test_numeric<arrow::UInt64Type, UInt64, true>(type, test_cases, 64);
}

TEST(ArrowColumnToDorisColumnTest, test_float64) {
    auto type = std::make_shared<arrow::DoubleType>();
    std::vector<double> test_cases = {double(-1.11f),
                                      double(1.11f),
                                      double(-4294967295),
                                      double(4294967295),
                                      double(std::numeric_limits<double>::min()),
                                      double(std::numeric_limits<double>::max())};
    test_numeric<arrow::DoubleType, Float64, false>(type, test_cases, 64);
    test_numeric<arrow::DoubleType, Float64, true>(type, test_cases, 64);
}

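// Builds a Decimal128 array by hand: buffers[0] is the validity bitmap and buffers[1]
// holds the 16-byte values, copied directly from the host int128_t (this assumes a
// little-endian host, matching Arrow's decimal byte order).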
template <bool is_nullable>
std::shared_ptr<arrow::Array> create_decimal_array(size_t num_elements, int128_t decimal,
                                                   std::shared_ptr<arrow::Decimal128Type> type,
                                                   size_t& counter) {
    std::vector<std::shared_ptr<arrow::Buffer>> buffers;
    buffers.resize(2);
    auto byte_width = type->byte_width();
    auto buffer0_res = arrow::AllocateBuffer((num_elements + 7) / 8);
    buffers[0] = std::move(buffer0_res.ValueOrDie());
    auto buffer1_res = arrow::AllocateBuffer(byte_width * num_elements);
    buffers[1] = std::move(buffer1_res.ValueOrDie());
    auto* nulls = buffers[0]->mutable_data();
    auto* data = buffers[1]->mutable_data();
    for (auto i = 0; i < num_elements; ++i) {
        if (is_nullable && (i % 2 == 0)) {
            arrow::bit_util::ClearBit(nulls, i);
        } else {
            arrow::bit_util::SetBit(nulls, i);
            memcpy(data + i * byte_width, &decimal, sizeof(decimal));
        }
    }
    auto array_data = std::make_shared<arrow::ArrayData>(type, num_elements, buffers);
    auto array = std::make_shared<arrow::Decimal128Array>(array_data);
    counter += num_elements;
    return array;
}

template <bool is_nullable>
void test_arrow_to_decimal_column(std::shared_ptr<arrow::Decimal128Type> type,
                                  ColumnWithTypeAndName& column, size_t num_elements,
                                  int128_t arrow_value, int128_t expect_value, size_t& counter) {
    ASSERT_EQ(column.column->size(), counter);
    auto array = create_decimal_array<is_nullable>(num_elements, arrow_value, type, counter);
    auto ret = arrow_column_to_doris_column(array.get(), 0, column, num_elements, "UTC");
    ASSERT_EQ(ret.ok(), true);
    ASSERT_EQ(column.column->size(), counter);
    MutableColumnPtr data_column = nullptr;
    vectorized::ColumnNullable* nullable_column = nullptr;
    if (column.column->is_nullable()) {
        nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
                (*std::move(column.column)).mutate().get());
        data_column = nullable_column->get_nested_column_ptr();
    } else {
        data_column = (*std::move(column.column)).mutate();
    }
    auto& decimal_data =
            static_cast<ColumnDecimal<vectorized::Decimal128>&>(*data_column).get_data();
    for (auto i = 0; i < num_elements; ++i) {
        auto idx = counter - num_elements + i;
        if (is_nullable) {
            ASSERT_NE(nullable_column, nullptr);
            NullMap& map_data = nullable_column->get_null_map_data();
            if (i % 2 == 0) {
                ASSERT_EQ(map_data[idx], true);
            } else {
                ASSERT_EQ(map_data[idx], false);
                ASSERT_EQ(Int128(decimal_data[idx]), expect_value);
            }
        } else {
            ASSERT_EQ(Int128(decimal_data[idx]), expect_value);
        }
    }
}

template <bool is_nullable>
void test_decimalv2(std::shared_ptr<arrow::Decimal128Type> type,
                    const std::vector<std::string>& test_cases, size_t num_elements) {
    using ArrowCppType = typename arrow::TypeTraits<arrow::Decimal128Type>::CType;
    size_t counter = 0;
    auto pt = arrow_type_to_primitive_type(type->id());
    ASSERT_NE(pt, INVALID_TYPE);
    DataTypePtr data_type = DataTypeFactory::instance().create_data_type(pt, true);
    MutableColumnPtr data_column = data_type->create_column();
    ColumnWithTypeAndName column(std::move(data_column), data_type, "test_numeric_column");
    for (auto& str : test_cases) {
        DecimalV2Value decimal_value(str);
        int128_t value = binary_cast<DecimalV2Value, int128_t>(decimal_value);
        int128_t expect_value =
                convert_decimals<vectorized::DataTypeDecimal<vectorized::Decimal128>,
                                 vectorized::DataTypeDecimal<vectorized::Decimal128>>(
                        value, type->scale(), 9);
        test_arrow_to_decimal_column<is_nullable>(type, column, num_elements, value, expect_value,
                                                  counter);
    }
}

TEST(ArrowColumnToDorisColumnTest, test_decimalv2) {
    std::vector<std::string> test_cases = {"1.2345678", "-12.34567890", "99999999999.99999999",
                                           "-99999999999.99999999"};
    auto type_p27s9 = std::make_shared<arrow::Decimal128Type>(27, 9);
    test_decimalv2<false>(type_p27s9, test_cases, 64);
    test_decimalv2<true>(type_p27s9, test_cases, 64);

    auto type_p27s25 = std::make_shared<arrow::Decimal128Type>(27, 25);
    test_decimalv2<false>(type_p27s25, test_cases, 128);
    test_decimalv2<true>(type_p27s25, test_cases, 128);
}

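// Builds a FixedSizeBinaryArray by hand; each slot copies at most bytes_width bytes of
// the value (including its trailing '\0' when it fits), so longer values are truncated.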
template <int bytes_width, bool is_nullable = false>
static inline std::shared_ptr<arrow::Array> create_fixed_size_binary_array(int64_t num_elements,
                                                                           const std::string& value,
                                                                           size_t& counter) {
    auto data_buf_size = bytes_width * num_elements;
    auto data_buf_tmp = arrow::AllocateBuffer(data_buf_size);
    std::shared_ptr<arrow::Buffer> data_buf = std::move(data_buf_tmp.ValueOrDie());
    auto* p = data_buf->mutable_data();

    auto null_bitmap_bytes = (num_elements + 7) / 8;
    auto null_bitmap_tmp = arrow::AllocateBuffer(null_bitmap_bytes);
    std::shared_ptr<arrow::Buffer> null_bitmap_buf = std::move(null_bitmap_tmp.ValueOrDie());
    auto* nulls = null_bitmap_buf->mutable_data();

    for (auto i = 0; i < num_elements; ++i) {
        if (is_nullable && i % 2 == 0) {
            arrow::bit_util::ClearBit(nulls, i);
        } else {
            arrow::bit_util::SetBit(nulls, i);
        }
        memcpy(p, value.c_str(), std::min(value.size() + 1, (std::string::size_type)bytes_width));
        p += bytes_width;
    }
    auto type = std::make_shared<arrow::FixedSizeBinaryType>(bytes_width);
    auto array = std::make_shared<arrow::FixedSizeBinaryArray>(type, num_elements, data_buf,
                                                               null_bitmap_buf);
    counter += num_elements;
    return std::static_pointer_cast<arrow::Array>(array);
}

template <int bytes_width, bool is_nullable>
void test_arrow_to_fixed_binary_column(ColumnWithTypeAndName& column, size_t num_elements,
                                       const std::string value, size_t& counter) {
    ASSERT_EQ(column.column->size(), counter);
    auto array =
            create_fixed_size_binary_array<bytes_width, is_nullable>(num_elements, value, counter);
    auto ret = arrow_column_to_doris_column(array.get(), 0, column, num_elements, "UTC");
    ASSERT_EQ(ret.ok(), true);
    ASSERT_EQ(column.column->size(), counter);
    MutableColumnPtr data_column = nullptr;
    vectorized::ColumnNullable* nullable_column = nullptr;
    if (column.column->is_nullable()) {
        nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
                (*std::move(column.column)).mutate().get());
        data_column = nullable_column->get_nested_column_ptr();
    } else {
        data_column = (*std::move(column.column)).mutate();
    }
    auto& string_column = static_cast<ColumnString&>(*data_column);
    auto string_size = std::min((std::string::size_type)bytes_width, value.size());
    for (auto i = 0; i < num_elements; ++i) {
        auto idx = counter - num_elements + i;
        auto s = string_column.get_data_at(idx);
        if (is_nullable) {
            ASSERT_NE(nullable_column, nullptr);
            NullMap& map_data = nullable_column->get_null_map_data();
            if (i % 2 == 0) {
                ASSERT_EQ(map_data[idx], true);
                ASSERT_EQ(s.size, 0);
            } else {
                ASSERT_EQ(map_data[idx], false);
                ASSERT_EQ(value.compare(0, string_size, s.to_string(), 0, string_size), 0);
            }
        } else {
            ASSERT_EQ(value.compare(0, string_size, s.to_string(), 0, string_size), 0);
        }
    }
}

template <int bytes_width, bool is_nullable>
void test_fixed_binary(const std::vector<std::string>& test_cases, size_t num_elements) {
    size_t counter = 0;
    auto pt = arrow_type_to_primitive_type(::arrow::Type::FIXED_SIZE_BINARY);
    ASSERT_NE(pt, INVALID_TYPE);
    DataTypePtr data_type = DataTypeFactory::instance().create_data_type(pt, true);
    MutableColumnPtr data_column = data_type->create_column();
    ColumnWithTypeAndName column(std::move(data_column), data_type, "test_fixed_binary_column");
    for (auto& value : test_cases) {
        test_arrow_to_fixed_binary_column<bytes_width, is_nullable>(column, num_elements, value,
                                                                    counter);
    }
}

TEST(ArrowColumnToDorisColumnTest, test_fixed_binary) {
    std::vector<std::string> test_cases = {"1.2345678", "-12.34567890", "99999999999.99999999",
                                           "-99999999999.99999999"};
    test_fixed_binary<10, false>(test_cases, 64);
    test_fixed_binary<10, true>(test_cases, 64);

    test_fixed_binary<255, false>(test_cases, 64);
    test_fixed_binary<255, true>(test_cases, 64);
}

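// Builds a variable-length Binary/String array by hand: offsets[i + 1] marks where
// element i ends, and null elements append no bytes, so their offsets repeat.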
template <typename ArrowType, bool is_nullable = false>
static inline std::shared_ptr<arrow::Array> create_binary_array(int64_t num_elements,
                                                                const std::string& value,
                                                                size_t& counter) {
    using offset_type = typename ArrowType::offset_type;
    size_t offsets_bytes = (num_elements + 1) * sizeof(offset_type);
    auto offsets_buf_tmp = arrow::AllocateBuffer(offsets_bytes);
    std::shared_ptr<arrow::Buffer> offsets_buf = std::move(offsets_buf_tmp.ValueOrDie());
    auto* offsets = (offset_type*)offsets_buf->mutable_data();
    offsets[0] = 0;

    auto value_size = value.size();
    size_t data_bytes = value_size * num_elements;
    auto data_buf_tmp = arrow::AllocateBuffer(data_bytes);
    std::shared_ptr<arrow::Buffer> data_buf = std::move(data_buf_tmp.ValueOrDie());
    auto* data = data_buf->mutable_data();

    auto null_bitmap_bytes = (num_elements + 7) / 8;
    auto null_bitmap_tmp = arrow::AllocateBuffer(null_bitmap_bytes);
    std::shared_ptr<arrow::Buffer> null_bitmap = std::move(null_bitmap_tmp.ValueOrDie());
    auto nulls = null_bitmap->mutable_data();
    auto data_off = 0;
    for (auto i = 0; i < num_elements; ++i) {
        if (is_nullable && i % 2 == 0) {
            arrow::bit_util::ClearBit(nulls, i);
        } else {
            arrow::bit_util::SetBit(nulls, i);
            memcpy(data + data_off, value.data(), value_size);
            data_off += value_size;
        }
        offsets[i + 1] = data_off;
    }

    using ArrayType = typename arrow::TypeTraits<ArrowType>::ArrayType;
    auto array = std::make_shared<ArrayType>(num_elements, offsets_buf, data_buf, null_bitmap);
    counter += num_elements;
    return std::static_pointer_cast<arrow::Array>(array);
}

template <typename ArrowType, bool is_nullable,
          typename ArrowCppType = typename arrow::TypeTraits<ArrowType>::CType>
void test_arrow_to_binary_column(ColumnWithTypeAndName& column, size_t num_elements,
                                 ArrowCppType value, size_t& counter) {
    ASSERT_EQ(column.column->size(), counter);
    auto array = create_binary_array<ArrowType, is_nullable>(num_elements, value, counter);
    auto ret = arrow_column_to_doris_column(array.get(), 0, column, num_elements, "UTC");
    ASSERT_EQ(ret.ok(), true);
    ASSERT_EQ(column.column->size(), counter);
    MutableColumnPtr data_column = nullptr;
    vectorized::ColumnNullable* nullable_column = nullptr;
    if (column.column->is_nullable()) {
        nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
                (*std::move(column.column)).mutate().get());
        data_column = nullable_column->get_nested_column_ptr();
    } else {
        data_column = (*std::move(column.column)).mutate();
    }
    auto& string_column = static_cast<ColumnString&>(*data_column);
    for (auto i = 0; i < num_elements; ++i) {
        auto idx = counter - num_elements + i;
        auto s = string_column.get_data_at(idx);
        if (is_nullable) {
            ASSERT_NE(nullable_column, nullptr);
            NullMap& map_data = nullable_column->get_null_map_data();
            if (i % 2 == 0) {
                ASSERT_EQ(map_data[idx], true);
                ASSERT_EQ(s.size, 0);
            } else {
                ASSERT_EQ(map_data[idx], false);
                ASSERT_EQ(value, s.to_string());
            }
        } else {
            ASSERT_EQ(value, s.to_string());
        }
    }
}

template <typename ArrowType, bool is_nullable>
void test_binary(const std::vector<std::string>& test_cases, size_t num_elements) {
    size_t counter = 0;
    DataTypePtr data_type = DataTypeFactory::instance().create_data_type(TYPE_VARCHAR, true);
    MutableColumnPtr data_column = data_type->create_column();
    ColumnWithTypeAndName column(std::move(data_column), data_type, "test_binary_column");
    for (auto& value : test_cases) {
        test_arrow_to_binary_column<ArrowType, is_nullable>(column, num_elements, value, counter);
    }
}

TEST(ArrowColumnToDorisColumnTest, test_binary) {
    std::vector<std::string> test_cases = {"1.2345678", "-12.34567890", "99999999999.99999999",
                                           "-99999999999.99999999"};
    test_binary<arrow::StringType, false>(test_cases, 64);
    test_binary<arrow::StringType, true>(test_cases, 64);

    test_binary<arrow::BinaryType, false>(test_cases, 64);
    test_binary<arrow::BinaryType, true>(test_cases, 64);
}
} // namespace doris::vectorized
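All of these tests drive the same entry point. A minimal sketch of that call pattern, assuming an already-built arrow::Array named array (the variable names here are illustrative, not part of the patch):

    // minimal sketch, reusing the factories exercised by the tests above
    DataTypePtr data_type = DataTypeFactory::instance().create_data_type(TYPE_VARCHAR, true);
    MutableColumnPtr col = data_type->create_column();
    ColumnWithTypeAndName column(std::move(col), data_type, "sketch_column");
    // convert rows [0, array->length()) of the arrow array into the doris column
    auto st = arrow_column_to_doris_column(array.get(), 0, column, array->length(), "UTC");
    // st.ok() signals success; nullable columns expose values through ColumnNullable
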
@ -186,6 +186,23 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
        public boolean apply(Expr arg) { return arg instanceof NullLiteral; }
    };

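    // Matches an implicit CastExpr over a varchar SlotRef; explicit casts such as
    // cast(k1 as date) do not match because isImplicitCast() is false for them.
    // Load.replaceVarcharWithCastType uses this to find source slots whose varchar
    // type can be replaced by the cast's target type.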
    public static final com.google.common.base.Predicate<Expr> IS_VARCHAR_SLOT_REF_IMPLICIT_CAST =
            new com.google.common.base.Predicate<Expr>() {
                @Override
                public boolean apply(Expr arg) {
                    // exclude explicit cast. for example: cast(k1 as date)
                    if (!arg.isImplicitCast()) {
                        return false;
                    }
                    List<Expr> children = arg.getChildren();
                    if (children.isEmpty()) {
                        return false;
                    }
                    Expr child = children.get(0);
                    return child instanceof SlotRef && child.getType().isVarchar();
                }
            };

    public void setSelectivity() {
        selectivity = -1;
    }

@ -105,6 +105,21 @@ public class TupleDescriptor {
        return slots;
    }

    /**
     * Get the slot descriptor by slot id.
     *
     * @param slotId slot id
     * @return the matching slot descriptor, or null if no slot has this id
     */
    public SlotDescriptor getSlot(int slotId) {
        for (SlotDescriptor slotDesc : slots) {
            if (slotDesc.getId().asInt() == slotId) {
                return slotDesc;
            }
        }
        return null;
    }

    public void setCardinality(long cardinality) {
        this.cardinality = cardinality;
    }

@ -192,6 +192,10 @@ public abstract class Type {
                || isScalarType(PrimitiveType.STRING);
    }

    public boolean isVarchar() {
        return isScalarType(PrimitiveType.VARCHAR);
    }

    // only metric types have the following constraints:
    // 1. cannot be used as a key column
    // 2. cannot be used in a filter

@ -95,6 +95,7 @@ import org.apache.doris.task.LoadTaskInfo;
import org.apache.doris.task.PushTask;
import org.apache.doris.thrift.TBrokerScanRangeParams;
import org.apache.doris.thrift.TEtlState;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TMiniLoadRequest;
import org.apache.doris.thrift.TNetworkAddress;
import org.apache.doris.thrift.TPriority;
@ -930,7 +931,7 @@ public class Load {
     */
    public static void initColumns(Table tbl, List<ImportColumnDesc> columnExprs,
            Map<String, Pair<String, List<String>>> columnToHadoopFunction) throws UserException {
        initColumns(tbl, columnExprs, columnToHadoopFunction, null, null, null, null, null, false);
        initColumns(tbl, columnExprs, columnToHadoopFunction, null, null, null, null, null, null, false, false);
    }

    /*
@ -940,10 +941,11 @@ public class Load {
    public static void initColumns(Table tbl, LoadTaskInfo.ImportColumnDescs columnDescs,
            Map<String, Pair<String, List<String>>> columnToHadoopFunction,
            Map<String, Expr> exprsByName, Analyzer analyzer, TupleDescriptor srcTupleDesc,
            Map<String, SlotDescriptor> slotDescByName, TBrokerScanRangeParams params) throws UserException {
            Map<String, SlotDescriptor> slotDescByName, TBrokerScanRangeParams params,
            TFileFormatType formatType, boolean useVectorizedLoad) throws UserException {
        rewriteColumns(columnDescs);
        initColumns(tbl, columnDescs.descs, columnToHadoopFunction, exprsByName, analyzer,
                srcTupleDesc, slotDescByName, params, true);
                srcTupleDesc, slotDescByName, params, formatType, useVectorizedLoad, true);
    }

    /*
@ -958,6 +960,7 @@ public class Load {
            Map<String, Pair<String, List<String>>> columnToHadoopFunction,
            Map<String, Expr> exprsByName, Analyzer analyzer, TupleDescriptor srcTupleDesc,
            Map<String, SlotDescriptor> slotDescByName, TBrokerScanRangeParams params,
            TFileFormatType formatType, boolean useVectorizedLoad,
            boolean needInitSlotAndAnalyzeExprs) throws UserException {
        // We make a copy of the columnExprs so that our subsequent changes
        // to the columnExprs will not affect the original columnExprs.
@ -1043,30 +1046,70 @@ public class Load {
        if (!needInitSlotAndAnalyzeExprs) {
            return;
        }

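        // collect the names of all source slots that are referenced by mapping exprs;
        // these must keep varchar type because their real type is ambiguous (see below)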
        Set<String> exprSrcSlotName = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
        for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
            if (importColumnDesc.isColumn()) {
                continue;
            }
            List<SlotRef> slots = Lists.newArrayList();
            importColumnDesc.getExpr().collect(SlotRef.class, slots);
            for (SlotRef slot : slots) {
                String slotColumnName = slot.getColumnName();
                exprSrcSlotName.add(slotColumnName);
            }
        }
        // excludedColumns: columns that must keep varchar type and be excluded from type inference
        Set<String> excludedColumns = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
        // init slot descs and the expr map, and also transform hadoop functions
        for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
            // match the column name case with the real column name
            String columnName = importColumnDesc.getColumnName();
            Column tblColumn = tbl.getColumn(columnName);
            String realColName;
            if (tbl.getColumn(columnName) == null || importColumnDesc.getExpr() == null) {
            if (tblColumn == null || tblColumn.getName() == null || importColumnDesc.getExpr() == null) {
                realColName = columnName;
            } else {
                realColName = tbl.getColumn(columnName).getName();
                realColName = tblColumn.getName();
            }
            if (importColumnDesc.getExpr() != null) {
                Expr expr = transformHadoopFunctionExpr(tbl, realColName, importColumnDesc.getExpr());
                exprsByName.put(realColName, expr);
            } else {
                SlotDescriptor slotDesc = analyzer.getDescTbl().addSlotDescriptor(srcTupleDesc);
                slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
                // only parquet format is supported now
                if (useVectorizedLoad && formatType == TFileFormatType.FORMAT_PARQUET
                        && tblColumn != null) {
                    // in vectorized load,
                    // example: k1 is DATETIME in the source file and INT in the schema, and the mapping expr is k1=year(k1);
                    // we can not determine whether to use the type in the schema or the type inferred from the expr,
                    // so use varchar type as before
                    if (exprSrcSlotName.contains(columnName)) {
                        // columns in expr args should be varchar type
                        slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
                        slotDesc.setColumn(new Column(realColName, PrimitiveType.VARCHAR));
                        excludedColumns.add(realColName);
                        // example: k1, k2 = k1 + 1, where k1 is not nullable and k2 is nullable;
                        // we can not determine whether columns in expr args are nullable or not,
                        // so slots in expr args stay nullable as before
                        slotDesc.setIsNullable(true);
                    } else {
                        // columns from files like parquet files can be parsed as the type in the table schema
                        slotDesc.setType(tblColumn.getType());
                        slotDesc.setColumn(new Column(realColName, tblColumn.getType()));
                        // non-nullable columns are allowed in vectorized load with parquet format
                        slotDesc.setIsNullable(tblColumn.isAllowNull());
                    }
                } else {
                    // other columns default to varchar type
                    slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
                    slotDesc.setColumn(new Column(realColName, PrimitiveType.VARCHAR));
                    // ISSUE A: the src slot should be nullable even if the column is not nullable,
                    // because the src slot is what we read from the file; it does not represent the real column value.
                    // If the column is not nullable, an error will be thrown when filling the dest slot,
                    // which is not nullable.
                    slotDesc.setIsNullable(true);
                }
                slotDesc.setIsMaterialized(true);
                // ISSUE A: src slot should be nullable even if the column is not nullable.
                // because src slot is what we read from file, not represent to real column value.
                // If column is not nullable, error will be thrown when filling the dest slot,
                // which is not nullable.
                slotDesc.setIsNullable(true);
                slotDesc.setColumn(new Column(realColName, PrimitiveType.VARCHAR));
                params.addToSrcSlotIds(slotDesc.getId().asInt());
                slotDescByName.put(realColName, slotDesc);
            }
@ -1085,7 +1128,30 @@ public class Load {
        }

        LOG.debug("slotDescByName: {}, exprsByName: {}, mvDefineExpr: {}", slotDescByName, exprsByName, mvDefineExpr);
        // only parquet format is supported now.
        // use implicit deduction to convert columns that are not in the doris table
        // from varchar to a more appropriate type
        if (useVectorizedLoad && formatType == TFileFormatType.FORMAT_PARQUET) {
            // analyze all exprs
            Map<String, Expr> cloneExprsByName = Maps.newHashMap(exprsByName);
            Map<String, Expr> cloneMvDefineExpr = Maps.newHashMap(mvDefineExpr);
            analyzeAllExprs(tbl, analyzer, cloneExprsByName, cloneMvDefineExpr, slotDescByName, useVectorizedLoad);
            // for columns that only exist in mapping expr args, replace the varchar type with the type inferred from exprs;
            // if there is more than one candidate, choose the last non-varchar type.
            // for example:
            // k1 appears in two mapping expr args, year(k1) and t1=k1; k1's varchar type will be replaced by DATETIME
            replaceVarcharWithCastType(cloneExprsByName, srcTupleDesc, excludedColumns);
        }

        // in vectorized load, reanalyze exprs with the castExpr types;
        // otherwise analyze exprs with varchar type
        analyzeAllExprs(tbl, analyzer, exprsByName, mvDefineExpr, slotDescByName, useVectorizedLoad);
        LOG.debug("after init column, exprMap: {}", exprsByName);
    }

    private static void analyzeAllExprs(Table tbl, Analyzer analyzer, Map<String, Expr> exprsByName,
            Map<String, Expr> mvDefineExpr, Map<String, SlotDescriptor> slotDescByName,
            boolean useVectorizedLoad) throws UserException {
        // analyze all exprs
        for (Map.Entry<String, Expr> entry : exprsByName.entrySet()) {
            ExprSubstitutionMap smap = new ExprSubstitutionMap();
@ -1094,14 +1160,17 @@ public class Load {
            for (SlotRef slot : slots) {
                SlotDescriptor slotDesc = slotDescByName.get(slot.getColumnName());
                if (slotDesc == null) {
                    if (entry.getKey().equalsIgnoreCase(Column.DELETE_SIGN)) {
                        throw new UserException("unknown reference column in DELETE ON clause:" + slot.getColumnName());
                    } else if (entry.getKey().equalsIgnoreCase(Column.SEQUENCE_COL)) {
                        throw new UserException("unknown reference column in ORDER BY clause:" + slot.getColumnName());
                    } else {
                        throw new UserException("unknown reference column, column=" + entry.getKey()
                                + ", reference=" + slot.getColumnName());
                    if (entry.getKey() != null) {
                        if (entry.getKey().equalsIgnoreCase(Column.DELETE_SIGN)) {
                            throw new UserException("unknown reference column in DELETE ON clause:"
                                    + slot.getColumnName());
                        } else if (entry.getKey().equalsIgnoreCase(Column.SEQUENCE_COL)) {
                            throw new UserException("unknown reference column in ORDER BY clause:"
                                    + slot.getColumnName());
                        }
                    }
                    throw new UserException("unknown reference column, column=" + entry.getKey()
                            + ", reference=" + slot.getColumnName());
                }
                smap.getLhs().add(slot);
                smap.getRhs().add(new SlotRef(slotDesc));
@ -1149,7 +1218,50 @@ public class Load {

            exprsByName.put(entry.getKey(), expr);
        }
        LOG.debug("after init column, exprMap: {}", exprsByName);
    }

    /**
     * For columns that only exist in mapping expr args, replace the varchar type with the type
     * inferred from the exprs.
     *
     * @param excludedColumns columns whose type should not be inferred from exprs,
     *                        i.e. columns that exist in both the schema and the expr args.
     */
    private static void replaceVarcharWithCastType(Map<String, Expr> exprsByName, TupleDescriptor srcTupleDesc,
            Set<String> excludedColumns) throws UserException {
        // if there is more than one candidate, choose the last non-varchar type.
        // for example:
        // k1 appears in two mapping expr args, year(k1) and t1=k1; k1's varchar type will be replaced by DATETIME.
        for (Map.Entry<String, Expr> entry : exprsByName.entrySet()) {
            List<CastExpr> casts = Lists.newArrayList();
            // exclude explicit cast. for example: cast(k1 as date)
            entry.getValue().collect(Expr.IS_VARCHAR_SLOT_REF_IMPLICIT_CAST, casts);
            if (casts.isEmpty()) {
                continue;
            }

            for (CastExpr cast : casts) {
                Expr child = cast.getChild(0);
                Type type = cast.getType();
                if (type.isVarchar()) {
                    continue;
                }

                SlotRef slotRef = (SlotRef) child;
                String columnName = slotRef.getColumn().getName();
                if (excludedColumns.contains(columnName)) {
                    continue;
                }

                // replace src slot desc with cast return type
                int slotId = slotRef.getSlotId().asInt();
                SlotDescriptor srcSlotDesc = srcTupleDesc.getSlot(slotId);
                if (srcSlotDesc == null) {
                    throw new UserException("Unknown source slot descriptor. id: " + slotId);
                }
                srcSlotDesc.setType(type);
                srcSlotDesc.setColumn(new Column(columnName, type));
            }
        }
    }

    public static void rewriteColumns(LoadTaskInfo.ImportColumnDescs columnDescs) {

@ -37,6 +37,7 @@ import org.apache.doris.common.DdlException;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.BrokerUtil;
import org.apache.doris.common.util.VectorizedUtil;
import org.apache.doris.load.BrokerFileGroup;
import org.apache.doris.load.Load;
import org.apache.doris.load.loadv2.LoadTask;
@ -268,7 +269,8 @@ public class BrokerScanNode extends LoadScanNode {

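            // pass the file format and the session's vectorized flag so initColumns can
            // decide whether source slot types may be inferred (parquet + vectorized only)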
            Load.initColumns(targetTable, columnDescs,
                    context.fileGroup.getColumnToHadoopFunction(), context.exprMap, analyzer,
                    context.srcTupleDescriptor, context.slotDescByName, context.params);
                    context.srcTupleDescriptor, context.slotDescByName, context.params,
                    formatType(context.fileGroup.getFileFormat(), ""), VectorizedUtil.isVectorized());
        }

    private TScanRangeLocations newLocations(TBrokerScanRangeParams params, BrokerDesc brokerDesc)

@ -27,6 +27,7 @@ import org.apache.doris.analysis.TupleDescriptor;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Table;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.VectorizedUtil;
import org.apache.doris.load.Load;
import org.apache.doris.load.loadv2.LoadTask;
import org.apache.doris.task.LoadTaskInfo;
@ -140,7 +141,8 @@ public class StreamLoadScanNode extends LoadScanNode {
        }

        Load.initColumns(dstTable, columnExprDescs, null /* no hadoop function */,
                exprsByName, analyzer, srcTupleDesc, slotDescByName, params);
                exprsByName, analyzer, srcTupleDesc, slotDescByName, params,
                taskInfo.getFormatType(), VectorizedUtil.isVectorized());

        // analyze where statement
        initAndSetPrecedingFilter(taskInfo.getPrecedingFilter(), this.srcTupleDesc, analyzer);

@ -260,6 +260,8 @@ public class StreamLoadTask implements LoadTaskInfo {
        }
        switch (request.getFileType()) {
            case FILE_STREAM:
                // fall through to case FILE_LOCAL
            case FILE_LOCAL:
                path = request.getPath();
                break;
            default: