[fix](spark-load) no need to filter row group when doing spark load (#13116)

1. Fix issue #13115 
2. Change the `get_next_block` method of `GenericReader` to return the number of rows read ("read_rows") explicitly (see the sketch after this list).
    Some columns in the block may not be filled by the reader; if the first column is left unfilled, `block->rows()` cannot report the real row count.
3. Add more checks for broker load test cases.
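For illustration, here is a minimal, self-contained sketch (hypothetical types, not the real Doris `vectorized::Block` or reader classes) of why relying on `block->rows()` is unsafe when the reader leaves the first column empty, and how an explicit `read_rows` out-parameter avoids the problem:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for vectorized::Block: rows() mirrors the behavior
// described above, reporting only the first column's size.
struct Block {
    std::vector<std::vector<int>> columns;
    size_t rows() const { return columns.empty() ? 0 : columns[0].size(); }
};

// Hypothetical reader that fills only the second column (e.g. the first
// column is materialized later) but reports the produced row count.
void get_next_block(Block* block, size_t* read_rows, bool* eof) {
    block->columns.assign(2, {});
    block->columns[1] = {1, 2, 3};  // first column intentionally left empty
    *read_rows = 3;
    *eof = true;
}

int main() {
    Block block;
    size_t read_rows = 0;
    bool eof = false;
    get_next_block(&block, &read_rows, &eof);
    std::cout << "block->rows() = " << block.rows()      // 0: misleading
              << ", read_rows = " << read_rows << "\n";  // 3: correct
}
```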
Author: Mingyu Chen
Date: 2022-10-05 23:00:56 +08:00
Commit: d286aa7bf7 (parent: 90512ebd59)
16 changed files with 48 additions and 20 deletions


@@ -98,7 +98,7 @@ int ArrowReaderWrap::get_column_index(std::string column_name) {
     }
 }
-Status ArrowReaderWrap::get_next_block(vectorized::Block* block, bool* eof) {
+Status ArrowReaderWrap::get_next_block(vectorized::Block* block, size_t* read_row, bool* eof) {
     size_t rows = 0;
     bool tmp_eof = false;
     do {
@@ -107,7 +107,7 @@ Status ArrowReaderWrap::get_next_block(vectorized::Block* block, bool* eof) {
             // We need to make sure the eof is set to true iff block is empty.
             if (tmp_eof) {
                 *eof = (rows == 0);
-                return Status::OK();
+                break;
             }
         }
@@ -129,6 +129,7 @@ Status ArrowReaderWrap::get_next_block(vectorized::Block* block, bool* eof) {
         rows += num_elements;
         _arrow_batch_cur_idx += num_elements;
     } while (!tmp_eof && rows < _state->batch_size());
+    *read_row = rows;
     return Status::OK();
 }
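The `break` matters because the function now has a single exit point that assigns `*read_row`; an early `return` would leave the out-parameter unset on the eof path. Below is a self-contained sketch of the resulting control flow (hypothetical `Reader` with made-up batch counts, not the Doris class):

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical reader modeling the fixed control flow: on source exhaustion
// the loop breaks instead of returning, so *read_row is assigned on every
// exit path and *eof is true iff no rows were produced.
struct Reader {
    int batches_left = 3;         // pretend source holding three 3-row batches
    const size_t batch_size = 5;  // target rows per output block

    void get_next_block(size_t* read_row, bool* eof) {
        size_t rows = 0;
        bool tmp_eof = false;
        do {
            if (batches_left == 0) {
                tmp_eof = true;
                *eof = (rows == 0);  // eof only when the block is empty
                break;               // fall through to assign *read_row
            }
            --batches_left;
            rows += 3;               // accumulate one batch
        } while (!tmp_eof && rows < batch_size);
        *read_row = rows;            // single exit point: always assigned
    }
};

int main() {
    Reader reader;
    size_t read_row = 0;
    bool eof = false;
    while (!eof) {
        reader.get_next_block(&read_row, &eof);
        std::cout << "read_row=" << read_row << " eof=" << eof << "\n";
    }
    // Prints: read_row=6 eof=0, read_row=3 eof=0, read_row=0 eof=1
}
```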