[Bug] Duplicate results when reading aggregation table (#5307)

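Previously, we introduced an optimization for aggregation tables: when only one rowset has data and that rowset is non-overlapping, the data can be read directly without merging. However, this logic had a bug: rowsets with overlapping segments were simply ignored by the check, so the direct read path could be chosen even when such a rowset was present, producing duplicate results.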
2021-02-04 09:21:35 +08:00
parent 77b756fb87
commit ea7f61e1c7
2 changed files with 18 additions and 4 deletions
--- a/be/src/olap/reader.cpp
+++ b/be/src/olap/reader.cpp
@ -122,22 +122,27 @@ OLAPStatus Reader::init(const ReaderParams& read_params) {
                     << ", version:" << read_params.version;
        return res;
    }
+
    // When only one rowset has data, and this rowset is nonoverlapping, we can read directly without aggregation
    bool has_delete_rowset = false;
+    bool has_overlapping = false;
    int nonoverlapping_count = 0;
    for (auto rs_reader : _rs_readers) {
        if (rs_reader->rowset()->rowset_meta()->delete_flag()) {
            has_delete_rowset = true;
            break;
        }
-        if (rs_reader->rowset()->rowset_meta()->num_rows() > 0 &&
-            !rs_reader->rowset()->rowset_meta()->is_segments_overlapping()) {
-            if (++nonoverlapping_count > 1) {
+        if (rs_reader->rowset()->rowset_meta()->num_rows() > 0) {
+            if (rs_reader->rowset()->rowset_meta()->is_segments_overlapping()) {
+                // when there are overlapping segments, can not do directly read
+                has_overlapping = true;
+                break;
+            } else if (++nonoverlapping_count > 1) {
                break;
            }
        }
    }
-    if (nonoverlapping_count == 1 && !has_delete_rowset) {
+    if (!has_overlapping && nonoverlapping_count == 1 && !has_delete_rowset) {
        _next_row_func = _tablet->keys_type() == AGG_KEYS ? &Reader::_direct_agg_key_next_row
                                                          : &Reader::_direct_next_row;
    } else {
--- a/be/src/olap/reader.h
+++ b/be/src/olap/reader.h
@ -145,12 +145,21 @@ private:

    void _init_load_bf_columns(const ReaderParams& read_params);

+    // Directly read a row from the rowset and pass it to the upper caller. No need to do aggregation.
+    // This is usually used for DUPLICATE KEY tables
    OLAPStatus _direct_next_row(RowCursor* row_cursor, MemPool* mem_pool, ObjectPool* agg_pool,
                                bool* eof);
+    // Just same as _direct_next_row, but this is only for AGGREGATE KEY tables.
+    // And this is an optimization for AGGR tables.
+    // When only one rowset has data and it is not overlapping, we can read it directly without aggregation.
    OLAPStatus _direct_agg_key_next_row(RowCursor* row_cursor, MemPool* mem_pool,
                                        ObjectPool* agg_pool, bool* eof);
+    // For normal AGGREGATE KEY tables, read data by a merge heap.
    OLAPStatus _agg_key_next_row(RowCursor* row_cursor, MemPool* mem_pool, ObjectPool* agg_pool,
                                 bool* eof);
+    // For UNIQUE KEY tables, read data by a merge heap.
+    // The difference from _agg_key_next_row is that it will read the data from high version to low version,
+    // to minimize the comparison time in merge heap.
    OLAPStatus _unique_key_next_row(RowCursor* row_cursor, MemPool* mem_pool, ObjectPool* agg_pool,
                                    bool* eof);