From ea7f61e1c76502360da0b3f20a2f059fa13f2c56 Mon Sep 17 00:00:00 2001
From: Mingyu Chen <morningman.cmy@gmail.com>
Date: Thu, 4 Feb 2021 09:21:35 +0800
Subject: [PATCH] [Bug] Duplicate results when reading aggregation table
 (#5307)

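Previously, we introduced an optimization for the aggregation table:
when only one rowset has data and that rowset is non-overlapping,
the data can be read directly without merging.
But this logic has a bug: a rowset with overlapping segments was simply
not counted, so the direct read could still be chosen while such a rowset
was present. Now any overlapping rowset with data disables the direct read.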
---
 be/src/olap/reader.cpp | 13 +++++++++----
 be/src/olap/reader.h   |  9 +++++++++
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp
index ea7c8db9a8..bfceff9ccb 100644
--- a/be/src/olap/reader.cpp
+++ b/be/src/olap/reader.cpp
@@ -122,22 +122,27 @@ OLAPStatus Reader::init(const ReaderParams& read_params) {
                      << ", version:" << read_params.version;
         return res;
     }
+
     // When only one rowset has data, and this rowset is nonoverlapping, we can read directly without aggregation
     bool has_delete_rowset = false;
+    bool has_overlapping = false;
     int nonoverlapping_count = 0;
     for (auto rs_reader : _rs_readers) {
         if (rs_reader->rowset()->rowset_meta()->delete_flag()) {
             has_delete_rowset = true;
             break;
         }
-        if (rs_reader->rowset()->rowset_meta()->num_rows() > 0 &&
-            !rs_reader->rowset()->rowset_meta()->is_segments_overlapping()) {
-            if (++nonoverlapping_count > 1) {
+        if (rs_reader->rowset()->rowset_meta()->num_rows() > 0) {
+            if (rs_reader->rowset()->rowset_meta()->is_segments_overlapping()) {
+                // when there are overlapping segments, we cannot read directly
+                has_overlapping = true;
+                break;
+            } else if (++nonoverlapping_count > 1) {
                 break;
             }
         }
     }
-    if (nonoverlapping_count == 1 && !has_delete_rowset) {
+    if (!has_overlapping && nonoverlapping_count == 1 && !has_delete_rowset) {
         _next_row_func = _tablet->keys_type() == AGG_KEYS ? &Reader::_direct_agg_key_next_row
                                                           : &Reader::_direct_next_row;
     } else {
diff --git a/be/src/olap/reader.h b/be/src/olap/reader.h
index 9da3006252..b3fff2245c 100644
--- a/be/src/olap/reader.h
+++ b/be/src/olap/reader.h
@@ -145,12 +145,21 @@ private:
 
     void _init_load_bf_columns(const ReaderParams& read_params);
 
+    // Directly read rows from the rowset and pass them to the upper caller. No need to do aggregation.
+    // This is usually used for DUPLICATE KEY tables
     OLAPStatus _direct_next_row(RowCursor* row_cursor, MemPool* mem_pool, ObjectPool* agg_pool,
                                 bool* eof);
+    // Same as _direct_next_row, but only for AGGREGATE KEY tables.
+    // This is an optimization for AGGR tables:
+    // when only one rowset has data and it is non-overlapping, we can read it directly without aggregation.
     OLAPStatus _direct_agg_key_next_row(RowCursor* row_cursor, MemPool* mem_pool,
                                         ObjectPool* agg_pool, bool* eof);
+    // For normal AGGREGATE KEY tables, read data by a merge heap.
     OLAPStatus _agg_key_next_row(RowCursor* row_cursor, MemPool* mem_pool, ObjectPool* agg_pool,
                                  bool* eof);
+    // For UNIQUE KEY tables, read data by a merge heap.
+    // Unlike _agg_key_next_row, it reads the data from high version to low version
+    // to minimize the comparison time in the merge heap.
     OLAPStatus _unique_key_next_row(RowCursor* row_cursor, MemPool* mem_pool, ObjectPool* agg_pool,
                                     bool* eof);