From d969047b50397ca20bc22e8841aa6f4abf2f50e2 Mon Sep 17 00:00:00 2001 From: Pxl Date: Tue, 28 Nov 2023 19:46:00 +0800 Subject: [PATCH] [Refactor](join) refactor of hash join (#27557) Improve the performance under the tpch data set by reconstructing the join related code and the use of hash table Co-authored-by: HappenLee Co-authored-by: BiteTheDDDDt --- be/src/exprs/bitmapfilter_predicate.h | 8 +- be/src/exprs/block_bloom_filter.hpp | 25 +- be/src/exprs/bloom_filter_func.h | 328 +++---- be/src/exprs/hybrid_set.h | 117 ++- be/src/exprs/minmax_predicate.h | 303 +++---- be/src/exprs/runtime_filter.cpp | 111 +-- be/src/exprs/runtime_filter.h | 6 +- be/src/exprs/runtime_filter_slots.h | 33 +- be/src/exprs/runtime_filter_slots_cross.h | 24 +- be/src/olap/bloom_filter_predicate.h | 65 +- be/src/pipeline/exec/hashjoin_build_sink.cpp | 187 ++-- be/src/pipeline/exec/hashjoin_build_sink.h | 16 +- .../pipeline/exec/hashjoin_probe_operator.cpp | 2 + .../pipeline/exec/hashjoin_probe_operator.h | 8 +- .../exec/join_build_sink_operator.cpp | 8 +- .../pipeline/exec/join_build_sink_operator.h | 4 +- .../exec/nested_loop_join_build_operator.h | 8 +- be/src/pipeline/exec/set_sink_operator.cpp | 13 +- be/src/pipeline/exec/set_sink_operator.h | 2 +- be/src/pipeline/exec/set_source_operator.cpp | 6 +- be/src/pipeline/pipeline_x/dependency.h | 7 +- be/src/vec/columns/column.h | 9 +- be/src/vec/columns/column_array.cpp | 11 + be/src/vec/columns/column_array.h | 3 + be/src/vec/columns/column_complex.h | 15 + be/src/vec/columns/column_const.h | 5 + be/src/vec/columns/column_decimal.h | 12 + be/src/vec/columns/column_dictionary.h | 34 +- .../vec/columns/column_fixed_length_object.h | 22 + be/src/vec/columns/column_map.cpp | 11 + be/src/vec/columns/column_map.h | 3 + be/src/vec/columns/column_nothing.h | 5 + be/src/vec/columns/column_nullable.cpp | 10 + be/src/vec/columns/column_nullable.h | 3 + be/src/vec/columns/column_object.cpp | 11 + be/src/vec/columns/column_object.h | 3 + 
be/src/vec/columns/column_string.cpp | 37 + be/src/vec/columns/column_string.h | 3 + be/src/vec/columns/column_struct.cpp | 9 + be/src/vec/columns/column_struct.h | 3 + be/src/vec/columns/column_vector.cpp | 14 + be/src/vec/columns/column_vector.h | 2 + be/src/vec/columns/predicate_column.h | 5 + be/src/vec/common/hash_table/hash_map.h | 344 ++++++++ .../vec/common/hash_table/hash_map_context.h | 129 ++- be/src/vec/common/hash_table/hash_table.h | 1 - .../common/hash_table/hash_table_set_build.h | 9 +- be/src/vec/core/block.cpp | 15 +- be/src/vec/exec/join/join_op.h | 72 +- .../vec/exec/join/process_hash_table_probe.h | 34 +- .../exec/join/process_hash_table_probe_impl.h | 816 +++--------------- be/src/vec/exec/join/vhash_join_node.cpp | 149 ++-- be/src/vec/exec/join/vhash_join_node.h | 187 +--- be/src/vec/exec/join/vjoin_node_base.cpp | 4 +- be/src/vec/exec/join/vjoin_node_base.h | 4 +- .../vec/exec/join/vnested_loop_join_node.cpp | 4 +- be/src/vec/exec/join/vnested_loop_join_node.h | 8 +- be/src/vec/exec/vset_operation_node.cpp | 72 +- be/src/vec/exec/vset_operation_node.h | 5 +- be/src/vec/exprs/vbloom_predicate.cpp | 37 +- .../runtime/shared_hash_table_controller.h | 11 +- be/test/exprs/bloom_filter_predicate_test.cpp | 3 - .../query_p0/join/mark_join/mark_join.out | 19 + .../nereids_syntax_p0/sub_query_alias.groovy | 2 +- .../sub_query_correlated.groovy | 12 +- .../suites/nereids_syntax_p0/view.groovy | 4 +- .../query_p0/join/mark_join/mark_join.groovy | 64 ++ 67 files changed, 1579 insertions(+), 1937 deletions(-) create mode 100644 regression-test/data/query_p0/join/mark_join/mark_join.out create mode 100644 regression-test/suites/query_p0/join/mark_join/mark_join.groovy diff --git a/be/src/exprs/bitmapfilter_predicate.h b/be/src/exprs/bitmapfilter_predicate.h index 743a55c4b6..8df488cf87 100644 --- a/be/src/exprs/bitmapfilter_predicate.h +++ b/be/src/exprs/bitmapfilter_predicate.h @@ -31,7 +31,7 @@ namespace doris { class BitmapFilterFuncBase : public 
FilterFuncBase { public: virtual void insert(const void* data) = 0; - virtual void insert_many(const std::vector bitmaps) = 0; + virtual void insert_many(const std::vector& bitmaps) = 0; virtual bool empty() = 0; virtual Status assign(BitmapValue* bitmap_value) = 0; virtual void light_copy(BitmapFilterFuncBase* other) { _not_in = other->_not_in; } @@ -60,7 +60,7 @@ public: void insert(const void* data) override; - void insert_many(const std::vector bitmaps) override; + void insert_many(const std::vector& bitmaps) override; uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number) override; @@ -75,7 +75,7 @@ public: return Status::OK(); } - void light_copy(BitmapFilterFuncBase* bloomfilter_func) override; + void light_copy(BitmapFilterFuncBase* bitmapfilter_func) override; size_t size() const override { return _bitmap_value->cardinality(); } @@ -108,7 +108,7 @@ void BitmapFilterFunc::insert(const void* data) { } template -void BitmapFilterFunc::insert_many(const std::vector bitmaps) { +void BitmapFilterFunc::insert_many(const std::vector& bitmaps) { if (bitmaps.empty()) { return; } diff --git a/be/src/exprs/block_bloom_filter.hpp b/be/src/exprs/block_bloom_filter.hpp index 654867d6cc..f31d7f7d4c 100644 --- a/be/src/exprs/block_bloom_filter.hpp +++ b/be/src/exprs/block_bloom_filter.hpp @@ -20,6 +20,7 @@ #pragma once +#include "vec/common/string_ref.h" #ifdef __AVX2__ #include @@ -72,14 +73,7 @@ public: // non-equal values will have the same hash value) is 0. void insert(uint32_t hash) noexcept; // Same as above with convenience of hashing the key. - void insert(const Slice& key) noexcept { - if (key.data) { - insert(HashUtil::murmur_hash3_32(key.data, key.size, _hash_seed)); - } - } - - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. 
- void insert_crc32_hash(const Slice& key) noexcept { + void insert(const StringRef& key) noexcept { if (key.data) { insert(HashUtil::crc_hash(key.data, key.size, _hash_seed)); } @@ -123,22 +117,13 @@ public: #endif } // Same as above with convenience of hashing the key. - bool find(const Slice& key) const noexcept { - if (key.data) { - return find(HashUtil::murmur_hash3_32(key.data, key.size, _hash_seed)); - } else { - return false; - } - } - - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - bool find_crc32_hash(const Slice& key) const noexcept { + bool find(const StringRef& key) const noexcept { if (key.data) { return find(HashUtil::crc_hash(key.data, key.size, _hash_seed)); - } else { - return false; } + return false; } + // Computes the logical OR of this filter with 'other' and stores the result in this // filter. // Notes: diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index dfb775cc0a..0323d44315 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -20,6 +20,7 @@ #include "exprs/block_bloom_filter.hpp" #include "exprs/runtime_filter.h" #include "olap/rowset/segment_v2/bloom_filter.h" // IWYU pragma: keep +#include "vec/common/string_ref.h" namespace doris { @@ -53,27 +54,12 @@ public: return _bloom_filter->find(data); } - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - template - bool test_new_hash(T data) const { - if constexpr (std::is_same_v) { - return _bloom_filter->find_crc32_hash(data); - } else { - return _bloom_filter->find(data); - } - } - - void add_bytes(const char* data, size_t len) { _bloom_filter->insert(Slice(data, len)); } - - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. 
- void add_bytes_new_hash(const char* data, size_t len) { - _bloom_filter->insert_crc32_hash(Slice(data, len)); - } + void add_bytes(const char* data, size_t len) { _bloom_filter->insert(StringRef(data, len)); } // test_element/find_element only used on vectorized engine template bool test_element(T element) const { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { return _bloom_filter->find(element); } else { return _bloom_filter->find(HashUtil::fixed_len_to_uint32(element)); @@ -82,7 +68,7 @@ public: template void add_element(T element) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { _bloom_filter->insert(element); } else { _bloom_filter->insert(HashUtil::fixed_len_to_uint32(element)); @@ -96,8 +82,6 @@ private: // Only Used In RuntimeFilter class BloomFilterFuncBase : public FilterFuncBase { public: - BloomFilterFuncBase() : _inited(false) {} - virtual ~BloomFilterFuncBase() = default; Status init(int64_t expect_num, double fpp) { @@ -112,9 +96,8 @@ public: Status init_with_fixed_length() { if (_build_bf_exactly) { return Status::OK(); - } else { - return init_with_fixed_length(_bloom_filter_length); } + return init_with_fixed_length(_bloom_filter_length); } Status init_with_cardinality(const size_t build_bf_cardinality) { @@ -127,10 +110,10 @@ public: // Handle case where ndv == 1 => ceil(log2(m/8)) < 0. int log_filter_size = std::max(0, (int)(std::ceil(std::log(m / 8) / std::log(2)))); - return init_with_fixed_length(((int64_t)1) << log_filter_size); - } else { - return Status::OK(); + _bloom_filter_length = std::min(((int64_t)1) << log_filter_size, _bloom_filter_length); + return init_with_fixed_length(_bloom_filter_length); } + return Status::OK(); } Status init_with_fixed_length(int64_t bloom_filter_length) { @@ -157,36 +140,35 @@ public: // allocate memory again. 
if (_inited) { DCHECK(bloomfilter_func != nullptr); - auto other_func = static_cast(bloomfilter_func); + auto* other_func = static_cast(bloomfilter_func); if (_bloom_filter_alloced != other_func->_bloom_filter_alloced) { - LOG(WARNING) << "bloom filter size not the same: already allocated bytes = " - << _bloom_filter_alloced << ", expected allocated bytes = " - << other_func->_bloom_filter_alloced; - return Status::InvalidArgument("bloom filter size invalid"); + return Status::InvalidArgument( + "bloom filter size not the same: already allocated bytes {}, expected " + "allocated bytes {}", + _bloom_filter_alloced, other_func->_bloom_filter_alloced); } return _bloom_filter->merge(other_func->_bloom_filter.get()); } { std::lock_guard l(_lock); if (!_inited) { - auto other_func = static_cast(bloomfilter_func); + auto* other_func = static_cast(bloomfilter_func); DCHECK(_bloom_filter == nullptr); DCHECK(bloomfilter_func != nullptr); _bloom_filter = bloomfilter_func->_bloom_filter; _bloom_filter_alloced = other_func->_bloom_filter_alloced; _inited = true; return Status::OK(); - } else { - DCHECK(bloomfilter_func != nullptr); - auto other_func = static_cast(bloomfilter_func); - if (_bloom_filter_alloced != other_func->_bloom_filter_alloced) { - LOG(WARNING) << "bloom filter size not the same: already allocated bytes = " - << _bloom_filter_alloced << ", expected allocated bytes = " - << other_func->_bloom_filter_alloced; - return Status::InvalidArgument("bloom filter size invalid"); - } - return _bloom_filter->merge(other_func->_bloom_filter.get()); } + DCHECK(bloomfilter_func != nullptr); + auto* other_func = static_cast(bloomfilter_func); + if (_bloom_filter_alloced != other_func->_bloom_filter_alloced) { + return Status::InvalidArgument( + "bloom filter size not the same: already allocated bytes {}, expected " + "allocated bytes {}", + _bloom_filter_alloced, other_func->_bloom_filter_alloced); + } + return _bloom_filter->merge(other_func->_bloom_filter.get()); } } @@ 
-208,7 +190,7 @@ public: size_t get_size() const { return _bloom_filter ? _bloom_filter->size() : 0; } void light_copy(BloomFilterFuncBase* bloomfilter_func) { - auto other_func = static_cast(bloomfilter_func); + auto* other_func = static_cast(bloomfilter_func); _bloom_filter_alloced = other_func->_bloom_filter_alloced; _bloom_filter = other_func->_bloom_filter; _inited = other_func->_inited; @@ -216,62 +198,47 @@ public: virtual void insert(const void* data) = 0; - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - virtual void insert_crc32_hash(const void* data) = 0; - virtual bool find(const void* data) const = 0; - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - virtual bool find_crc32_hash(const void* data) const = 0; - virtual bool find_olap_engine(const void* data) const = 0; virtual bool find_uint32_t(uint32_t data) const = 0; - virtual void insert_fixed_len(const char* data, const int* offsets, int number) = 0; + virtual void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) = 0; - virtual void insert_fixed_len(const char* data) = 0; + virtual void find_fixed_len(const vectorized::ColumnPtr& column, uint8_t* results) = 0; virtual uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number, bool is_parse_column) = 0; - virtual void find_fixed_len(const char* data, const uint8* nullmap, int number, - uint8* results) = 0; - protected: // bloom filter size int32_t _bloom_filter_alloced; std::shared_ptr _bloom_filter; - bool _inited; + bool _inited {}; std::mutex _lock; int64_t _bloom_filter_length; bool _build_bf_exactly = false; }; -template -struct CommonFindOp { - // test_batch/find_batch/find_batch_olap_engine only used on vectorized engine - void insert_batch(BloomFilterAdaptor& bloom_filter, const char* data, const int* offsets, - int number) const { - for (int i = 0; i 
< number; i++) { - bloom_filter.add_element(*((T*)data + offsets[i])); - } - } +struct BaseOp { + virtual ~BaseOp() = default; - void insert_single(BloomFilterAdaptor& bloom_filter, const char* data) const { - bloom_filter.add_element(*((T*)data)); - } + virtual bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, + const void* data) const = 0; - uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, - const uint8* nullmap, uint16_t* offsets, int number, - const bool is_parse_column) const { + uint16_t find_batch_olap_engine_with_element_size(const BloomFilterAdaptor& bloom_filter, + const char* data, const uint8* nullmap, + uint16_t* offsets, int number, + const bool is_parse_column, + size_t element_size) const { uint16_t new_size = 0; if (is_parse_column) { if (nullmap == nullptr) { for (int i = 0; i < number; i++) { uint16_t idx = offsets[i]; - if (!bloom_filter.test_element(*((T*)data + idx))) { + if (!find_olap_engine(bloom_filter, data + element_size * idx)) { continue; } offsets[new_size++] = idx; @@ -282,7 +249,7 @@ struct CommonFindOp { if (nullmap[idx]) { continue; } - if (!bloom_filter.test_element(*((T*)data + idx))) { + if (!find_olap_engine(bloom_filter, data + element_size * idx)) { continue; } offsets[new_size++] = idx; @@ -291,7 +258,7 @@ struct CommonFindOp { } else { if (nullmap == nullptr) { for (int i = 0; i < number; i++) { - if (!bloom_filter.test_element(*((T*)data + i))) { + if (!find_olap_engine(bloom_filter, data + element_size * i)) { continue; } offsets[new_size++] = i; @@ -301,7 +268,7 @@ struct CommonFindOp { if (nullmap[i]) { continue; } - if (!bloom_filter.test_element(*((T*)data + i))) { + if (!find_olap_engine(bloom_filter, data + element_size * i)) { continue; } offsets[new_size++] = i; @@ -310,28 +277,71 @@ struct CommonFindOp { } return new_size; } +}; - void find_batch(const BloomFilterAdaptor& bloom_filter, const char* data, const uint8* nullmap, - int number, uint8* results) const 
{ - for (int i = 0; i < number; i++) { - results[i] = false; - if (nullmap != nullptr && nullmap[i]) { - continue; +template +struct CommonFindOp : BaseOp { + uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, + const uint8* nullmap, uint16_t* offsets, int number, + const bool is_parse_column) { + return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets, + number, is_parse_column, sizeof(T)); + } + + void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, + size_t start) const { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = nullable->get_nested_column(); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + const T* data = (T*)col.get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + bloom_filter.add_element(*(data + i)); + } } - if (!bloom_filter.test_element(*((T*)data + i))) { - continue; + } else { + const T* data = (T*)column->get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + bloom_filter.add_element(*(data + i)); + } + } + } + + void find_batch(const BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, + uint8_t* results) const { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + const T* data = (T*)nullable->get_nested_column().get_raw_data().data; + for (size_t i = 0; i < column->size(); i++) { + if (!nullmap[i]) { + results[i] = bloom_filter.test_element(data[i]); + } else { + results[i] = false; + } + } + } else { + const T* data = (T*)column->get_raw_data().data; + for (size_t i = 0; i < column->size(); i++) { + results[i] = bloom_filter.test_element(data[i]); } - results[i] = true; } } void insert(BloomFilterAdaptor& bloom_filter, const void* data) const { - 
bloom_filter.add_bytes((char*)data, sizeof(T)); + bloom_filter.add_element(((T*)data)[0]); } bool find(const BloomFilterAdaptor& bloom_filter, const void* data) const { - return bloom_filter.test(Slice((char*)data, sizeof(T))); + return bloom_filter.test_element(((T*)data)[0]); } - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { return find(bloom_filter, data); } bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) const { @@ -339,64 +349,82 @@ struct CommonFindOp { } }; -struct StringFindOp { - void insert_batch(BloomFilterAdaptor& bloom_filter, const char* data, const int* offsets, - int number) const { - LOG(FATAL) << "StringFindOp does not support insert_batch"; - } - - void insert_single(BloomFilterAdaptor& bloom_filter, const char* data) const { - LOG(FATAL) << "StringFindOp does not support insert_single"; - } - +struct StringFindOp : public BaseOp { uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, const uint8* nullmap, uint16_t* offsets, int number, - const bool is_parse_column) const { - LOG(FATAL) << "StringFindOp does not support find_batch_olap_engine"; - return 0; + const bool is_parse_column) { + return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets, + number, is_parse_column, sizeof(StringRef)); } - void find_batch(const BloomFilterAdaptor& bloom_filter, const char* data, const uint8* nullmap, - int number, uint8* results) const { - LOG(FATAL) << "StringFindOp does not support find_batch"; + static void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, + size_t start) { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + 
.get_data(); + + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + bloom_filter.add_element(col.get_data_at(i)); + } + } + } else { + const auto& col = assert_cast(column.get()); + for (size_t i = start; i < column->size(); i++) { + bloom_filter.add_element(col->get_data_at(i)); + } + } } - void insert(BloomFilterAdaptor& bloom_filter, const void* data) const { + static void find_batch(const BloomFilterAdaptor& bloom_filter, + const vectorized::ColumnPtr& column, uint8_t* results) { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + for (size_t i = 0; i < column->size(); i++) { + if (!nullmap[i]) { + results[i] = bloom_filter.test_element(col.get_data_at(i)); + } else { + results[i] = false; + } + } + } else { + const auto& col = assert_cast(column.get()); + for (size_t i = 0; i < column->size(); i++) { + results[i] = bloom_filter.test_element(col->get_data_at(i)); + } + } + } + + static void insert(BloomFilterAdaptor& bloom_filter, const void* data) { const auto* value = reinterpret_cast(data); if (value) { bloom_filter.add_bytes(value->data, value->size); } } - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. 
- void insert_crc32_hash(BloomFilterAdaptor& bloom_filter, const void* data) const { - const auto* value = reinterpret_cast(data); - if (value) { - bloom_filter.add_bytes_new_hash(value->data, value->size); - } - } - - bool find(const BloomFilterAdaptor& bloom_filter, const void* data) const { + static bool find(const BloomFilterAdaptor& bloom_filter, const void* data) { const auto* value = reinterpret_cast(data); if (value == nullptr) { return false; } - return bloom_filter.test(Slice(value->data, value->size)); + return bloom_filter.test(*value); } - //This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - bool find_crc32_hash(const BloomFilterAdaptor& bloom_filter, const void* data) const { - const auto* value = reinterpret_cast(data); - if (value == nullptr) { - return false; - } - return bloom_filter.test_new_hash(Slice(value->data, value->size)); - } - - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { return StringFindOp::find(bloom_filter, data); } - bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) const { + + static bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) { return bloom_filter.test(data); } }; @@ -404,7 +432,8 @@ struct StringFindOp { // We do not need to judge whether data is empty, because null will not appear // when filer used by the storage engine struct FixedStringFindOp : public StringFindOp { - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* input_data) const { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, + const void* input_data) const override { const auto* value = reinterpret_cast(input_data); int64_t size = value->size; const char* data = value->data; @@ -412,7 +441,7 @@ struct FixedStringFindOp : public StringFindOp { while (size > 0 && data[size - 1] == '\0') { size--; } 
- return bloom_filter.test(Slice(value->data, size)); + return bloom_filter.test(StringRef(value->data, size)); } }; @@ -449,37 +478,13 @@ public: dummy.insert(*_bloom_filter, data); } - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - void insert_crc32_hash(const void* data) override { - if constexpr (std::is_same_v::FindOp, StringFindOp> || - std::is_same_v::FindOp, - FixedStringFindOp>) { - DCHECK(_bloom_filter != nullptr); - dummy.insert_crc32_hash(*_bloom_filter, data); - } else { - insert(data); - } - } - - void insert_fixed_len(const char* data, const int* offsets, int number) override { + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { DCHECK(_bloom_filter != nullptr); - dummy.insert_batch(*_bloom_filter, data, offsets, number); + dummy.insert_batch(*_bloom_filter, column, start); } - void insert_fixed_len(const char* data) override { - DCHECK(_bloom_filter != nullptr); - dummy.insert_single(*_bloom_filter, data); - } - - uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, - int number, const bool is_parse_column) override { - return dummy.find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number, - is_parse_column); - } - - void find_fixed_len(const char* data, const uint8* nullmap, int number, - uint8* results) override { - dummy.find_batch(*_bloom_filter, data, nullmap, number, results); + void find_fixed_len(const vectorized::ColumnPtr& column, uint8_t* results) override { + dummy.find_batch(*_bloom_filter, column, results); } bool find(const void* data) const override { @@ -487,23 +492,18 @@ public: return dummy.find(*_bloom_filter, data); } - // This function is only to be used if the be_exec_version may be less than 2. If updated, please delete it. 
- bool find_crc32_hash(const void* data) const override { - if constexpr (std::is_same_v::FindOp, StringFindOp> || - std::is_same_v::FindOp, - FixedStringFindOp>) { - DCHECK(_bloom_filter != nullptr); - return dummy.find_crc32_hash(*_bloom_filter, data); - } - return find(data); - } - bool find_olap_engine(const void* data) const override { return dummy.find_olap_engine(*_bloom_filter, data); } bool find_uint32_t(uint32_t data) const override { return dummy.find(*_bloom_filter, data); } + uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, + int number, bool is_parse_column) override { + return dummy.find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number, + is_parse_column); + } + private: typename BloomFilterTypeTraits::FindOp dummy; }; diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h index 6a90bdd47c..9151dc7d3b 100644 --- a/be/src/exprs/hybrid_set.h +++ b/be/src/exprs/hybrid_set.h @@ -29,7 +29,7 @@ namespace doris { -#define FIXED_CONTAINER_MAX_SIZE 8 +constexpr int FIXED_CONTAINER_MAX_SIZE = 8; /** * Fix Container can use simd to improve performance. 1 <= N <= 8 can be improved performance by test. FIXED_CONTAINER_MAX_SIZE = 8. 
@@ -44,7 +44,7 @@ public: class Iterator; - FixedContainer() : _size(0) { static_assert(N >= 0 && N <= FIXED_CONTAINER_MAX_SIZE); } + FixedContainer() { static_assert(N >= 0 && N <= FIXED_CONTAINER_MAX_SIZE); } ~FixedContainer() = default; @@ -141,7 +141,7 @@ public: private: std::array _data; - size_t _size; + size_t _size {}; }; /** @@ -183,7 +183,7 @@ public: // use in vectorize execute engine virtual void insert(void* data, size_t) = 0; - virtual void insert_fixed_len(const char* data, const int* offsets, int number) = 0; + virtual void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) = 0; virtual void insert(HybridSetBase* set) { HybridSetBase::IteratorBase* iter = set->begin(); @@ -199,11 +199,6 @@ public: // use in vectorize execute engine virtual bool find(const void* data, size_t) const = 0; - virtual void find_fixed_len(const char* __restrict data, const uint8* __restrict null_map, - int number, uint8* __restrict results) { - LOG(FATAL) << "HybridSetBase not support find_fixed_len"; - } - virtual void find_batch(const doris::vectorized::IColumn& column, size_t rows, doris::vectorized::ColumnUInt8::Container& results) { LOG(FATAL) << "HybridSetBase not support find_batch"; @@ -275,21 +270,29 @@ public: if (data == nullptr) { return; } - - if constexpr (sizeof(ElementType) >= 16) { - // for large int, it will core dump with no memcpy - ElementType value; - memcpy(&value, data, sizeof(ElementType)); - _set.insert(value); - } else { - _set.insert(*reinterpret_cast(data)); - } + _set.insert(*reinterpret_cast(data)); } - void insert(void* data, size_t) override { insert(data); } + void insert(void* data, size_t /*unused*/) override { insert(data); } - void insert_fixed_len(const char* data, const int* offsets, int number) override { - for (int i = 0; i < number; i++) { - insert((void*)((ElementType*)data + offsets[i])); + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { + if (column->is_nullable()) { + const 
auto* nullable = assert_cast(column.get()); + const auto& col = nullable->get_nested_column(); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + const ElementType* data = (ElementType*)col.get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + _set.insert(*(data + i)); + } + } + } else { + const ElementType* data = (ElementType*)column->get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + _set.insert(*(data + i)); + } } } @@ -303,21 +306,7 @@ public: return _set.find(*reinterpret_cast(data)); } - bool find(const void* data, size_t) const override { return find(data); } - - void find_fixed_len(const char* __restrict data, const uint8* __restrict null_map, int number, - uint8* __restrict results) override { - ElementType* value = (ElementType*)data; - if (null_map == nullptr) { - for (int i = 0; i < number; i++) { - results[i] = _set.find(value[i]); - } - } else { - for (int i = 0; i < number; i++) { - results[i] = _set.find(value[i]) & !null_map[i]; - } - } - } + bool find(const void* data, size_t /*unused*/) const override { return find(data); } void find_batch(const doris::vectorized::IColumn& column, size_t rows, doris::vectorized::ColumnUInt8::Container& results) override { @@ -414,8 +403,26 @@ public: _set.insert(str_value); } - void insert_fixed_len(const char* data, const int* offsets, int number) override { - LOG(FATAL) << "string set not support insert_fixed_len"; + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + _set.insert(col.get_data_at(i).to_string()); + } + } + } else { + const auto& col = 
assert_cast(column.get()); + for (size_t i = start; i < column->size(); i++) { + _set.insert(col->get_data_at(i).to_string()); + } + } } int size() override { return _set.size(); } @@ -425,7 +432,7 @@ public: return false; } - auto* value = reinterpret_cast(data); + const auto* value = reinterpret_cast(data); std::string str_value(const_cast(value->data), value->size); return _set.find(str_value); } @@ -461,7 +468,7 @@ public: void _find_batch(const doris::vectorized::IColumn& column, size_t rows, const doris::vectorized::NullMap* null_map, doris::vectorized::ColumnUInt8::Container& results) { - auto& col = assert_cast(column); + const auto& col = assert_cast(column); const uint8_t* __restrict null_map_data; if constexpr (is_nullable) { null_map_data = null_map->data(); @@ -538,8 +545,26 @@ public: _set.insert(sv); } - void insert_fixed_len(const char* data, const int* offsets, int number) override { - LOG(FATAL) << "string set not support insert_fixed_len"; + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + _set.insert(col.get_data_at(i)); + } + } + } else { + const auto& col = assert_cast(column.get()); + for (size_t i = start; i < column->size(); i++) { + _set.insert(col->get_data_at(i)); + } + } } int size() override { return _set.size(); } @@ -549,7 +574,7 @@ public: return false; } - auto* value = reinterpret_cast(data); + const auto* value = reinterpret_cast(data); return _set.find(*value); } @@ -588,10 +613,10 @@ public: void _find_batch(const doris::vectorized::IColumn& column, size_t rows, const doris::vectorized::NullMap* null_map, doris::vectorized::ColumnUInt8::Container& results) { - auto& col = 
assert_cast(column); + const auto& col = assert_cast(column); const uint32_t* __restrict offset = col.get_offsets().data(); const uint8_t* __restrict data = col.get_chars().data(); - uint8_t* __restrict cursor = const_cast(data); + auto* __restrict cursor = const_cast(data); const uint8_t* __restrict null_map_data; if constexpr (is_nullable) { null_map_data = null_map->data(); diff --git a/be/src/exprs/minmax_predicate.h b/be/src/exprs/minmax_predicate.h index cdf898292f..fcf2ef44a1 100644 --- a/be/src/exprs/minmax_predicate.h +++ b/be/src/exprs/minmax_predicate.h @@ -17,17 +17,22 @@ #pragma once +#include + #include "common/object_pool.h" #include "runtime/type_limit.h" +#include "vec/columns/column.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/common/assert_cast.h" namespace doris { // only used in Runtime Filter class MinMaxFuncBase { public: virtual void insert(const void* data) = 0; - virtual void insert_fixed_len(const char* data, const int* offsets, int number) = 0; + virtual void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) = 0; virtual bool find(void* data) = 0; - virtual bool is_empty() = 0; virtual void* get_max() = 0; virtual void* get_min() = 0; // assign minmax data @@ -37,7 +42,7 @@ public: virtual ~MinMaxFuncBase() = default; }; -template +template class MinMaxNumFunc : public MinMaxFuncBase { public: MinMaxNumFunc() = default; @@ -50,32 +55,78 @@ public: T val_data = *reinterpret_cast(data); - if (_empty) { - _min = val_data; - _max = val_data; - _empty = false; - return; + if constexpr (NeedMin) { + if (val_data < _min) { + _min = val_data; + } } - if (val_data < _min) { - _min = val_data; - } else if (val_data > _max) { - _max = val_data; + + if constexpr (NeedMax) { + if (val_data > _max) { + _max = val_data; + } } } - void insert_fixed_len(const char* data, const int* offsets, int number) override { - if (!number) { + void insert_fixed_len(const vectorized::ColumnPtr& 
column, size_t start) override { + if (column->empty()) { return; } - if (_empty) { - _min = *((T*)data + offsets[0]); - _max = *((T*)data + offsets[0]); + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = nullable->get_nested_column(); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + + if constexpr (std::is_same_v) { + const auto& column_string = assert_cast(col); + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + if constexpr (NeedMin) { + _min = std::min(_min, column_string.get_data_at(i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, column_string.get_data_at(i)); + } + } + } + } else { + const T* data = (T*)col.get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + if constexpr (NeedMin) { + _min = std::min(_min, *(data + i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, *(data + i)); + } + } + } + } + } else { + if constexpr (std::is_same_v) { + const auto& column_string = assert_cast(*column); + for (size_t i = start; i < column->size(); i++) { + if constexpr (NeedMin) { + _min = std::min(_min, column_string.get_data_at(i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, column_string.get_data_at(i)); + } + } + } else { + const T* data = (T*)column->get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if constexpr (NeedMin) { + _min = std::min(_min, *(data + i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, *(data + i)); + } + } + } } - for (int i = _empty; i < number; i++) { - _min = std::min(_min, *((T*)data + offsets[i])); - _max = std::max(_max, *((T*)data + offsets[i])); - } - _empty = false; } bool find(void* data) override { @@ -84,40 +135,55 @@ public: } T val_data = *reinterpret_cast(data); - return val_data >= _min && val_data <= _max; + if constexpr (NeedMin) { + if (val_data < _min) { + return false; + } + } + if constexpr (NeedMax) { 
+ if (val_data > _max) { + return false; + } + } + return true; } Status merge(MinMaxFuncBase* minmax_func, ObjectPool* pool) override { if constexpr (std::is_same_v) { - MinMaxNumFunc* other_minmax = static_cast*>(minmax_func); - - if (other_minmax->_min < _min) { - auto& other_min = other_minmax->_min; - auto str = pool->add(new std::string(other_min.data, other_min.size)); - _min.data = str->data(); - _min.size = str->length(); + auto* other_minmax = static_cast*>(minmax_func); + if constexpr (NeedMin) { + if (other_minmax->_min < _min) { + auto& other_min = other_minmax->_min; + auto* str = pool->add(new std::string(other_min.data, other_min.size)); + _min.data = str->data(); + _min.size = str->length(); + } } - if (other_minmax->_max > _max) { - auto& other_max = other_minmax->_max; - auto str = pool->add(new std::string(other_max.data, other_max.size)); - _max.data = str->data(); - _max.size = str->length(); + if constexpr (NeedMax) { + if (other_minmax->_max > _max) { + auto& other_max = other_minmax->_max; + auto* str = pool->add(new std::string(other_max.data, other_max.size)); + _max.data = str->data(); + _max.size = str->length(); + } } } else { - MinMaxNumFunc* other_minmax = static_cast*>(minmax_func); - if (other_minmax->_min < _min) { - _min = other_minmax->_min; + auto* other_minmax = static_cast*>(minmax_func); + if constexpr (NeedMin) { + if (other_minmax->_min < _min) { + _min = other_minmax->_min; + } } - if (other_minmax->_max > _max) { - _max = other_minmax->_max; + if constexpr (NeedMax) { + if (other_minmax->_max > _max) { + _max = other_minmax->_max; + } } } return Status::OK(); } - bool is_empty() override { return _empty; } - void* get_max() override { return &_max; } void* get_min() override { return &_min; } @@ -131,161 +197,12 @@ public: protected: T _max = type_limit::min(); T _min = type_limit::max(); - // we use _empty to avoid compare twice - bool _empty = true; }; template -class MinNumFunc : public MinMaxNumFunc { -public: - 
MinNumFunc() = default; - ~MinNumFunc() override = default; - - void insert(const void* data) override { - if (data == nullptr) { - return; - } - - T val_data = *reinterpret_cast(data); - - if (this->_empty) { - this->_min = val_data; - this->_empty = false; - return; - } - if (val_data < this->_min) { - this->_min = val_data; - } - } - - void insert_fixed_len(const char* data, const int* offsets, int number) override { - if (!number) { - return; - } - if (this->_empty) { - this->_min = *((T*)data + offsets[0]); - } - for (int i = this->_empty; i < number; i++) { - this->_min = std::min(this->_min, *((T*)data + offsets[i])); - } - this->_empty = false; - } - - bool find(void* data) override { - if (data == nullptr) { - return false; - } - - T val_data = *reinterpret_cast(data); - return val_data >= this->_min; - } - - Status merge(MinMaxFuncBase* minmax_func, ObjectPool* pool) override { - if constexpr (std::is_same_v) { - MinNumFunc* other_minmax = assert_cast*>(minmax_func); - if (other_minmax->_min < this->_min) { - auto& other_min = other_minmax->_min; - auto str = pool->add(new std::string(other_min.data, other_min.size)); - this->_min.data = str->data(); - this->_min.size = str->length(); - } - } else { - MinNumFunc* other_minmax = assert_cast*>(minmax_func); - if (other_minmax->_min < this->_min) { - this->_min = other_minmax->_min; - } - } - - return Status::OK(); - } - - //min filter the max is useless, so return nullptr directly - void* get_max() override { - DCHECK(false); - return nullptr; - } - - Status assign(void* min_data, void* max_data) override { - this->_min = *(T*)min_data; - return Status::OK(); - } -}; +using MinNumFunc = MinMaxNumFunc; template -class MaxNumFunc : public MinMaxNumFunc { -public: - MaxNumFunc() = default; - ~MaxNumFunc() override = default; - - void insert(const void* data) override { - if (data == nullptr) { - return; - } - - T val_data = *reinterpret_cast(data); - - if (this->_empty) { - this->_max = val_data; - 
this->_empty = false; - return; - } - if (val_data > this->_max) { - this->_max = val_data; - } - } - - void insert_fixed_len(const char* data, const int* offsets, int number) override { - if (!number) { - return; - } - if (this->_empty) { - this->_max = *((T*)data + offsets[0]); - } - for (int i = this->_empty; i < number; i++) { - this->_max = std::max(this->_max, *((T*)data + offsets[i])); - } - this->_empty = false; - } - - bool find(void* data) override { - if (data == nullptr) { - return false; - } - - T val_data = *reinterpret_cast(data); - return val_data <= this->_max; - } - - Status merge(MinMaxFuncBase* minmax_func, ObjectPool* pool) override { - if constexpr (std::is_same_v) { - MinMaxNumFunc* other_minmax = assert_cast*>(minmax_func); - - if (other_minmax->_max > this->_max) { - auto& other_max = other_minmax->_max; - auto str = pool->add(new std::string(other_max.data, other_max.size)); - this->_max.data = str->data(); - this->_max.size = str->length(); - } - } else { - MinMaxNumFunc* other_minmax = assert_cast*>(minmax_func); - if (other_minmax->_max > this->_max) { - this->_max = other_minmax->_max; - } - } - - return Status::OK(); - } - - //max filter the min is useless, so return nullptr directly - void* get_min() override { - DCHECK(false); - return nullptr; - } - - Status assign(void* min_data, void* max_data) override { - this->_max = *(T*)max_data; - return Status::OK(); - } -}; +using MaxNumFunc = MinMaxNumFunc; } // namespace doris diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index aac153b08b..31bf202598 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -52,6 +52,7 @@ #include "util/string_parser.hpp" #include "vec/columns/column.h" #include "vec/columns/column_complex.h" +#include "vec/columns/column_nullable.h" #include "vec/common/assert_cast.h" #include "vec/core/wide_integer.h" #include "vec/core/wide_integer_to_string.h" @@ -286,10 +287,7 @@ public: _pool(pool), 
_column_return_type(params->column_return_type), _filter_type(params->filter_type), - _filter_id(params->filter_id), - _use_batch( - IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)), - _use_new_hash(_be_exec_version >= 2) {} + _filter_id(params->filter_id) {} // for a 'tmp' runtime predicate wrapper // only could called assign method or as a param for merge RuntimePredicateWrapper(RuntimeState* state, ObjectPool* pool, PrimitiveType column_type, @@ -299,10 +297,7 @@ public: _pool(pool), _column_return_type(column_type), _filter_type(type), - _filter_id(filter_id), - _use_batch( - IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)), - _use_new_hash(_be_exec_version >= 2) {} + _filter_id(filter_id) {} RuntimePredicateWrapper(QueryContext* query_ctx, ObjectPool* pool, const RuntimeFilterParams* params) @@ -311,10 +306,7 @@ public: _pool(pool), _column_return_type(params->column_return_type), _filter_type(params->filter_type), - _filter_id(params->filter_id), - _use_batch( - IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)), - _use_new_hash(_be_exec_version >= 2) {} + _filter_id(params->filter_id) {} // for a 'tmp' runtime predicate wrapper // only could called assign method or as a param for merge RuntimePredicateWrapper(QueryContext* query_ctx, ObjectPool* pool, PrimitiveType column_type, @@ -324,10 +316,7 @@ public: _pool(pool), _column_return_type(column_type), _filter_type(type), - _filter_id(filter_id), - _use_batch( - IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)), - _use_new_hash(_be_exec_version >= 2) {} + _filter_id(filter_id) {} // init runtime filter wrapper // alloc memory to init runtime filter function Status init(const RuntimeFilterParams* params) { @@ -389,23 +378,10 @@ public: void insert_to_bloom_filter(BloomFilterFuncBase* bloom_filter) const { if (_context.hybrid_set->size() > 0) { - auto it = _context.hybrid_set->begin(); - - if 
(_use_batch) { - while (it->has_next()) { - bloom_filter->insert_fixed_len((char*)it->get_value()); - it->next(); - } - } else { - while (it->has_next()) { - if (_use_new_hash) { - bloom_filter->insert_crc32_hash(it->get_value()); - } else { - bloom_filter->insert(it->get_value()); - } - - it->next(); - } + auto* it = _context.hybrid_set->begin(); + while (it->has_next()) { + bloom_filter->insert(it->get_value()); + it->next(); } } } @@ -428,20 +404,12 @@ public: break; } case RuntimeFilterType::BLOOM_FILTER: { - if (_use_new_hash) { - _context.bloom_filter_func->insert_crc32_hash(data); - } else { - _context.bloom_filter_func->insert(data); - } + _context.bloom_filter_func->insert(data); break; } case RuntimeFilterType::IN_OR_BLOOM_FILTER: { if (_is_bloomfilter) { - if (_use_new_hash) { - _context.bloom_filter_func->insert_crc32_hash(data); - } else { - _context.bloom_filter_func->insert(data); - } + _context.bloom_filter_func->insert(data); } else { _context.hybrid_set->insert(data); } @@ -457,30 +425,30 @@ public: } } - void insert_fixed_len(const char* data, const int* offsets, int number) { + void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) { switch (_filter_type) { case RuntimeFilterType::IN_FILTER: { if (_is_ignored_in_filter) { break; } - _context.hybrid_set->insert_fixed_len(data, offsets, number); + _context.hybrid_set->insert_fixed_len(column, start); break; } case RuntimeFilterType::MIN_FILTER: case RuntimeFilterType::MAX_FILTER: case RuntimeFilterType::MINMAX_FILTER: { - _context.minmax_func->insert_fixed_len(data, offsets, number); + _context.minmax_func->insert_fixed_len(column, start); break; } case RuntimeFilterType::BLOOM_FILTER: { - _context.bloom_filter_func->insert_fixed_len(data, offsets, number); + _context.bloom_filter_func->insert_fixed_len(column, start); break; } case RuntimeFilterType::IN_OR_BLOOM_FILTER: { if (_is_bloomfilter) { - _context.bloom_filter_func->insert_fixed_len(data, offsets, number); + 
_context.bloom_filter_func->insert_fixed_len(column, start); } else { - _context.hybrid_set->insert_fixed_len(data, offsets, number); + _context.hybrid_set->insert_fixed_len(column, start); } break; } @@ -508,24 +476,33 @@ public: } } - void insert_batch(const vectorized::ColumnPtr column, const std::vector& rows) { + void insert_batch(const vectorized::ColumnPtr& column, size_t start) { if (get_real_type() == RuntimeFilterType::BITMAP_FILTER) { - bitmap_filter_insert_batch(column, rows); - } else if (IRuntimeFilter::enable_use_batch(_be_exec_version > 0, _column_return_type)) { - insert_fixed_len(column->get_raw_data().data, rows.data(), rows.size()); + bitmap_filter_insert_batch(column, start); } else { - for (int index : rows) { - insert(column->get_data_at(index)); - } + insert_fixed_len(column, start); } } - void bitmap_filter_insert_batch(const vectorized::ColumnPtr column, - const std::vector& rows) { + void bitmap_filter_insert_batch(const vectorized::ColumnPtr column, size_t start) { std::vector bitmaps; - auto* col = assert_cast*>(column.get()); - for (int index : rows) { - bitmaps.push_back(&(col->get_data()[index])); + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + const auto& col = + assert_cast(nullable->get_nested_column()); + const auto& nullmap = + assert_cast(nullable->get_null_map_column()) + .get_data(); + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { + bitmaps.push_back(&(col.get_data()[i])); + } + } + } else { + const auto* col = assert_cast(column.get()); + for (size_t i = start; i < column->size(); i++) { + bitmaps.push_back(&(col->get_data()[i])); + } } _context.bitmap_filter_func->insert_many(bitmaps); } @@ -1039,13 +1016,6 @@ private: bool _is_ignored_in_filter = false; std::string* _ignored_in_filter_msg = nullptr; uint32_t _filter_id; - - // When _column_return_type is invalid, _use_batch will be always false. 
- bool _use_batch; - - // When _use_new_hash is set to true, use the new hash method. - // This is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - const bool _use_new_hash; }; Status IRuntimeFilter::create(RuntimeState* state, ObjectPool* pool, const TRuntimeFilterDesc* desc, @@ -1092,10 +1062,9 @@ void IRuntimeFilter::insert(const StringRef& value) { _wrapper->insert(value); } -void IRuntimeFilter::insert_batch(const vectorized::ColumnPtr column, - const std::vector& rows) { +void IRuntimeFilter::insert_batch(const vectorized::ColumnPtr column, size_t start) { DCHECK(is_producer()); - _wrapper->insert_batch(column, rows); + _wrapper->insert_batch(column, start); } Status IRuntimeFilter::publish() { diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index 797b217662..4b8c982fee 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -248,7 +248,7 @@ public: // only used for producer void insert(const void* data); void insert(const StringRef& data); - void insert_batch(vectorized::ColumnPtr column, const std::vector& rows); + void insert_batch(vectorized::ColumnPtr column, size_t start); // publish filter // push filter to remote node or push down it to scan_node @@ -336,10 +336,6 @@ public: void update_runtime_filter_type_to_profile(); - static bool enable_use_batch(bool use_batch, PrimitiveType type) { - return use_batch && (is_int_or_bool(type) || is_float_or_double(type)); - } - int filter_id() const { return _filter_id; } static std::string to_string(RuntimeFilterType type) { diff --git a/be/src/exprs/runtime_filter_slots.h b/be/src/exprs/runtime_filter_slots.h index 6c96b16055..62cf0eab7d 100644 --- a/be/src/exprs/runtime_filter_slots.h +++ b/be/src/exprs/runtime_filter_slots.h @@ -37,7 +37,7 @@ public: const std::vector& runtime_filter_descs) : _build_expr_context(build_expr_ctxs), _runtime_filter_descs(runtime_filter_descs) {} - Status init(RuntimeState* state, 
int64_t hash_table_size, size_t build_bf_cardinality) { + Status init(RuntimeState* state, int64_t hash_table_size) { // runtime filter effect strategy // 1. we will ignore IN filter when hash_table_size is too big // 2. we will ignore BLOOM filter and MinMax filter when hash_table_size @@ -111,7 +111,7 @@ public: } if (runtime_filter->is_bloomfilter()) { - RETURN_IF_ERROR(runtime_filter->init_bloom_filter(build_bf_cardinality)); + RETURN_IF_ERROR(runtime_filter->init_bloom_filter(hash_table_size)); } // Note: @@ -162,7 +162,7 @@ public: return Status::OK(); } - void insert(std::unordered_map>& datas) { + void insert(const std::unordered_set& datas) { for (int i = 0; i < _build_expr_context.size(); ++i) { auto iter = _runtime_filters.find(i); if (iter == _runtime_filters.end()) { @@ -170,29 +170,10 @@ public: } int result_column_id = _build_expr_context[i]->get_last_result_column_id(); - for (auto it : datas) { - auto& column = it.first->get_by_position(result_column_id).column; - - if (auto* nullable = - vectorized::check_and_get_column(*column)) { - auto& column_nested = nullable->get_nested_column_ptr(); - auto& column_nullmap = nullable->get_null_map_column_ptr(); - std::vector indexs; - for (int row_num : it.second) { - if (assert_cast(column_nullmap.get()) - ->get_bool(row_num)) { - continue; - } - indexs.push_back(row_num); - } - for (auto filter : iter->second) { - filter->insert_batch(column_nested, indexs); - } - - } else { - for (auto filter : iter->second) { - filter->insert_batch(column, it.second); - } + for (const auto* it : datas) { + auto column = it->get_by_position(result_column_id).column; + for (auto* filter : iter->second) { + filter->insert_batch(column, 1); } } } diff --git a/be/src/exprs/runtime_filter_slots_cross.h b/be/src/exprs/runtime_filter_slots_cross.h index 4868b27a4e..76b6085bab 100644 --- a/be/src/exprs/runtime_filter_slots_cross.h +++ b/be/src/exprs/runtime_filter_slots_cross.h @@ -61,7 +61,7 @@ public: Status 
insert(vectorized::Block* block) { for (int i = 0; i < _runtime_filters.size(); ++i) { auto* filter = _runtime_filters[i]; - auto& vexpr_ctx = filter_src_expr_ctxs[i]; + const auto& vexpr_ctx = filter_src_expr_ctxs[i]; int result_column_id = -1; RETURN_IF_ERROR(vexpr_ctx->execute(block, &result_column_id)); @@ -70,25 +70,7 @@ public: block->get_by_position(result_column_id) .column->convert_to_full_column_if_const(); - auto& column = block->get_by_position(result_column_id).column; - if (auto* nullable = - vectorized::check_and_get_column(*column)) { - auto& column_nested = nullable->get_nested_column_ptr(); - auto& column_nullmap = nullable->get_null_map_column_ptr(); - std::vector indexs; - for (int row_index = 0; row_index < column->size(); ++row_index) { - if (assert_cast(column_nullmap.get()) - ->get_bool(row_index)) { - continue; - } - indexs.push_back(row_index); - } - filter->insert_batch(column_nested, indexs); - } else { - std::vector rows(column->size()); - std::iota(rows.begin(), rows.end(), 0); - filter->insert_batch(column, rows); - } + filter->insert_batch(block->get_by_position(result_column_id).column, 0); } return Status::OK(); } @@ -100,7 +82,7 @@ public: return Status::OK(); } - bool empty() { return !_runtime_filters.size(); } + bool empty() { return _runtime_filters.empty(); } private: const std::vector& _runtime_filter_descs; diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 7280a1e836..156f054a3f 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -66,52 +66,17 @@ private: uint16_t new_size = 0; if (column.is_column_dictionary()) { const auto* dict_col = reinterpret_cast(&column); - if (_be_exec_version >= 2) { - for (uint16_t i = 0; i < size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - if constexpr (is_nullable) { - new_size += !null_map[idx] && _specific_filter->find_uint32_t( - dict_col->get_crc32_hash_value(idx)); - } else { - new_size 
+= _specific_filter->find_uint32_t( - dict_col->get_crc32_hash_value(idx)); - } - } - } else { - for (uint16_t i = 0; i < size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - if constexpr (is_nullable) { - new_size += !null_map[idx] && - _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); - } else { - new_size += _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); - } - } - } - } else if (is_string_type(T) && _be_exec_version >= 2) { - auto& pred_col = - reinterpret_cast< - const vectorized::PredicateColumnType>*>( - &column) - ->get_data(); - - auto pred_col_data = pred_col.data(); - const bool is_dense_column = pred_col.size() == size; for (uint16_t i = 0; i < size; i++) { - uint16_t idx = is_dense_column ? i : sel[i]; + uint16_t idx = sel[i]; + sel[new_size] = idx; if constexpr (is_nullable) { - if (!null_map[idx] && _specific_filter->find_crc32_hash(&pred_col_data[idx])) { - sel[new_size++] = idx; - } + new_size += !null_map[idx] && + _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); } else { - if (_specific_filter->find_crc32_hash(&pred_col_data[idx])) { - sel[new_size++] = idx; - } + new_size += _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); } } - } else if (IRuntimeFilter::enable_use_batch(_be_exec_version > 0, T)) { + } else { const auto& data = reinterpret_cast< const vectorized::PredicateColumnType>*>( @@ -119,20 +84,6 @@ private: ->get_data(); new_size = _specific_filter->find_fixed_len_olap_engine((char*)data.data(), null_map, sel, size, data.size() != size); - } else { - auto& pred_col = - reinterpret_cast< - const vectorized::PredicateColumnType>*>( - &column) - ->get_data(); - - auto pred_col_data = pred_col.data(); -#define EVALUATE_WITH_NULL_IMPL(IDX) \ - !null_map[IDX] && _specific_filter->find_olap_engine(&pred_col_data[IDX]) -#define EVALUATE_WITHOUT_NULL_IMPL(IDX) _specific_filter->find_olap_engine(&pred_col_data[IDX]) - EVALUATE_BY_SELECTOR(EVALUATE_WITH_NULL_IMPL, 
EVALUATE_WITHOUT_NULL_IMPL) -#undef EVALUATE_WITH_NULL_IMPL -#undef EVALUATE_WITHOUT_NULL_IMPL } return new_size; } @@ -164,8 +115,8 @@ uint16_t BloomFilterColumnPredicate::evaluate(const vectorized::IColumn& colu return size; } if (column.is_nullable()) { - auto* nullable_col = reinterpret_cast(&column); - auto& null_map_data = nullable_col->get_null_map_column().get_data(); + const auto* nullable_col = reinterpret_cast(&column); + const auto& null_map_data = nullable_col->get_null_map_column().get_data(); new_size = evaluate(nullable_col->get_nested_column(), null_map_data.data(), sel, size); } else { diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 41b030b4e1..1e31014512 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -39,10 +39,7 @@ Overload(Callables&&... callables) -> Overload; HashJoinBuildSinkLocalState::HashJoinBuildSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) - : JoinBuildSinkLocalState(parent, state), - _build_block_idx(0), - _build_side_mem_used(0), - _build_side_last_mem_used(0) {} + : JoinBuildSinkLocalState(parent, state) {} Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(JoinBuildSinkLocalState::init(state, info)); @@ -52,15 +49,7 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _parent->operator_id(), _parent->node_id(), state->get_query_ctx()); auto& p = _parent->cast(); _shared_state->join_op_variants = p._join_op_variants; - if (p._is_broadcast_join && state->enable_share_hash_table_for_broadcast_join()) { - _shared_state->build_blocks = p._shared_hash_table_context->blocks; - } else { - _shared_state->build_blocks.reset(new std::vector()); - // avoid vector expand change block address. - // one block can store 4g data, _build_blocks can store 128*4g data. 
- // if probe data bigger than 512g, runtime filter maybe will core dump when insert data. - _shared_state->build_blocks->reserve(vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT); - } + _shared_state->is_null_safe_eq_join = p._is_null_safe_eq_join; _shared_state->store_null_in_hash_table = p._store_null_in_hash_table; _build_expr_ctxs.resize(p._build_expr_ctxs.size()); @@ -84,11 +73,6 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _shared_hash_table_dependency->block(); p._shared_hashtable_controller->append_dependency(p.node_id(), _shared_hash_table_dependency); - } else if (p._is_broadcast_join) { - // avoid vector expand change block address. - // one block can store 4g data, _build_blocks can store 128*4g data. - // if probe data bigger than 512g, runtime filter maybe will core dump when insert data. - _shared_state->build_blocks->reserve(vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT); } _memory_usage_counter = ADD_LABEL_COUNTER(profile(), "MemoryUsage"); @@ -106,17 +90,10 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _build_side_merge_block_timer = ADD_TIMER(profile(), "BuildSideMergeBlockTime"); _build_table_insert_timer = ADD_TIMER(record_profile, "BuildTableInsertTime"); _build_expr_call_timer = ADD_TIMER(record_profile, "BuildExprCallTime"); - _build_table_expanse_timer = ADD_TIMER(record_profile, "BuildTableExpanseTime"); - _build_table_convert_timer = ADD_TIMER(record_profile, "BuildTableConvertToPartitionedTime"); _build_side_compute_hash_timer = ADD_TIMER(record_profile, "BuildSideHashComputingTime"); - _build_runtime_filter_timer = ADD_TIMER(record_profile, "BuildRuntimeFilterTime"); _allocate_resource_timer = ADD_TIMER(profile(), "AllocateResourceTime"); - _build_buckets_counter = ADD_COUNTER(profile(), "BuildBuckets", TUnit::UNIT); - _build_buckets_fill_counter = ADD_COUNTER(profile(), "FilledBuckets", TUnit::UNIT); - - _build_collisions_counter = ADD_COUNTER(profile(), 
"BuildCollisions", TUnit::UNIT); // Hash Table Init _hash_table_init(state); @@ -158,19 +135,18 @@ void HashJoinBuildSinkLocalState::init_short_circuit_for_probe() { _shared_state->short_circuit_for_probe = (_shared_state->_has_null_in_build_side && p._join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && !p._is_mark_join) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::INNER_JOIN && + (!_shared_state->build_block && p._join_op == TJoinOp::INNER_JOIN && !p._is_mark_join) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::LEFT_SEMI_JOIN && + (!_shared_state->build_block && p._join_op == TJoinOp::LEFT_SEMI_JOIN && !p._is_mark_join) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::RIGHT_OUTER_JOIN) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::RIGHT_SEMI_JOIN) || - (_shared_state->build_blocks->empty() && p._join_op == TJoinOp::RIGHT_ANTI_JOIN); + (!_shared_state->build_block && p._join_op == TJoinOp::RIGHT_OUTER_JOIN) || + (!_shared_state->build_block && p._join_op == TJoinOp::RIGHT_SEMI_JOIN) || + (!_shared_state->build_block && p._join_op == TJoinOp::RIGHT_ANTI_JOIN); //when build table rows is 0 and not have other_join_conjunct and not _is_mark_join and join type is one of LEFT_OUTER_JOIN/FULL_OUTER_JOIN/LEFT_ANTI_JOIN //we could get the result is probe table + null-column(if need output) _shared_state->empty_right_table_need_probe_dispose = - (_shared_state->build_blocks->empty() && !p._have_other_join_conjunct && - !p._is_mark_join) && + (!_shared_state->build_block && !p._have_other_join_conjunct && !p._is_mark_join) && (p._join_op == TJoinOp::LEFT_OUTER_JOIN || p._join_op == TJoinOp::FULL_OUTER_JOIN || p._join_op == TJoinOp::LEFT_ANTI_JOIN); } @@ -238,7 +214,7 @@ Status HashJoinBuildSinkLocalState::_extract_join_column( } Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, - vectorized::Block& block, uint8_t offset) { + vectorized::Block& block) { auto& p = 
_parent->cast(); SCOPED_TIMER(_build_table_timer); size_t rows = block.rows(); @@ -254,6 +230,14 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, RETURN_IF_ERROR(_do_evaluate(block, _build_expr_ctxs, *_build_expr_call_timer, res_col_ids)); if (p._join_op == TJoinOp::LEFT_OUTER_JOIN || p._join_op == TJoinOp::FULL_OUTER_JOIN) { _convert_block_to_null(block); + // first row is mocked + for (int i = 0; i < block.columns(); i++) { + auto [column, is_const] = unpack_if_const(block.safe_get_by_position(i).column); + assert_cast(column->assume_mutable().get()) + ->get_null_map_column() + .get_data() + .data()[0] = 1; + } } // TODO: Now we are not sure whether a column is nullable only by ExecNode's `row_desc` // so we have to initialize this flag by the first build block. @@ -270,29 +254,30 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, Status st = _extract_join_column(block, null_map_val, raw_ptrs, res_col_ids); st = std::visit( - Overload { - [&](std::monostate& arg, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - return Status::OK(); - }, - [&](auto&& arg, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { - using HashTableCtxType = std::decay_t; - vectorized::ProcessHashTableBuild - hash_table_build_process(rows, block, raw_ptrs, this, - state->batch_size(), offset, state); - return hash_table_build_process - .template run( - arg, - has_null_value || short_circuit_for_null_in_build_side - ? 
&null_map_val->get_data() - : nullptr, - &_shared_state->_has_null_in_build_side); - }}, - *_shared_state->hash_table_variants, + Overload {[&](std::monostate& arg, auto join_op, auto has_null_value, + auto short_circuit_for_null_in_build_side) -> Status { + LOG(FATAL) << "FATAL: uninited hash table"; + __builtin_unreachable(); + return Status::OK(); + }, + [&](auto&& arg, auto&& join_op, auto has_null_value, + auto short_circuit_for_null_in_build_side) -> Status { + using HashTableCtxType = std::decay_t; + using JoinOpType = std::decay_t; + vectorized::ProcessHashTableBuild + hash_table_build_process(rows, block, raw_ptrs, this, + state->batch_size(), state); + return hash_table_build_process + .template run( + arg, + has_null_value || short_circuit_for_null_in_build_side + ? &null_map_val->get_data() + : nullptr, + &_shared_state->_has_null_in_build_side); + }}, + *_shared_state->hash_table_variants, _shared_state->join_op_variants, vectorized::make_bool_variant(_build_side_ignore_null), vectorized::make_bool_variant(p._short_circuit_for_null_in_build_side)); @@ -384,7 +369,7 @@ void HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { } return; } - if (!try_get_hash_map_context_fixed( + if (!try_get_hash_map_context_fixed( *_shared_state->hash_table_variants, _build_expr_ctxs)) { _shared_state->hash_table_variants ->emplace>(); @@ -394,16 +379,6 @@ void HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { vectorized::make_bool_variant(p._have_other_join_conjunct)); DCHECK(!std::holds_alternative(*_shared_state->hash_table_variants)); - - std::visit(vectorized::Overload {[&](std::monostate& arg) { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - }, - [&](auto&& arg) { - arg.hash_table->set_partitioned_threshold( - state->partitioned_hash_join_rows_threshold()); - }}, - *_shared_state->hash_table_variants); } HashJoinBuildSinkOperatorX::HashJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, @@ 
-466,68 +441,41 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); - // make one block for each 4 gigabytes - constexpr static auto BUILD_BLOCK_MAX_SIZE = 4 * 1024UL * 1024UL * 1024UL; - - if (local_state._shared_state->_has_null_in_build_side) { - // TODO: if _has_null_in_build_side is true we should finish current pipeline task. - DCHECK(state->enable_pipeline_exec()); - return Status::OK(); - } if (local_state._should_build_hash_table) { // If eos or have already met a null value using short-circuit strategy, we do not need to pull // data from probe side. local_state._build_side_mem_used += in_block->allocated_bytes(); + if (local_state._build_side_mutable_block.empty()) { + auto tmp_build_block = vectorized::VectorizedUtils::create_empty_columnswithtypename( + _child_x->row_desc()); + local_state._build_side_mutable_block = + vectorized::MutableBlock::build_mutable_block(&tmp_build_block); + RETURN_IF_ERROR(local_state._build_side_mutable_block.merge( + *(tmp_build_block.create_same_struct_block(1, false)))); + } + if (in_block->rows() != 0) { SCOPED_TIMER(local_state._build_side_merge_block_timer); RETURN_IF_ERROR(local_state._build_side_mutable_block.merge(*in_block)); - } - - if (UNLIKELY(local_state._build_side_mem_used - local_state._build_side_last_mem_used > - BUILD_BLOCK_MAX_SIZE)) { - if (local_state._shared_state->build_blocks->size() == - vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT) { - return Status::NotSupported(strings::Substitute( - "data size of right table in hash join > $0", - BUILD_BLOCK_MAX_SIZE * vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT)); + if (local_state._build_side_mutable_block.rows() > + std::numeric_limits::max()) { + return Status::NotSupported( + "Hash join do not support build table rows" + " over:" + + std::to_string(std::numeric_limits::max())); } - 
local_state._shared_state->build_blocks->emplace_back( - local_state._build_side_mutable_block.to_block()); - - COUNTER_UPDATE(local_state._build_blocks_memory_usage, - (*local_state._shared_state->build_blocks)[local_state._build_block_idx] - .bytes()); - - // TODO:: Rethink may we should do the process after we receive all build blocks ? - // which is better. - RETURN_IF_ERROR(local_state.process_build_block( - state, (*local_state._shared_state->build_blocks)[local_state._build_block_idx], - local_state._build_block_idx)); - - local_state._build_side_mutable_block = vectorized::MutableBlock(); - ++local_state._build_block_idx; - local_state._build_side_last_mem_used = local_state._build_side_mem_used; } } if (local_state._should_build_hash_table && source_state == SourceState::FINISHED) { - if (!local_state._build_side_mutable_block.empty()) { - if (local_state._shared_state->build_blocks->size() == - vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT) { - return Status::NotSupported(strings::Substitute( - "data size of right table in hash join > $0", - BUILD_BLOCK_MAX_SIZE * vectorized::HASH_JOIN_MAX_BUILD_BLOCK_COUNT)); - } - local_state._shared_state->build_blocks->emplace_back( - local_state._build_side_mutable_block.to_block()); - COUNTER_UPDATE(local_state._build_blocks_memory_usage, - (*local_state._shared_state->build_blocks)[local_state._build_block_idx] - .bytes()); - RETURN_IF_ERROR(local_state.process_build_block( - state, (*local_state._shared_state->build_blocks)[local_state._build_block_idx], - local_state._build_block_idx)); - } + DCHECK(!local_state._build_side_mutable_block.empty()); + local_state._shared_state->build_block = std::make_shared( + local_state._build_side_mutable_block.to_block()); + COUNTER_UPDATE(local_state._build_blocks_memory_usage, + (*local_state._shared_state->build_block).bytes()); + RETURN_IF_ERROR( + local_state.process_build_block(state, (*local_state._shared_state->build_block))); auto ret = std::visit( Overload 
{[&](std::monostate&) -> Status { LOG(FATAL) << "FATAL: uninited hash table"; @@ -557,6 +505,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* local_state._runtime_filter_slots->copy_to_shared_context( _shared_hash_table_context); } + _shared_hash_table_context->block = local_state._shared_state->build_block; _shared_hashtable_controller->signal(node_id()); } } else if (!local_state._should_build_hash_table) { @@ -585,6 +534,8 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* *std::static_pointer_cast( _shared_hash_table_context->hash_table_variants)); + local_state._shared_state->build_block = _shared_hash_table_context->block; + if (!_shared_hash_table_context->runtime_filters.empty()) { auto ret = std::visit( Overload { @@ -601,7 +552,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* _build_expr_ctxs, _runtime_filter_descs); RETURN_IF_ERROR(local_state._runtime_filter_slots->init( - state, arg.hash_table->size(), 0)); + state, arg.hash_table->size())); RETURN_IF_ERROR( local_state._runtime_filter_slots->copy_from_shared_context( _shared_hash_table_context)); @@ -617,7 +568,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* if (source_state == SourceState::FINISHED) { // Since the comparison of null values is meaningless, null aware left anti join should not output null // when the build side is not empty. 
- if (!local_state._shared_state->build_blocks->empty() && + if (local_state._shared_state->build_block && _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { local_state._shared_state->probe_ignore_null = true; } diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index 34c3147b7f..b45c2eed75 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -68,11 +68,11 @@ public: ENABLE_FACTORY_CREATOR(HashJoinBuildSinkLocalState); using Parent = HashJoinBuildSinkOperatorX; HashJoinBuildSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state); - ~HashJoinBuildSinkLocalState() = default; + ~HashJoinBuildSinkLocalState() override = default; Status init(RuntimeState* state, LocalSinkStateInfo& info) override; Status open(RuntimeState* state) override; - Status process_build_block(RuntimeState* state, vectorized::Block& block, uint8_t offset); + Status process_build_block(RuntimeState* state, vectorized::Block& block); void init_short_circuit_for_probe(); @@ -108,30 +108,20 @@ protected: std::vector _runtime_filters; bool _should_build_hash_table = true; - uint8_t _build_block_idx = 0; int64_t _build_side_mem_used = 0; int64_t _build_side_last_mem_used = 0; vectorized::MutableBlock _build_side_mutable_block; std::shared_ptr _runtime_filter_slots; bool _has_set_need_null_map_for_build = false; bool _build_side_ignore_null = false; - size_t _build_rf_cardinality = 0; - std::unordered_map> _inserted_rows; + std::unordered_set _inserted_blocks; std::shared_ptr _shared_hash_table_dependency; RuntimeProfile::Counter* _build_table_timer = nullptr; RuntimeProfile::Counter* _build_expr_call_timer = nullptr; RuntimeProfile::Counter* _build_table_insert_timer = nullptr; - RuntimeProfile::Counter* _build_table_expanse_timer = nullptr; - RuntimeProfile::Counter* _build_table_convert_timer = nullptr; - RuntimeProfile::Counter* _build_buckets_counter = nullptr; - 
RuntimeProfile::Counter* _build_buckets_fill_counter = nullptr; - RuntimeProfile::Counter* _build_side_compute_hash_timer = nullptr; RuntimeProfile::Counter* _build_side_merge_block_timer = nullptr; - RuntimeProfile::Counter* _build_runtime_filter_timer = nullptr; - - RuntimeProfile::Counter* _build_collisions_counter = nullptr; RuntimeProfile::Counter* _allocate_resource_timer = nullptr; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index 52ac0ea842..412c358037 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -89,7 +89,9 @@ Status HashJoinProbeLocalState::open(RuntimeState* state) { void HashJoinProbeLocalState::prepare_for_next() { _probe_index = 0; + _build_index = 0; _ready_probe = false; + _last_probe_match = -1; _prepare_probe_block(); } diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h b/be/src/pipeline/exec/hashjoin_probe_operator.h index 59a1057b3a..4de50474bf 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.h +++ b/be/src/pipeline/exec/hashjoin_probe_operator.h @@ -97,8 +97,8 @@ public: vectorized::DataTypes right_table_data_types(); vectorized::DataTypes left_table_data_types(); bool* has_null_in_build_side() { return &_shared_state->_has_null_in_build_side; } - std::shared_ptr> build_blocks() const { - return _shared_state->build_blocks; + const std::shared_ptr& build_block() const { + return _shared_state->build_block; } private: @@ -114,9 +114,11 @@ private: friend struct vectorized::ProcessHashTableProbe; int _probe_index = -1; + uint32_t _build_index = 0; bool _ready_probe = false; bool _probe_eos = false; std::atomic _probe_inited = false; + int _last_probe_match; vectorized::Block _probe_block; vectorized::ColumnRawPtrs _probe_columns; @@ -130,8 +132,6 @@ private: bool _need_null_map_for_probe = false; bool _has_set_need_null_map_for_probe = false; vectorized::ColumnUInt8::MutablePtr 
_null_map_column; - // for cases when a probe row matches more than batch size build rows. - bool _is_any_probe_match_row_output = false; std::unique_ptr _process_hashtable_ctx_variants = std::make_unique(); diff --git a/be/src/pipeline/exec/join_build_sink_operator.cpp b/be/src/pipeline/exec/join_build_sink_operator.cpp index fe790d9032..e2cc361c22 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.cpp +++ b/be/src/pipeline/exec/join_build_sink_operator.cpp @@ -35,10 +35,10 @@ Status JoinBuildSinkLocalState::init(RuntimeState* stat _build_rows_counter = ADD_COUNTER(PipelineXSinkLocalState::profile(), "BuildRows", TUnit::UNIT); - _push_down_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), - "PublishRuntimeFilterTime"); - _push_compute_timer = - ADD_TIMER(PipelineXSinkLocalState::profile(), "PushDownComputeTime"); + _publish_runtime_filter_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), + "PublishRuntimeFilterTime"); + _runtime_filter_compute_timer = ADD_TIMER(PipelineXSinkLocalState::profile(), + "RuntimeFilterComputeTime"); return Status::OK(); } diff --git a/be/src/pipeline/exec/join_build_sink_operator.h b/be/src/pipeline/exec/join_build_sink_operator.h index d339c2a977..8eeb02e2af 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.h +++ b/be/src/pipeline/exec/join_build_sink_operator.h @@ -41,8 +41,8 @@ protected: friend class JoinBuildSinkOperatorX; RuntimeProfile::Counter* _build_rows_counter = nullptr; - RuntimeProfile::Counter* _push_down_timer = nullptr; - RuntimeProfile::Counter* _push_compute_timer = nullptr; + RuntimeProfile::Counter* _publish_runtime_filter_timer = nullptr; + RuntimeProfile::Counter* _runtime_filter_compute_timer = nullptr; }; template diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.h b/be/src/pipeline/exec/nested_loop_join_build_operator.h index 0097b75c0a..9d7b8821c9 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.h +++ 
b/be/src/pipeline/exec/nested_loop_join_build_operator.h @@ -67,9 +67,13 @@ public: const std::vector& runtime_filter_descs(); vectorized::VExprContextSPtrs& filter_src_expr_ctxs() { return _filter_src_expr_ctxs; } - RuntimeProfile::Counter* push_compute_timer() { return _push_compute_timer; } + RuntimeProfile::Counter* runtime_filter_compute_timer() { + return _runtime_filter_compute_timer; + } vectorized::Blocks& build_blocks() { return _shared_state->build_blocks; } - RuntimeProfile::Counter* push_down_timer() { return _push_down_timer; } + RuntimeProfile::Counter* publish_runtime_filter_timer() { + return _publish_runtime_filter_timer; + } private: friend class NestedLoopJoinBuildSinkOperatorX; diff --git a/be/src/pipeline/exec/set_sink_operator.cpp b/be/src/pipeline/exec/set_sink_operator.cpp index 31e8185720..cf8a994b1b 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -60,8 +60,7 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); auto& mem_used = local_state._shared_state->mem_used; - auto& build_blocks = local_state._shared_state->build_blocks; - auto& build_block_index = local_state._shared_state->build_block_index; + auto& build_block = local_state._shared_state->build_block; auto& valid_element_in_hash_tbl = local_state._shared_state->valid_element_in_hash_tbl; if (in_block->rows() != 0) { @@ -71,11 +70,9 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo if (source_state == SourceState::FINISHED || local_state._mutable_block.allocated_bytes() >= BUILD_BLOCK_MAX_SIZE) { - build_blocks.emplace_back(local_state._mutable_block.to_block()); - RETURN_IF_ERROR(_process_build_block(local_state, build_blocks[build_block_index], - build_block_index, state)); + build_block = local_state._mutable_block.to_block(); + RETURN_IF_ERROR(_process_build_block(local_state, build_block, state)); 
local_state._mutable_block.clear(); - ++build_block_index; if (source_state == SourceState::FINISHED) { if constexpr (is_intersect) { @@ -102,7 +99,7 @@ Status SetSinkOperatorX::sink(RuntimeState* state, vectorized::Blo template Status SetSinkOperatorX::_process_build_block( - SetSinkLocalState& local_state, vectorized::Block& block, uint8_t offset, + SetSinkLocalState& local_state, vectorized::Block& block, RuntimeState* state) { size_t rows = block.rows(); if (rows == 0) { @@ -118,7 +115,7 @@ Status SetSinkOperatorX::_process_build_block( using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { vectorized::HashTableBuild - hash_table_build_process(&local_state, rows, raw_ptrs, offset, state); + hash_table_build_process(&local_state, rows, raw_ptrs, state); static_cast(hash_table_build_process(arg, local_state._arena)); } else { LOG(FATAL) << "FATAL: uninited hash table"; diff --git a/be/src/pipeline/exec/set_sink_operator.h b/be/src/pipeline/exec/set_sink_operator.h index 7c4cc7d2cb..d8abe12c9a 100644 --- a/be/src/pipeline/exec/set_sink_operator.h +++ b/be/src/pipeline/exec/set_sink_operator.h @@ -132,7 +132,7 @@ private: friend struct HashTableBuild; Status _process_build_block(SetSinkLocalState& local_state, - vectorized::Block& block, uint8_t offset, RuntimeState* state); + vectorized::Block& block, RuntimeState* state); Status _extract_build_column(SetSinkLocalState& local_state, vectorized::Block& block, vectorized::ColumnRawPtrs& raw_ptrs); diff --git a/be/src/pipeline/exec/set_source_operator.cpp b/be/src/pipeline/exec/set_source_operator.cpp index 8baadf7e53..e8a73c00ad 100644 --- a/be/src/pipeline/exec/set_source_operator.cpp +++ b/be/src/pipeline/exec/set_source_operator.cpp @@ -180,12 +180,12 @@ void SetSourceOperatorX::_add_result_columns( SetSourceLocalState& local_state, vectorized::RowRefListWithFlags& value, int& block_size) { auto& build_col_idx = local_state._shared_state->build_col_idx; - auto& build_blocks = 
local_state._shared_state->build_blocks; + auto& build_block = local_state._shared_state->build_block; auto it = value.begin(); for (auto idx = build_col_idx.begin(); idx != build_col_idx.end(); ++idx) { - auto& column = *build_blocks[it->block_offset].get_by_position(idx->first).column; - if (local_state._mutable_cols[idx->second]->is_nullable() xor column.is_nullable()) { + auto& column = *build_block.get_by_position(idx->first).column; + if (local_state._mutable_cols[idx->second]->is_nullable() ^ column.is_nullable()) { DCHECK(local_state._mutable_cols[idx->second]->is_nullable()); ((vectorized::ColumnNullable*)(local_state._mutable_cols[idx->second].get())) ->insert_from_not_nullable(column, it->row_num); diff --git a/be/src/pipeline/pipeline_x/dependency.h b/be/src/pipeline/pipeline_x/dependency.h index a26b7de781..32aa1ff4cf 100644 --- a/be/src/pipeline/pipeline_x/dependency.h +++ b/be/src/pipeline/pipeline_x/dependency.h @@ -398,7 +398,7 @@ struct HashJoinSharedState : public JoinSharedState { std::make_shared(); const std::vector build_side_child_desc; size_t build_exprs_size = 0; - std::shared_ptr> build_blocks; + std::shared_ptr build_block; bool probe_ignore_null = false; }; @@ -434,8 +434,7 @@ public: /// default init //record memory during running int64_t mem_used = 0; - std::vector build_blocks; // build to source - int build_block_index = 0; // build to source + vectorized::Block build_block; // build to source //record element size in hashtable int64_t valid_element_in_hash_tbl = 0; //first:column_id, could point to origin column or cast column @@ -506,7 +505,7 @@ public: return; } - if (!try_get_hash_map_context_fixed( *hash_table_variants, child_exprs_lists[0])) { hash_table_variants->emplace< diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 58fe0cb87e..9b49cce1c2 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -241,10 +241,17 @@ public: /// Appends a batch elements from other column with 
the same type /// indices_begin + indices_end represent the row indices of column src /// Warning: - /// if *indices == -1 means the row is null, only use in outer join, do not use in any other place + /// if *indices == -1 means the row is null virtual void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) = 0; + /// Appends a batch elements from other column with the same type + /// indices_begin + indices_end represent the row indices of column src + /// Warning: + /// if *indices == 0 means the row is null, only use in outer join, do not use in any other place + virtual void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) = 0; + /// Appends data located in specified memory chunk if it is possible (throws an exception if it cannot be implemented). /// Is used to optimize some computations (in aggregation, for example). /// Parameter length could be ignored if column values have fixed size. 
diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index fa9e048636..98fb480dd1 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -808,6 +808,17 @@ void ColumnArray::insert_indices_from(const IColumn& src, const int* indices_beg } } +void ColumnArray::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + for (auto x = indices_begin; x != indices_end; ++x) { + if (*x == 0) { + ColumnArray::insert_default(); + } else { + ColumnArray::insert_from(src, *x); + } + } +} + ColumnPtr ColumnArray::replicate(const IColumn::Offsets& replicate_offsets) const { if (replicate_offsets.empty()) return clone_empty(); diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h index 172815d765..95fd463334 100644 --- a/be/src/vec/columns/column_array.h +++ b/be/src/vec/columns/column_array.h @@ -222,6 +222,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + void replace_column_data(const IColumn& rhs, size_t row, size_t self_row = 0) override { DCHECK(size() > self_row); const auto& r = assert_cast(rhs); diff --git a/be/src/vec/columns/column_complex.h b/be/src/vec/columns/column_complex.h index 6c752d082b..fb89740d85 100644 --- a/be/src/vec/columns/column_complex.h +++ b/be/src/vec/columns/column_complex.h @@ -199,6 +199,21 @@ public: } } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + const Self& src_vec = assert_cast(src); + auto new_size = indices_end - indices_begin; + + for (uint32_t i = 0; i < new_size; ++i) { + auto offset = *(indices_begin + i); + if (offset == 0) { + data.emplace_back(T {}); + } else { + data.emplace_back(src_vec.get_element(offset)); 
+ } + } + } + void pop_back(size_t n) override { data.erase(data.end() - n, data.end()); } // it's impossible to use ComplexType as key , so we don't have to implement them [[noreturn]] StringRef serialize_value_into_arena(size_t n, Arena& arena, diff --git a/be/src/vec/columns/column_const.h b/be/src/vec/columns/column_const.h index 307066a7ae..280d2de834 100644 --- a/be/src/vec/columns/column_const.h +++ b/be/src/vec/columns/column_const.h @@ -116,6 +116,11 @@ public: s += (indices_end - indices_begin); } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + s += (indices_end - indices_begin); + } + void insert(const Field&) override { ++s; } void insert_data(const char*, size_t) override { ++s; } diff --git a/be/src/vec/columns/column_decimal.h b/be/src/vec/columns/column_decimal.h index 30c4f1116f..b61753146f 100644 --- a/be/src/vec/columns/column_decimal.h +++ b/be/src/vec/columns/column_decimal.h @@ -131,6 +131,18 @@ public: } } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + auto origin_size = size(); + auto new_size = indices_end - indices_begin; + data.resize(origin_size + new_size); + const T* __restrict src_data = reinterpret_cast(src.get_raw_data().data); + + for (uint32_t i = 0; i < new_size; ++i) { + data[origin_size + i] = src_data[indices_begin[i]]; + } + } + void insert_many_fix_len_data(const char* data_ptr, size_t num) override; void insert_many_raw_data(const char* pos, size_t num) override { diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index 421c8fa2dd..d2374811e1 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -82,6 +82,11 @@ public: LOG(FATAL) << "insert_indices_from not supported in ColumnDictionary"; } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const 
uint32_t* indices_end) override { + LOG(FATAL) << "insert_indices_from_join not supported in ColumnDictionary"; + } + void pop_back(size_t n) override { LOG(FATAL) << "pop_back not supported in ColumnDictionary"; } void update_hash_with_value(size_t n, SipHash& hash) const override { @@ -277,9 +282,7 @@ public: } uint32_t get_hash_value(uint32_t idx) const { return _dict.get_hash_value(_codes[idx], _type); } - uint32_t get_crc32_hash_value(uint32_t idx) const { - return _dict.get_crc32_hash_value(_codes[idx], _type); - } + template void find_codes(const HybridSetType* values, std::vector& selected) const { return _dict.find_codes(values, selected); @@ -378,31 +381,6 @@ public: } inline uint32_t get_hash_value(T code, FieldType type) const { - if (_compute_hash_value_flags[code]) { - return _hash_values[code]; - } else { - auto& sv = (*_dict_data)[code]; - // The char data is stored in the disk with the schema length, - // and zeros are filled if the length is insufficient - - // When reading data, use shrink_char_type_column_suffix_zero(_char_type_idx) - // Remove the suffix 0 - // When writing data, use the CharField::consume function to fill in the trailing 0. - - // For dictionary data of char type, sv.size is the schema length, - // so use strnlen to remove the 0 at the end to get the actual length. 
- int32_t len = sv.size; - if (type == FieldType::OLAP_FIELD_TYPE_CHAR) { - len = strnlen(sv.data, sv.size); - } - uint32_t hash_val = HashUtil::murmur_hash3_32(sv.data, len, 0); - _hash_values[code] = hash_val; - _compute_hash_value_flags[code] = 1; - return _hash_values[code]; - } - } - - inline uint32_t get_crc32_hash_value(T code, FieldType type) const { if (_compute_hash_value_flags[code]) { return _hash_values[code]; } else { diff --git a/be/src/vec/columns/column_fixed_length_object.h b/be/src/vec/columns/column_fixed_length_object.h index 5b9733748d..e6b1db8bf5 100644 --- a/be/src/vec/columns/column_fixed_length_object.h +++ b/be/src/vec/columns/column_fixed_length_object.h @@ -103,6 +103,28 @@ public: } } + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + const Self& src_vec = assert_cast(src); + auto origin_size = size(); + auto new_size = indices_end - indices_begin; + if (_item_size == 0) { + _item_size = src_vec._item_size; + } + DCHECK(_item_size == src_vec._item_size) << "dst and src should have the same _item_size"; + resize(origin_size + new_size); + + for (uint32_t i = 0; i < new_size; ++i) { + auto offset = indices_begin[i]; + if (offset) { + memcpy(&_data[(origin_size + i) * _item_size], &src_vec._data[offset * _item_size], + _item_size); + } else { + memset(&_data[(origin_size + i) * _item_size], 0, _item_size); + } + } + } + void clear() override { _data.clear(); _item_count = 0; diff --git a/be/src/vec/columns/column_map.cpp b/be/src/vec/columns/column_map.cpp index f3fa5ab6c2..82e8c0a911 100644 --- a/be/src/vec/columns/column_map.cpp +++ b/be/src/vec/columns/column_map.cpp @@ -196,6 +196,17 @@ void ColumnMap::insert_indices_from(const IColumn& src, const int* indices_begin } } +void ColumnMap::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + for (auto x = indices_begin; x != indices_end; ++x) { + if (*x == 
0) { + ColumnMap::insert_default(); + } else { + ColumnMap::insert_from(src, *x); + } + } +} + StringRef ColumnMap::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const { size_t array_size = size_at(n); size_t offset = offset_at(n); diff --git a/be/src/vec/columns/column_map.h b/be/src/vec/columns/column_map.h index f6fd313208..1cb3dd0c73 100644 --- a/be/src/vec/columns/column_map.h +++ b/be/src/vec/columns/column_map.h @@ -130,6 +130,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + void append_data_by_selector(MutableColumnPtr& res, const IColumn::Selector& selector) const override { return append_data_by_selector_impl(res, selector); diff --git a/be/src/vec/columns/column_nothing.h b/be/src/vec/columns/column_nothing.h index 8a10eec8b6..8874bb6e7a 100644 --- a/be/src/vec/columns/column_nothing.h +++ b/be/src/vec/columns/column_nothing.h @@ -39,6 +39,11 @@ public: bool structure_equals(const IColumn& rhs) const override { return typeid(rhs) == typeid(ColumnNothing); } + + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override { + LOG(FATAL) << "insert_indices_from_join not supported in ColumnNothing"; + } }; } // namespace doris::vectorized diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp index 4f25a3f4b1..3553e9823d 100644 --- a/be/src/vec/columns/column_nullable.cpp +++ b/be/src/vec/columns/column_nullable.cpp @@ -304,6 +304,16 @@ void ColumnNullable::insert_indices_from(const IColumn& src, const int* indices_ _need_update_has_null = true; } +void ColumnNullable::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + const auto& src_concrete = assert_cast(src); + 
get_nested_column().insert_indices_from_join(src_concrete.get_nested_column(), indices_begin, + indices_end); + _get_null_map_column().insert_indices_from_join(src_concrete.get_null_map_column(), + indices_begin, indices_end); + _need_update_has_null = true; +} + void ColumnNullable::insert(const Field& x) { if (x.is_null()) { get_nested_column().insert_default(); diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 8a45c51d23..365400a669 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -123,6 +123,9 @@ public: void insert_range_from(const IColumn& src, size_t start, size_t length) override; void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + void insert(const Field& x) override; void insert_from(const IColumn& src, size_t n) override; diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index cf11b47947..fb793abe75 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -1464,4 +1464,15 @@ void ColumnObject::insert_indices_from(const IColumn& src, const int* indices_be }); } +void ColumnObject::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + // insert_indices_from with alignment + const ColumnObject& src_column = *check_and_get_column(src); + align_variant_by_name_and_type(*this, src_column, indices_end - indices_begin, + [indices_begin, indices_end](const IColumn& src, IColumn* dst) { + dst->insert_indices_from_join(src, indices_begin, + indices_end); + }); +} + } // namespace doris::vectorized diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index cade1342b6..c279042251 100644 --- a/be/src/vec/columns/column_object.h +++ 
b/be/src/vec/columns/column_object.h @@ -361,6 +361,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + // May throw execption void try_insert(const Field& field); diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index d6a3a51499..2d009e2a08 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -161,6 +161,43 @@ void ColumnString::insert_indices_from(const IColumn& src, const int* indices_be } } +void ColumnString::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + const ColumnString& src_str = assert_cast(src); + auto src_offset_data = src_str.offsets.data(); + + auto old_char_size = chars.size(); + size_t total_chars_size = old_char_size; + + auto dst_offsets_pos = offsets.size(); + offsets.resize(offsets.size() + indices_end - indices_begin); + auto* dst_offsets_data = offsets.data(); + + for (auto x = indices_begin; x != indices_end; ++x) { + if (*x != 0) { + total_chars_size += src_offset_data[*x] - src_offset_data[*x - 1]; + } + dst_offsets_data[dst_offsets_pos++] = total_chars_size; + } + check_chars_length(total_chars_size, offsets.size()); + + chars.resize(total_chars_size); + + auto* src_data_ptr = src_str.chars.data(); + auto* dst_data_ptr = chars.data(); + + size_t dst_chars_pos = old_char_size; + for (auto x = indices_begin; x != indices_end; ++x) { + if (*x != 0) { + const size_t size_to_append = src_offset_data[*x] - src_offset_data[*x - 1]; + const size_t offset = src_offset_data[*x - 1]; + memcpy_small_allow_read_write_overflow15(dst_data_ptr + dst_chars_pos, + src_data_ptr + offset, size_to_append); + dst_chars_pos += size_to_append; + } + } +} + void ColumnString::update_crcs_with_value(uint32_t* __restrict hashes, 
doris::PrimitiveType type, uint32_t rows, uint32_t offset, const uint8_t* __restrict null_data) const { diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index e310817020..191c6a95cf 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -487,6 +487,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const override; size_t filter(const Filter& filter) override; diff --git a/be/src/vec/columns/column_struct.cpp b/be/src/vec/columns/column_struct.cpp index 93c6213949..3502fdf581 100644 --- a/be/src/vec/columns/column_struct.cpp +++ b/be/src/vec/columns/column_struct.cpp @@ -233,6 +233,15 @@ void ColumnStruct::insert_indices_from(const IColumn& src, const int* indices_be } } +void ColumnStruct::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + const ColumnStruct& src_concrete = assert_cast(src); + for (size_t i = 0; i < columns.size(); ++i) { + columns[i]->insert_indices_from_join(src_concrete.get_column(i), indices_begin, + indices_end); + } +} + void ColumnStruct::insert_range_from(const IColumn& src, size_t start, size_t length) { const size_t tuple_size = columns.size(); for (size_t i = 0; i < tuple_size; ++i) { diff --git a/be/src/vec/columns/column_struct.h b/be/src/vec/columns/column_struct.h index 919b971b5a..499fb8444f 100644 --- a/be/src/vec/columns/column_struct.h +++ b/be/src/vec/columns/column_struct.h @@ -124,6 +124,9 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; + void 
get_permutation(bool reverse, size_t limit, int nan_direction_hint, Permutation& res) const override { LOG(FATAL) << "get_permutation not implemented"; diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp index 744e74b484..a825e07d5f 100644 --- a/be/src/vec/columns/column_vector.cpp +++ b/be/src/vec/columns/column_vector.cpp @@ -388,6 +388,20 @@ void ColumnVector::insert_indices_from(const IColumn& src, const int* indices } } +template +void ColumnVector::insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) { + auto origin_size = size(); + auto new_size = indices_end - indices_begin; + data.resize(origin_size + new_size); + + const T* __restrict src_data = reinterpret_cast(src.get_raw_data().data); + + for (uint32_t i = 0; i < new_size; ++i) { + data[origin_size + i] = src_data[indices_begin[i]]; + } +} + template ColumnPtr ColumnVector::filter(const IColumn::Filter& filt, ssize_t result_size_hint) const { size_t size = data.size(); diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h index 77df238d2a..cb1edddb52 100644 --- a/be/src/vec/columns/column_vector.h +++ b/be/src/vec/columns/column_vector.h @@ -389,6 +389,8 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from_join(const IColumn& src, const uint32_t* indices_begin, + const uint32_t* indices_end) override; void fill(const value_type& element, size_t num) { auto old_size = data.size(); auto new_size = old_size + num; diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h index 1448c3d4bd..79f445b08d 100644 --- a/be/src/vec/columns/predicate_column.h +++ b/be/src/vec/columns/predicate_column.h @@ -131,6 +131,11 @@ public: LOG(FATAL) << "insert_indices_from not supported in PredicateColumnType"; } + void insert_indices_from_join(const IColumn& src, const uint32_t* 
indices_begin, + const uint32_t* indices_end) override { + LOG(FATAL) << "insert_indices_from_join not supported in PredicateColumnType"; + } + void pop_back(size_t n) override { LOG(FATAL) << "pop_back not supported in PredicateColumnType"; } diff --git a/be/src/vec/common/hash_table/hash_map.h b/be/src/vec/common/hash_table/hash_map.h index 5b7cd6f464..6efbdbb3e9 100644 --- a/be/src/vec/common/hash_table/hash_map.h +++ b/be/src/vec/common/hash_table/hash_map.h @@ -20,9 +20,14 @@ #pragma once +#include + +#include "common/compiler_util.h" +#include "vec/columns/column_filter_helper.h" #include "vec/common/hash_table/hash.h" #include "vec/common/hash_table/hash_table.h" #include "vec/common/hash_table/hash_table_allocator.h" + /** NOTE HashMap could only be used for memmoveable (position independent) types. * Example: std::string is not position independent in libstdc++ with C++11 ABI or in libc++. * Also, key in hash table must be of type, that zero bytes is compared equals to zero key. 
@@ -193,10 +198,349 @@ public: bool has_null_key_data() const { return false; } }; +template , + typename Grower = HashTableGrower<>, typename Allocator = HashTableAllocator> +class JoinHashMapTable : public HashMapTable { +public: + using Self = JoinHashMapTable; + using Base = HashMapTable; + + using key_type = Key; + using value_type = typename Cell::value_type; + using mapped_type = typename Cell::Mapped; + + using LookupResult = typename Base::LookupResult; + + using HashMapTable::HashMapTable; + + static uint32_t calc_bucket_size(size_t num_elem) { + size_t expect_bucket_size = num_elem + (num_elem - 1) / 7; + return phmap::priv::NormalizeCapacity(expect_bucket_size) + 1; + } + + size_t get_byte_size() const { + auto cal_vector_mem = [](const auto& vec) { return vec.capacity() * sizeof(vec[0]); }; + return cal_vector_mem(visited) + cal_vector_mem(first) + cal_vector_mem(next); + } + + template + void prepare_build(size_t num_elem, int batch_size, bool has_null_key) { + _has_null_key = has_null_key; + max_batch_size = batch_size; + bucket_size = calc_bucket_size(num_elem + 1); + first.resize(bucket_size + 1); + next.resize(num_elem); + + if constexpr (JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_SEMI_JOIN) { + visited.resize(num_elem); + } + } + + uint32_t get_bucket_size() const { return bucket_size; } + + size_t size() const { return Base::size() == 0 ? 
next.size() : Base::size(); } + + std::vector& get_visited() { return visited; } + + void build(const Key* __restrict keys, const uint32_t* __restrict bucket_nums, + size_t num_elem) { + build_keys = keys; + for (size_t i = 1; i < num_elem; i++) { + uint32_t bucket_num = bucket_nums[i]; + next[i] = first[bucket_num]; + first[bucket_num] = i; + } + first[bucket_size] = 0; // index = bucket_num means null + } + + template + auto find_batch(const Key* __restrict keys, const uint32_t* __restrict build_idx_map, + int probe_idx, uint32_t build_idx, int probe_rows, + uint32_t* __restrict probe_idxs, bool& probe_visited, + uint32_t* __restrict build_idxs, + doris::vectorized::ColumnFilterHelper* mark_column) { + if constexpr (is_mark_join) { + return _find_batch_mark( + keys, build_idx_map, probe_idx, probe_rows, probe_idxs, build_idxs, + mark_column); + } + + if constexpr (with_other_conjuncts) { + return _find_batch_conjunct(keys, build_idx_map, probe_idx, build_idx, + probe_rows, probe_idxs, build_idxs); + } + + if constexpr (JoinOpType == doris::TJoinOp::INNER_JOIN || + JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN || + JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_OUTER_JOIN) { + return _find_batch_inner_outer_join(keys, build_idx_map, probe_idx, + build_idx, probe_rows, probe_idxs, + probe_visited, build_idxs); + } + if constexpr (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN || + JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + return _find_batch_left_semi_anti( + keys, build_idx_map, probe_idx, probe_rows, probe_idxs); + } + if constexpr (JoinOpType == doris::TJoinOp::RIGHT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_SEMI_JOIN) { + return _find_batch_right_semi_anti(keys, build_idx_map, probe_idx, probe_rows); + } + return std::tuple {0, 0U, 0}; + } + + template + bool iterate_map(std::vector& build_idxs) const { + const auto batch_size = max_batch_size; + 
const auto elem_num = visited.size(); + int count = 0; + build_idxs.resize(batch_size); + + while (count < batch_size && iter_idx < elem_num) { + const auto matched = visited[iter_idx]; + build_idxs[count] = iter_idx; + if constexpr (JoinOpType != doris::TJoinOp::RIGHT_SEMI_JOIN) { + count += !matched; + } else { + count += matched; + } + iter_idx++; + } + + build_idxs.resize(count); + return iter_idx >= elem_num; + } + + bool has_null_key() { return _has_null_key; } + + void pre_build_idxs(std::vector& bucksets, const uint8_t* null_map) { + if (null_map) { + first[bucket_size] = bucket_size; // distinguish between not matched and null + } + + for (uint32_t i = 0; i < bucksets.size(); i++) { + bucksets[i] = first[bucksets[i]]; + } + } + +private: + // only LEFT_ANTI_JOIN/LEFT_SEMI_JOIN/NULL_AWARE_LEFT_ANTI_JOIN/CROSS_JOIN support mark join + template + auto _find_batch_mark(const Key* __restrict keys, const uint32_t* __restrict build_idx_map, + int probe_idx, int probe_rows, uint32_t* __restrict probe_idxs, + uint32_t* __restrict build_idxs, + doris::vectorized::ColumnFilterHelper* mark_column) { + auto matched_cnt = 0; + const auto batch_size = max_batch_size; + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + auto build_idx = build_idx_map[probe_idx] == bucket_size ? 0 : build_idx_map[probe_idx]; + + while (build_idx && keys[probe_idx] != build_keys[build_idx]) { + build_idx = next[build_idx]; + } + + if constexpr (!with_other_conjuncts) { + if (build_idx_map[probe_idx] == bucket_size) { + // mark result as null when probe row is null + mark_column->insert_null(); + } else { + bool matched = JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN ? 
build_idx != 0 + : build_idx == 0; + if (!matched && _has_null_key) { + mark_column->insert_null(); + } else { + mark_column->insert_value(matched); + } + } + } + + probe_idxs[matched_cnt] = probe_idx++; + build_idxs[matched_cnt] = build_idx; + matched_cnt++; + } + return std::tuple {probe_idx, 0U, matched_cnt}; + } + + auto _find_batch_right_semi_anti(const Key* __restrict keys, + const uint32_t* __restrict build_idx_map, int probe_idx, + int probe_rows) { + while (probe_idx < probe_rows) { + auto build_idx = build_idx_map[probe_idx]; + + while (build_idx) { + if (!visited[build_idx] && keys[probe_idx] == build_keys[build_idx]) { + visited[build_idx] = 1; + } + build_idx = next[build_idx]; + } + probe_idx++; + } + return std::tuple {probe_idx, 0U, 0}; + } + + template + auto _find_batch_left_semi_anti(const Key* __restrict keys, + const uint32_t* __restrict build_idx_map, int probe_idx, + int probe_rows, uint32_t* __restrict probe_idxs) { + auto matched_cnt = 0; + const auto batch_size = max_batch_size; + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + if constexpr (need_judge_null) { + if (build_idx_map[probe_idx] == bucket_size) { + probe_idx++; + continue; + } + } + + auto build_idx = build_idx_map[probe_idx]; + + while (build_idx && keys[probe_idx] != build_keys[build_idx]) { + build_idx = next[build_idx]; + } + bool matched = + JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN ? 
build_idx != 0 : build_idx == 0; + probe_idxs[matched_cnt] = probe_idx++; + matched_cnt += matched; + } + return std::tuple {probe_idx, 0U, matched_cnt}; + } + + template + auto _find_batch_conjunct(const Key* __restrict keys, const uint32_t* __restrict build_idx_map, + int probe_idx, uint32_t build_idx, int probe_rows, + uint32_t* __restrict probe_idxs, uint32_t* __restrict build_idxs) { + auto matched_cnt = 0; + const auto batch_size = max_batch_size; + + auto do_the_probe = [&]() { + while (build_idx && matched_cnt < batch_size) { + if constexpr (JoinOpType == doris::TJoinOp::RIGHT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::RIGHT_SEMI_JOIN) { + if (!visited[build_idx] && keys[probe_idx] == build_keys[build_idx]) { + probe_idxs[matched_cnt] = probe_idx; + build_idxs[matched_cnt] = build_idx; + matched_cnt++; + } + } else if (keys[probe_idx] == build_keys[build_idx]) { + build_idxs[matched_cnt] = build_idx; + probe_idxs[matched_cnt] = probe_idx; + matched_cnt++; + } + build_idx = next[build_idx]; + } + + if constexpr (JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN || + JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + // may over batch_size when emplace 0 into build_idxs + if (!build_idx) { + probe_idxs[matched_cnt] = probe_idx; + build_idxs[matched_cnt] = 0; + matched_cnt++; + } + } + + probe_idx++; + }; + + if (build_idx) { + do_the_probe(); + } + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + build_idx = build_idx_map[probe_idx]; + do_the_probe(); + } + + probe_idx -= (build_idx != 0); + return std::tuple {probe_idx, build_idx, matched_cnt}; + } + + template + auto _find_batch_inner_outer_join(const Key* __restrict keys, + const uint32_t* __restrict build_idx_map, int probe_idx, + uint32_t build_idx, int probe_rows, + uint32_t* __restrict probe_idxs, bool& probe_visited, + uint32_t* __restrict build_idxs) { + auto matched_cnt = 0; + 
const auto batch_size = max_batch_size; + + auto do_the_probe = [&]() { + while (build_idx && matched_cnt < batch_size) { + if (keys[probe_idx] == build_keys[build_idx]) { + probe_idxs[matched_cnt] = probe_idx; + build_idxs[matched_cnt] = build_idx; + matched_cnt++; + if constexpr (JoinOpType == doris::TJoinOp::RIGHT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN) { + if (!visited[build_idx]) { + visited[build_idx] = 1; + } + } + } + build_idx = next[build_idx]; + } + + if constexpr (JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN || + JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN) { + // `(!matched_cnt || probe_idxs[matched_cnt - 1] != probe_idx)` means not match one build side + probe_visited |= (matched_cnt && probe_idxs[matched_cnt - 1] == probe_idx); + if (!build_idx) { + if (!probe_visited) { + probe_idxs[matched_cnt] = probe_idx; + build_idxs[matched_cnt] = 0; + matched_cnt++; + } + probe_visited = false; + } + } + probe_idx++; + }; + + if (build_idx) { + do_the_probe(); + } + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + build_idx = build_idx_map[probe_idx]; + do_the_probe(); + } + + probe_idx -= (build_idx != 0); + return std::tuple {probe_idx, build_idx, matched_cnt}; + } + + const Key* __restrict build_keys; + std::vector visited; + + uint32_t bucket_size = 1; + int max_batch_size = 4064; + + std::vector first = {0}; + std::vector next = {0}; + + // use in iter hash map + mutable uint32_t iter_idx = 1; + Cell cell; + doris::vectorized::Arena* pool; + bool _has_null_key = false; +}; + template , typename Grower = HashTableGrower<>, typename Allocator = HashTableAllocator> using HashMap = HashMapTable, Hash, Grower, Allocator>; +template > +using JoinFixedHashMap = JoinHashMapTable, Hash>; + template , typename Grower = HashTableGrower<>, typename Allocator = HashTableAllocator> using HashMapWithSavedHash = diff --git a/be/src/vec/common/hash_table/hash_map_context.h b/be/src/vec/common/hash_table/hash_map_context.h index 
32c0d0a31b..031af96e79 100644 --- a/be/src/vec/common/hash_table/hash_map_context.h +++ b/be/src/vec/common/hash_table/hash_map_context.h @@ -55,6 +55,9 @@ struct MethodBase { Arena arena; std::vector hash_values; + // use in join case + std::vector bucket_nums; + MethodBase() { hash_table.reset(new HashMap()); } virtual ~MethodBase() = default; @@ -69,8 +72,29 @@ struct MethodBase { iterator = hash_table->begin(); } } + virtual void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) = 0; + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = false, uint32_t bucket_size = 0) = 0; + + void init_join_bucket_num(uint32_t num_rows, uint32_t bucket_size, const uint8_t* null_map) { + bucket_nums.resize(num_rows); + + if (null_map == nullptr) { + init_join_bucket_num(num_rows, bucket_size); + return; + } + for (uint32_t k = 0; k < num_rows; ++k) { + bucket_nums[k] = + null_map[k] ? bucket_size : hash_table->hash(keys[k]) & (bucket_size - 1); + } + } + + void init_join_bucket_num(uint32_t num_rows, uint32_t bucket_size) { + for (uint32_t k = 0; k < num_rows; ++k) { + bucket_nums[k] = hash_table->hash(keys[k]) & (bucket_size - 1); + } + } void init_hash_values(size_t num_rows, const uint8_t* null_map) { if (null_map == nullptr) { @@ -148,7 +172,10 @@ struct MethodSerialized : public MethodBase { using Base::init_iterator; using State = ColumnsHashing::HashMethodSerialized; using Base::try_presis_key; - + // need keep until the hash probe end. 
+ std::vector build_stored_keys; + Arena build_arena; + // refresh each time probe std::vector stored_keys; StringRef serialize_keys_to_pool_contiguous(size_t i, size_t keys_size, @@ -163,40 +190,48 @@ struct MethodSerialized : public MethodBase { return {begin, sum_size}; } - void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) override { - Base::arena.clear(); - stored_keys.resize(num_rows); + void init_serialized_keys_impl(const ColumnRawPtrs& key_columns, size_t num_rows, + std::vector& keys, Arena& arena) { + arena.clear(); + keys.resize(num_rows); size_t max_one_row_byte_size = 0; for (const auto& column : key_columns) { max_one_row_byte_size += column->get_max_row_byte_size(); } size_t total_bytes = max_one_row_byte_size * num_rows; - if (total_bytes > config::pre_serialize_keys_limit_bytes) { // reach mem limit, don't serialize in batch size_t keys_size = key_columns.size(); for (size_t i = 0; i < num_rows; ++i) { - stored_keys[i] = - serialize_keys_to_pool_contiguous(i, keys_size, key_columns, Base::arena); + keys[i] = serialize_keys_to_pool_contiguous(i, keys_size, key_columns, arena); } } else { - auto* serialized_key_buffer = - reinterpret_cast(Base::arena.alloc(total_bytes)); + auto* serialized_key_buffer = reinterpret_cast(arena.alloc(total_bytes)); for (size_t i = 0; i < num_rows; ++i) { - stored_keys[i].data = + keys[i].data = reinterpret_cast(serialized_key_buffer + i * max_one_row_byte_size); - stored_keys[i].size = 0; + keys[i].size = 0; } for (const auto& column : key_columns) { - column->serialize_vec(stored_keys, num_rows, max_one_row_byte_size); + column->serialize_vec(keys, num_rows, max_one_row_byte_size); } } - Base::keys = stored_keys.data(); - Base::init_hash_values(num_rows, null_map); + Base::keys = keys.data(); + } + + void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = 
false, uint32_t bucket_size = 0) override { + init_serialized_keys_impl(key_columns, num_rows, is_build ? build_stored_keys : stored_keys, + is_build ? build_arena : Base::arena); + if (is_join) { + Base::init_join_bucket_num(num_rows, bucket_size, null_map); + } else { + Base::init_hash_values(num_rows, null_map); + } } void insert_keys_into_columns(std::vector& keys, MutableColumns& key_columns, @@ -222,7 +257,8 @@ struct MethodStringNoCache : public MethodBase { std::vector stored_keys; void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) override { + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = false, uint32_t bucket_size = 0) override { const IColumn& column = *key_columns[0]; const auto& column_string = assert_cast( column.is_nullable() @@ -237,7 +273,11 @@ struct MethodStringNoCache : public MethodBase { } Base::keys = stored_keys.data(); - Base::init_hash_values(num_rows, null_map); + if (is_join) { + Base::init_join_bucket_num(num_rows, bucket_size, null_map); + } else { + Base::init_hash_values(num_rows, null_map); + } } void insert_keys_into_columns(std::vector& keys, MutableColumns& key_columns, @@ -258,15 +298,20 @@ struct MethodOneNumber : public MethodBase { FieldType>; void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) override { + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = false, uint32_t bucket_size = 0) override { Base::keys = (FieldType*)(key_columns[0]->is_nullable() ? 
assert_cast(key_columns[0]) ->get_nested_column_ptr() - : key_columns[0]) - ->get_raw_data() - .data; + ->get_raw_data() + .data + : key_columns[0]->get_raw_data().data); std::string name = key_columns[0]->get_name(); - Base::init_hash_values(num_rows, null_map); + if (is_join) { + Base::init_join_bucket_num(num_rows, bucket_size, null_map); + } else { + Base::init_hash_values(num_rows, null_map); + } } void insert_keys_into_columns(std::vector& keys, @@ -292,17 +337,22 @@ struct MethodKeysFixed : public MethodBase { using State = ColumnsHashing::HashMethodKeysFixed; + // need keep until the hash probe end. use only in join + std::vector build_stored_keys; + // refresh each time probe hash table std::vector stored_keys; Sizes key_sizes; MethodKeysFixed(Sizes key_sizes_) : key_sizes(std::move(key_sizes_)) {} template - std::vector pack_fixeds(size_t row_numbers, const ColumnRawPtrs& key_columns, - const ColumnRawPtrs& nullmap_columns) { + void pack_fixeds(size_t row_numbers, const ColumnRawPtrs& key_columns, + const ColumnRawPtrs& nullmap_columns, std::vector& result) { size_t bitmap_size = get_bitmap_size(nullmap_columns.size()); + // set size to 0 at first, then use resize to call default constructor on index included from [0, row_numbers) to reset all memory + result.clear(); + result.resize(row_numbers); - std::vector result(row_numbers); size_t offset = 0; if (bitmap_size > 0) { for (size_t j = 0; j < nullmap_columns.size(); j++) { @@ -356,11 +406,11 @@ struct MethodKeysFixed : public MethodBase { } offset += key_sizes[j]; } - return result; } void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr) override { + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = false, uint32_t bucket_size = 0) override { ColumnRawPtrs actual_columns; ColumnRawPtrs null_maps; if (has_nullable_keys) { @@ -378,9 +428,20 @@ struct MethodKeysFixed : public MethodBase { } else { actual_columns = 
key_columns; } - stored_keys = pack_fixeds(num_rows, actual_columns, null_maps); - Base::keys = stored_keys.data(); - Base::init_hash_values(num_rows, null_map); + + if (is_build) { + pack_fixeds(num_rows, actual_columns, null_maps, build_stored_keys); + Base::keys = build_stored_keys.data(); + } else { + pack_fixeds(num_rows, actual_columns, null_maps, stored_keys); + Base::keys = stored_keys.data(); + } + + if (is_join) { + Base::init_join_bucket_num(num_rows, bucket_size, null_map); + } else { + Base::init_hash_values(num_rows, null_map); + } } void insert_keys_into_columns(std::vector& keys, @@ -488,14 +549,14 @@ struct MethodSingleNullableColumn : public SingleColumnMethod { #endif template -using SerializedHashTableContext = MethodSerialized>; +using SerializedHashTableContext = MethodSerialized>; template using PrimaryTypeHashTableContext = - MethodOneNumber>>; + MethodOneNumber>>; template using FixedKeyHashTableContext = - MethodKeysFixed>, has_null>; + MethodKeysFixed>, has_null>; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/common/hash_table/hash_table.h b/be/src/vec/common/hash_table/hash_table.h index 20c8c8e457..fcc682a49c 100644 --- a/be/src/vec/common/hash_table/hash_table.h +++ b/be/src/vec/common/hash_table/hash_table.h @@ -441,7 +441,6 @@ protected: Cell* buf = nullptr; /// A piece of memory for all elements except the element with zero key. 
Grower grower; int64_t _resize_timer_ns; - // the bucket count threshold above which it's converted to partioned hash table // > 0: enable convert dynamically // 0: convert is disabled diff --git a/be/src/vec/common/hash_table/hash_table_set_build.h b/be/src/vec/common/hash_table/hash_table_set_build.h index 34fb691f9e..152b20eeef 100644 --- a/be/src/vec/common/hash_table/hash_table_set_build.h +++ b/be/src/vec/common/hash_table/hash_table_set_build.h @@ -24,11 +24,9 @@ namespace doris::vectorized { template struct HashTableBuild { template - HashTableBuild(Parent* parent, int rows, ColumnRawPtrs& build_raw_ptrs, uint8_t offset, - RuntimeState* state) + HashTableBuild(Parent* parent, int rows, ColumnRawPtrs& build_raw_ptrs, RuntimeState* state) : _mem_used(parent->mem_used()), _rows(rows), - _offset(offset), _build_raw_ptrs(build_raw_ptrs), _state(state) {} @@ -48,9 +46,9 @@ struct HashTableBuild { size_t k = 0; auto creator = [&](const auto& ctor, auto& key, auto& origin) { HashTableContext::try_presis_key(key, origin, arena); - ctor(key, Mapped {k, _offset}); + ctor(key, Mapped {k}); }; - auto creator_for_null_key = [&](auto& mapped) { mapped = {k, _offset}; }; + auto creator_for_null_key = [&](auto& mapped) { mapped = {k}; }; for (; k < _rows; ++k) { if (k % CHECK_FRECUENCY == 0) { @@ -64,7 +62,6 @@ struct HashTableBuild { private: int64_t* _mem_used = nullptr; const int _rows; - const uint8_t _offset; ColumnRawPtrs& _build_raw_ptrs; RuntimeState* _state = nullptr; }; diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index 40222ad1f9..723dc3ac63 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -265,7 +265,7 @@ void Block::erase(const String& name) { ColumnWithTypeAndName& Block::safe_get_by_position(size_t position) { if (position >= data.size()) { throw Exception(ErrorCode::INTERNAL_ERROR, - "invalid input position, position={}, data.size{}, names={}", position, + "invalid input position, position={}, data.size={}, 
names={}", position, data.size(), dump_names()); } return data[position]; @@ -274,7 +274,7 @@ ColumnWithTypeAndName& Block::safe_get_by_position(size_t position) { const ColumnWithTypeAndName& Block::safe_get_by_position(size_t position) const { if (position >= data.size()) { throw Exception(ErrorCode::INTERNAL_ERROR, - "invalid input position, position={}, data.size{}, names={}", position, + "invalid input position, position={}, data.size={}, names={}", position, data.size(), dump_names()); } return data[position]; @@ -338,8 +338,9 @@ void Block::check_number_of_rows(bool allow_null_columns) const { } if (!elem.column) { - LOG(FATAL) << fmt::format( - "Column {} in block is nullptr, in method check_number_of_rows.", elem.name); + throw Exception(ErrorCode::INTERNAL_ERROR, + "Column {} in block is nullptr, in method check_number_of_rows.", + elem.name); } ssize_t size = elem.column->size(); @@ -347,8 +348,8 @@ void Block::check_number_of_rows(bool allow_null_columns) const { if (rows == -1) { rows = size; } else if (rows != size) { - LOG(FATAL) << fmt::format("Sizes of columns doesn't match: {}:{},{}:{}, col size: {}", - data.front().name, rows, elem.name, size, each_col_size()); + throw Exception(ErrorCode::INTERNAL_ERROR, "Sizes of columns doesn't match, block={}", + dump_structure()); } } } @@ -1088,7 +1089,7 @@ std::unique_ptr Block::create_same_struct_block(size_t size, bool is_rese if (is_reserve) { column->reserve(size); } else { - column->resize(size); + column->insert_many_defaults(size); } temp_block->insert({std::move(column), d.type, d.name}); } diff --git a/be/src/vec/exec/join/join_op.h b/be/src/vec/exec/join/join_op.h index 8b8efe7389..62569270d9 100644 --- a/be/src/vec/exec/join/join_op.h +++ b/be/src/vec/exec/join/join_op.h @@ -18,7 +18,6 @@ #pragma once #include "vec/common/arena.h" #include "vec/common/columns_hashing.h" -#include "vec/common/hash_table/hash_map.h" #include "vec/core/block.h" namespace doris::vectorized { @@ -45,19 +44,19 @@ 
namespace doris::vectorized { */ struct RowRef { uint32_t row_num = 0; - uint8_t block_offset; RowRef() = default; - RowRef(size_t row_num_count, uint8_t block_offset_) - : row_num(row_num_count), block_offset(block_offset_) {} + RowRef(size_t row_num_count) : row_num(row_num_count) {} + void clear() {}; }; struct RowRefWithFlag : public RowRef { bool visited; RowRefWithFlag() = default; - RowRefWithFlag(size_t row_num_count, uint8_t block_offset_, bool is_visited = false) - : RowRef(row_num_count, block_offset_), visited(is_visited) {} + RowRefWithFlag(size_t row_num_count, bool is_visited = false) + : RowRef(row_num_count), visited(is_visited) {} + void clear() {}; }; /// Portion of RowRefs, 16 * (MAX_SIZE + 1) bytes sized. @@ -93,14 +92,15 @@ public: ForwardIterator() : root(nullptr), first(false), batch(nullptr), position(0) {} ForwardIterator(RowRefListType* begin) - : root(begin), first(true), batch(root->next), position(0) {} + : root(begin), first(true), batch((&root->next)), position(0) {} RowRefType& operator*() { if (first) { return *root; } - return batch->row_refs[position]; + return batch->operator[](position); } + RowRefType* operator->() { return &(**this); } void operator++() { @@ -109,21 +109,17 @@ public: return; } - if (batch) { + if (batch && position < batch->size()) { ++position; - if (position >= batch->size) { - batch = batch->next; - position = 0; - } } } - bool ok() const { return first || batch; } + bool ok() const { return first || (batch && position < batch->size()); } private: RowRefListType* root = nullptr; bool first; - Batch* batch = nullptr; + std::vector* batch = nullptr; size_t position; }; @@ -131,76 +127,60 @@ struct RowRefList : RowRef { using RowRefType = RowRef; RowRefList() = default; - RowRefList(size_t row_num_, uint8_t block_offset_) : RowRef(row_num_, block_offset_) {} + RowRefList(size_t row_num_) : RowRef(row_num_) {} ForwardIterator begin() { return ForwardIterator(this); } /// insert element after current one - 
void insert(RowRefType&& row_ref, Arena& pool) { - if (!next) { - next = pool.alloc>(); - *next = Batch(nullptr); - } - next = next->insert(std::move(row_ref), pool); - } + void insert(RowRefType&& row_ref, Arena& pool) { next.emplace_back(std::move(row_ref)); } + + void clear() { next.clear(); } private: friend class ForwardIterator; - - Batch* next = nullptr; + std::vector next; }; struct RowRefListWithFlag : RowRef { using RowRefType = RowRef; RowRefListWithFlag() = default; - RowRefListWithFlag(size_t row_num_, uint8_t block_offset_) : RowRef(row_num_, block_offset_) {} + RowRefListWithFlag(size_t row_num_) : RowRef(row_num_) {} ForwardIterator const begin() { return ForwardIterator(this); } /// insert element after current one - void insert(RowRef&& row_ref, Arena& pool) { - if (!next) { - next = pool.alloc>(); - *next = Batch(nullptr); - } - next = next->insert(std::move(row_ref), pool); - } + void insert(RowRefType&& row_ref, Arena& pool) { next.emplace_back(row_ref); } + + void clear() { next.clear(); } bool visited = false; private: friend class ForwardIterator; - - Batch* next = nullptr; + std::vector next; }; struct RowRefListWithFlags : RowRefWithFlag { using RowRefType = RowRefWithFlag; RowRefListWithFlags() = default; - RowRefListWithFlags(size_t row_num_, uint8_t block_offset_) - : RowRefWithFlag(row_num_, block_offset_) {} + RowRefListWithFlags(size_t row_num_) : RowRefWithFlag(row_num_) {} ForwardIterator const begin() { return ForwardIterator(this); } /// insert element after current one - void insert(RowRefWithFlag&& row_ref, Arena& pool) { - if (!next) { - next = pool.alloc>(); - *next = Batch(nullptr); - } - next = next->insert(std::move(row_ref), pool); - } + void insert(RowRefType&& row_ref, Arena& pool) { next.emplace_back(row_ref); } + + void clear() { next.clear(); } private: friend class ForwardIterator; - - Batch* next = nullptr; + std::vector next; }; } // namespace doris::vectorized diff --git 
a/be/src/vec/exec/join/process_hash_table_probe.h b/be/src/vec/exec/join/process_hash_table_probe.h index df75952820..295317517d 100644 --- a/be/src/vec/exec/join/process_hash_table_probe.h +++ b/be/src/vec/exec/join/process_hash_table_probe.h @@ -68,23 +68,12 @@ struct ProcessHashTableProbe { // and output block may be different // The output result is determined by the other join conjunct result and same_to_prev struct Status do_other_join_conjuncts(Block* output_block, bool is_mark_join, - int multi_matched_output_row_count, bool is_the_last_sub_block); - - void _process_splited_equal_matched_tuples(int start_row_idx, int row_count, - const UInt8* __restrict other_hit_column, - UInt8* __restrict null_map_data, - UInt8* __restrict filter_map, Block* output_block); - - void _emplace_element(int8_t block_offset, int32_t block_row, int& current_offset); + std::vector& visited, bool has_null_in_build_side); template typename HashTableType::State _init_probe_side(HashTableType& hash_table_ctx, size_t probe_rows, bool with_other_join_conjuncts, - const uint8_t* null_map); - - template - ForwardIterator& _probe_row_match(int& current_offset, int& probe_index, - size_t& probe_size, bool& all_match_one); + const uint8_t* null_map, bool need_judge_null); // Process full outer join/ right join / right semi/anti join to output the join result // in hash table @@ -94,14 +83,14 @@ struct ProcessHashTableProbe { Parent* _parent = nullptr; const int _batch_size; - std::shared_ptr> _build_blocks; + const std::shared_ptr& _build_block; std::unique_ptr _arena; std::vector _probe_keys; std::vector _probe_indexs; - PaddedPODArray _build_block_offsets; - PaddedPODArray _build_block_rows; - std::vector> _build_blocks_locs; + bool _probe_visited = false; + std::vector _build_indexs; + std::vector _build_blocks_locs; // only need set the tuple is null in RIGHT_OUTER_JOIN and FULL_OUTER_JOIN ColumnUInt8::Container* _tuple_is_null_left_flags = nullptr; // only need set the tuple is null in 
LEFT_OUTER_JOIN and FULL_OUTER_JOIN @@ -112,13 +101,6 @@ struct ProcessHashTableProbe { std::unique_ptr _serialize_key_arena; std::vector _probe_side_find_result; - std::vector _visited_map; - std::vector _same_to_prev; - - int _right_col_idx; - int _right_col_len; - int _row_count_from_last_probe; - bool _have_other_join_conjunct; bool _is_right_semi_anti; std::vector* _left_output_slot_flags = nullptr; @@ -130,7 +112,9 @@ struct ProcessHashTableProbe { RuntimeProfile::Counter* _build_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; - static constexpr int PROBE_SIDE_EXPLODE_RATE = 1; + + int _right_col_idx; + int _right_col_len; }; } // namespace vectorized diff --git a/be/src/vec/exec/join/process_hash_table_probe_impl.h b/be/src/vec/exec/join/process_hash_table_probe_impl.h index 704e5dc2b5..38f8b3a558 100644 --- a/be/src/vec/exec/join/process_hash_table_probe_impl.h +++ b/be/src/vec/exec/join/process_hash_table_probe_impl.h @@ -32,7 +32,7 @@ template ProcessHashTableProbe::ProcessHashTableProbe(Parent* parent, int batch_size) : _parent(parent), _batch_size(batch_size), - _build_blocks(parent->build_blocks()), + _build_block(parent->build_block()), _tuple_is_null_left_flags(parent->is_outer_join() ? &(reinterpret_cast( *parent->_tuple_is_null_left_flag_column) @@ -52,7 +52,11 @@ ProcessHashTableProbe::ProcessHashTableProbe(Parent* parent, _search_hashtable_timer(parent->_search_hashtable_timer), _build_side_output_timer(parent->_build_side_output_timer), _probe_side_output_timer(parent->_probe_side_output_timer), - _probe_process_hashtable_timer(parent->_probe_process_hashtable_timer) {} + _probe_process_hashtable_timer(parent->_probe_process_hashtable_timer), + _right_col_idx((_is_right_semi_anti && !_have_other_join_conjunct) + ? 
0 + : _parent->left_table_data_types().size()), + _right_col_len(_parent->right_table_data_types().size()) {} template void ProcessHashTableProbe::build_side_output_column( @@ -68,52 +72,14 @@ void ProcessHashTableProbe::build_side_output_column( constexpr auto probe_all = JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN; - if (!is_semi_anti_join || have_other_join_conjunct) { - if (_build_blocks->size() == 1) { - for (int i = 0; i < _right_col_len; i++) { - auto& column = *(*_build_blocks)[0].get_by_position(i).column; - if (output_slot_flags[i]) { - mcol[i + _right_col_idx]->insert_indices_from(column, _build_block_rows.data(), - _build_block_rows.data() + size); - } else { - mcol[i + _right_col_idx]->insert_many_defaults(size); - } - } - } else { - for (int i = 0; i < _right_col_len; i++) { - if (output_slot_flags[i]) { - for (int j = 0; j < size; j++) { - if constexpr (probe_all) { - if (_build_block_offsets[j] == -1) { - DCHECK(mcol[i + _right_col_idx]->is_nullable()); - assert_cast(mcol[i + _right_col_idx].get()) - ->insert_default(); - } else { - auto& column = *(*_build_blocks)[_build_block_offsets[j]] - .get_by_position(i) - .column; - mcol[i + _right_col_idx]->insert_from(column, _build_block_rows[j]); - } - } else { - if (_build_block_offsets[j] == -1) { - // the only case to reach here: - // 1. left anti join with other conjuncts, and - // 2. 
equal conjuncts does not match - // since nullptr is emplaced back to visited_map, - // the output value of the build side does not matter, - // just insert default value - mcol[i + _right_col_idx]->insert_default(); - } else { - auto& column = *(*_build_blocks)[_build_block_offsets[j]] - .get_by_position(i) - .column; - mcol[i + _right_col_idx]->insert_from(column, _build_block_rows[j]); - } - } - } - } else { - mcol[i + _right_col_idx]->insert_many_defaults(size); - } + if ((!is_semi_anti_join || have_other_join_conjunct) && size) { + for (int i = 0; i < _right_col_len; i++) { + const auto& column = *_build_block->safe_get_by_position(i).column; + if (output_slot_flags[i]) { + mcol[i + _right_col_idx]->insert_indices_from_join(column, _build_indexs.data(), + _build_indexs.data() + size); + } else { + mcol[i + _right_col_idx]->insert_many_defaults(size); } } } @@ -123,7 +89,7 @@ void ProcessHashTableProbe::build_side_output_column( _tuple_is_null_right_flags->resize(size); auto* __restrict null_data = _tuple_is_null_right_flags->data(); for (int i = 0; i < size; ++i) { - null_data[i] = _build_block_rows[i] == -1; + null_data[i] = _build_indexs[i] == 0; } } } @@ -159,73 +125,22 @@ template template typename HashTableType::State ProcessHashTableProbe::_init_probe_side( HashTableType& hash_table_ctx, size_t probe_rows, bool with_other_join_conjuncts, - const uint8_t* null_map) { - _right_col_idx = _is_right_semi_anti && !with_other_join_conjuncts - ? 
0 - : _parent->left_table_data_types().size(); - _right_col_len = _parent->right_table_data_types().size(); - _row_count_from_last_probe = 0; - - _build_block_rows.clear(); - _build_block_offsets.clear(); - _probe_indexs.clear(); - if (with_other_join_conjuncts) { - // use in right join to change visited state after exec the vother join conjunct - _visited_map.clear(); - _same_to_prev.clear(); - _visited_map.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); - _same_to_prev.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); - } - _probe_indexs.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); - _build_block_rows.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); - _build_block_offsets.reserve(_batch_size * PROBE_SIDE_EXPLODE_RATE); + const uint8_t* null_map, bool need_judge_null) { + // may over batch size 1 for some outer join case + _probe_indexs.resize(_batch_size + 1); + _build_indexs.resize(_batch_size + 1); if (!_parent->_ready_probe) { _parent->_ready_probe = true; hash_table_ctx.reset(); - hash_table_ctx.init_serialized_keys(_parent->_probe_columns, probe_rows, null_map); + hash_table_ctx.init_serialized_keys(_parent->_probe_columns, probe_rows, null_map, true, + false, hash_table_ctx.hash_table->get_bucket_size()); + hash_table_ctx.hash_table->pre_build_idxs(hash_table_ctx.bucket_nums, + need_judge_null ? 
null_map : nullptr); } return typename HashTableType::State(_parent->_probe_columns); } -template -template -ForwardIterator& ProcessHashTableProbe::_probe_row_match( - int& current_offset, int& probe_index, size_t& probe_size, bool& all_match_one) { - auto& probe_row_match_iter = std::get>(_parent->_probe_row_match_iter); - if (!probe_row_match_iter.ok()) { - return probe_row_match_iter; - } - - SCOPED_TIMER(_search_hashtable_timer); - for (; probe_row_match_iter.ok() && current_offset < _batch_size; ++probe_row_match_iter) { - _emplace_element(probe_row_match_iter->block_offset, probe_row_match_iter->row_num, - current_offset); - _probe_indexs.emplace_back(probe_index); - if constexpr (with_other_join_conjuncts) { - _visited_map.emplace_back(&probe_row_match_iter->visited); - } - } - - _row_count_from_last_probe = current_offset; - all_match_one &= (current_offset == 1); - if (!probe_row_match_iter.ok()) { - ++probe_index; - } - probe_size = 1; - - return probe_row_match_iter; -} - -template -void ProcessHashTableProbe::_emplace_element(int8_t block_offset, - int32_t block_row, - int& current_offset) { - _build_block_offsets.emplace_back(block_offset); - _build_block_rows.emplace_back(block_row); - current_offset++; -} - template template @@ -234,48 +149,28 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash MutableBlock& mutable_block, Block* output_block, size_t probe_rows) { + if (_right_col_len && !_build_block) { + return Status::InternalError("build block is nullptr"); + } + auto& probe_index = _parent->_probe_index; + auto& build_index = _parent->_build_index; + auto last_probe_index = probe_index; - using KeyGetter = typename HashTableType::State; - using Mapped = typename HashTableType::Mapped; - - KeyGetter key_getter = - _init_probe_side(hash_table_ctx, probe_rows, with_other_conjuncts, - need_null_map_for_probe ? null_map->data() : nullptr); + _init_probe_side( + hash_table_ctx, probe_rows, with_other_conjuncts, + need_null_map_for_probe ? 
null_map->data() : nullptr, + need_null_map_for_probe && ignore_null && + (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN || + JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || is_mark_join)); auto& mcol = mutable_block.mutable_columns(); - constexpr auto is_right_semi_anti_join = - JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || JoinOpType == TJoinOp::RIGHT_SEMI_JOIN; - - constexpr auto probe_all = - JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN; - - int last_probe_index = probe_index; - int current_offset = 0; - bool all_match_one = true; + bool all_match_one = false; size_t probe_size = 0; - auto& probe_row_match_iter = _probe_row_match( - current_offset, probe_index, probe_size, all_match_one); - - // If not(which means it excceed batch size), probe_index is not increased and - // remaining matched rows for the current probe row will be - // handled in the next call of this function - int multi_matched_output_row_count = 0; - - // Is the last sub block of splitted block - bool is_the_last_sub_block = false; - - if (with_other_conjuncts && probe_size != 0) { - is_the_last_sub_block = !probe_row_match_iter.ok(); - _same_to_prev.emplace_back(false); - for (int i = 0; i < current_offset - 1; ++i) { - _same_to_prev.emplace_back(true); - } - } - std::unique_ptr mark_column; if (is_mark_join) { mark_column = std::make_unique(*mcol[mcol.size() - 1]); @@ -283,121 +178,17 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash { SCOPED_TIMER(_search_hashtable_timer); - using FindResult = KeyGetter::FindResult; - FindResult empty = {nullptr, false}; - while (current_offset < _batch_size && probe_index < probe_rows) { - if constexpr (ignore_null && need_null_map_for_probe) { - if ((*null_map)[probe_index]) { - if constexpr (probe_all) { - // only full outer / left outer need insert the data of right table - _emplace_element(-1, -1, current_offset); - 
_probe_indexs.emplace_back(probe_index); - - if constexpr (with_other_conjuncts) { - _same_to_prev.emplace_back(false); - _visited_map.emplace_back(nullptr); - } - } else { - all_match_one = false; - } - probe_index++; - continue; - } - } - - const auto& find_result = need_null_map_for_probe && (*null_map)[probe_index] - ? empty - : hash_table_ctx.find(key_getter, probe_index); - - auto current_probe_index = probe_index; - if constexpr (!with_other_conjuncts && - (JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::LEFT_SEMI_JOIN)) { - bool need_go_ahead = - (JoinOpType != TJoinOp::LEFT_SEMI_JOIN) ^ find_result.is_found(); - if constexpr (is_mark_join) { - ++current_offset; - bool null_result = (need_null_map_for_probe && (*null_map)[probe_index]) || - (!need_go_ahead && *_has_null_in_build_side); - if (null_result) { - mark_column->insert_null(); - } else { - mark_column->insert_value(need_go_ahead); - } - } else { - current_offset += need_go_ahead; - } - ++probe_index; - } else { - if (find_result.is_found()) { - auto& mapped = find_result.get_mapped(); - auto origin_offset = current_offset; - - // For mark join, if euqual-matched tuple count for one probe row - // excceeds batch size, it's difficult to implement the logic to - // split them into multiple sub blocks and handle them, keep the original - // logic for now. 
- if constexpr (is_mark_join && with_other_conjuncts) { - for (auto it = mapped.begin(); it.ok(); ++it) { - _emplace_element(it->block_offset, it->row_num, current_offset); - _visited_map.emplace_back(&it->visited); - } - ++probe_index; - } else if constexpr (with_other_conjuncts || !is_right_semi_anti_join) { - auto multi_match_last_offset = current_offset; - auto it = mapped.begin(); - for (; it.ok() && current_offset < _batch_size; ++it) { - _emplace_element(it->block_offset, it->row_num, current_offset); - - if constexpr (with_other_conjuncts) { - _visited_map.emplace_back(&it->visited); - } - } - probe_row_match_iter = it; - if (!it.ok()) { - // If all matched rows for the current probe row are handled, - // advance to next probe row. - // If not(which means it excceed batch size), probe_index is not increased and - // remaining matched rows for the current probe row will be - // handled in the next call of this function - ++probe_index; - } else if constexpr (with_other_conjuncts) { - multi_matched_output_row_count = - current_offset - multi_match_last_offset; - } - } else { - ++probe_index; - } - if constexpr (std::is_same_v) { - mapped.visited = true; - } - - if constexpr (with_other_conjuncts) { - _same_to_prev.emplace_back(false); - for (int i = 0; i < current_offset - origin_offset - 1; ++i) { - _same_to_prev.emplace_back(true); - } - } - } else if constexpr (probe_all || JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - (JoinOpType == TJoinOp::LEFT_SEMI_JOIN && is_mark_join)) { - // only full outer / left outer need insert the data of right table - _emplace_element(-1, -1, current_offset); - - if constexpr (with_other_conjuncts) { - _same_to_prev.emplace_back(false); - _visited_map.emplace_back(nullptr); - } - ++probe_index; - } else { - ++probe_index; - } - } - all_match_one &= (current_offset == _probe_indexs.size() + 1); - _probe_indexs.resize(current_offset, current_probe_index); - } - probe_size = 
probe_index - last_probe_index + probe_row_match_iter.ok(); + auto [new_probe_idx, new_build_idx, + new_current_offset] = hash_table_ctx.hash_table->template find_batch < JoinOpType, + with_other_conjuncts, is_mark_join, + need_null_map_for_probe && + ignore_null > (hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), + probe_index, build_index, probe_rows, _probe_indexs.data(), + _probe_visited, _build_indexs.data(), mark_column.get()); + probe_index = new_probe_idx; + build_index = new_build_idx; + current_offset = new_current_offset; + probe_size = probe_index - last_probe_index; } build_side_output_column(mcol, *_right_output_slot_flags, current_offset, with_other_conjuncts); @@ -412,8 +203,9 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash output_block->swap(mutable_block.to_block()); if constexpr (with_other_conjuncts) { - return do_other_join_conjuncts(output_block, is_mark_join, multi_matched_output_row_count, - is_the_last_sub_block); + return do_other_join_conjuncts(output_block, is_mark_join, + hash_table_ctx.hash_table->get_visited(), + hash_table_ctx.hash_table->has_null_key()); } return Status::OK(); @@ -421,8 +213,8 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash template Status ProcessHashTableProbe::do_other_join_conjuncts( - Block* output_block, bool is_mark_join, int multi_matched_output_row_count, - bool is_the_last_sub_block) { + Block* output_block, bool is_mark_join, std::vector& visited, + bool has_null_in_build_side) { // dispose the other join conjunct exec auto row_count = output_block->rows(); if (!row_count) { @@ -451,313 +243,90 @@ Status ProcessHashTableProbe::do_other_join_conjuncts( if constexpr (JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN) { - auto new_filter_column = ColumnVector::create(row_count); + auto new_filter_column = ColumnUInt8::create(row_count); auto* __restrict filter_map = new_filter_column->get_data().data(); - auto null_map_column = 
ColumnVector::create(row_count, 0); - auto* __restrict null_map_data = null_map_column->get_data().data(); - - // It contains non-first sub block of splited equal-conjuncts-matched tuples from last probe row - if (_row_count_from_last_probe > 0) { - _process_splited_equal_matched_tuples(0, _row_count_from_last_probe, filter_column_ptr, - null_map_data, filter_map, output_block); - // This is the last sub block of splitted block, and no equal-conjuncts-matched tuple - // is output in all sub blocks, need to output a tuple for this probe row - if (is_the_last_sub_block && !_parent->_is_any_probe_match_row_output) { - filter_map[0] = true; - null_map_data[0] = true; - } - } - int end_idx = row_count - multi_matched_output_row_count; // process equal-conjuncts-matched tuples that are newly generated // in this run if there are any. - bool has_no_match = true; /// If there was no any match in right table. - for (int i = _row_count_from_last_probe; i < end_idx; ++i) { - auto join_hit = _visited_map[i] != nullptr; - auto other_hit = filter_column_ptr[i]; - if (!_same_to_prev[i]) { - has_no_match = true; - } + for (int i = 0; i < row_count; ++i) { + bool join_hit = _build_indexs[i]; + bool other_hit = filter_column_ptr[i]; - if (!other_hit) { - for (size_t j = 0; j < _right_col_len; ++j) { - typeid_cast( - std::move(*output_block->get_by_position(j + _right_col_idx).column) - .assume_mutable() - .get()) - ->get_null_map_data()[i] = true; - } - } - null_map_data[i] = !join_hit || !other_hit; - - // For cases where one probe row matches multiple build rows for equal conjuncts, - // all the other-conjuncts-matched tuples should be output. - // - // Other-conjuncts-NOT-matched tuples fall into two categories: - // 1. The beginning consecutive one(s). - // For these tuples, only the last one is marked to output; - // If there are any following other-conjuncts-matched tuples, - // the last tuple is also marked NOT to output. - // 2. 
All the remaining other-conjuncts-NOT-matched tuples. - // All these tuples are marked not to output. - if (join_hit) { - *_visited_map[i] |= other_hit; - - /// Assuming that a row in the left table matches N rows in the right table after scanning the hash table, - /// which means `_same_to_prev[1]` ... `_same_to_prev[N - 1]` are all true. - /// If `other_hit` is true, it is outputted as a match. - /// However, if `other_hit` is false the current row needs to be outputted (`filter_map[i] = true`) - /// and the output of the previous row is cancelled(`filter_map[i - 1] = false`). - /// If a row in the left table matches at least one row in the right table (after filtering through other conjunctions, `has_no_match` is true), - /// the rows that do not satisfy the other conjunctions do not need to be output. - /** - * Assuming match 4 rows in right table: - * ________________________________ - * | row index | other conjuncts | - * +------------+-----------------| - * | 0 | 0 | - * +------------+-----------------| - * | 1 | 0 | - * +------------+-----------------| - * | 2 | 1 | - * +------------+-----------------| - * | 3 | 0 | - * -------------------------------- - * - * Scan row 0: `other_hit` is false, `!_same_to_prev[i]` is true - * set `filter_map[0]` = true - * Scan row 1: `other_hit` is false, !_same_to_prev[i] is false, has_no_match is true, filter_map[i - 1] is true - * set `filter_map[1] = true`, `filter_map[0] = false` - * Scan row 2: `other_hit` is true, !_same_to_prev[i] is false, has_no_match is true, filter_map[i - 1] is true - * set filter_map[2] = true, has_no_match = true - * `_same_to_prev[2] && filter_map[2] && !filter_column_ptr[2 - 1]` is true, - * so set `filter_map[2 - 1] = false` - * Scan row 3: `other_hit` is false, `!_same_to_prev[i]` is false, has_no_match is true - * set `filter_map[3]` = false - * - * After scanned the 4 rows, - * filter_map[0]: false - * filter_map[1]: false - * filter_map[2]: true - * filter_map[3]: false - */ - 
filter_map[i] = - other_hit || !_same_to_prev[i] || (has_no_match && filter_map[i - 1]); - has_no_match &= !other_hit; - // Here to keep only hit join conjunct and other join conjunt is true need to be output. - // if not, only some key must keep one row will output will null right table column - if (_same_to_prev[i] && filter_map[i] && !filter_column_ptr[i - 1]) { - filter_map[i - 1] = false; - } + if (!join_hit) { + filter_map[i] = _parent->_last_probe_match != _probe_indexs[i]; } else { - filter_map[i] = true; + filter_map[i] = other_hit; + } + if (filter_map[i]) { + _parent->_last_probe_match = _probe_indexs[i]; } - } - - // It contains the first sub block of splited equal-conjuncts-matched tuples of the current probe row - if (multi_matched_output_row_count > 0) { - _parent->_is_any_probe_match_row_output = false; - _process_splited_equal_matched_tuples(row_count - multi_matched_output_row_count, - multi_matched_output_row_count, filter_column_ptr, - null_map_data, filter_map, output_block); } for (size_t i = 0; i < row_count; ++i) { if (filter_map[i]) { - _tuple_is_null_right_flags->emplace_back(null_map_data[i]); + _tuple_is_null_right_flags->emplace_back(!_build_indexs[i] || + !filter_column_ptr[i]); + if constexpr (JoinOpType == TJoinOp::FULL_OUTER_JOIN) { + visited[_build_indexs[i]] = 1; + } } } output_block->get_by_position(result_column_id).column = std::move(new_filter_column); - } else if constexpr (JoinOpType == TJoinOp::LEFT_SEMI_JOIN) { - // TODO: resize in advance - auto new_filter_column = ColumnVector::create(); - auto& filter_map = new_filter_column->get_data(); + } else if constexpr (JoinOpType == TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::LEFT_SEMI_JOIN) { + auto new_filter_column = ColumnUInt8::create(row_count); + auto* __restrict filter_map = new_filter_column->get_data().data(); - size_t start_row_idx = 1; - // We are handling euqual-conjuncts matched tuples that are splitted 
into multiple blocks - if (_row_count_from_last_probe > 0) { - if (_parent->_is_any_probe_match_row_output) { - // if any matched tuple for this probe row is output, - // ignore all the following tuples for this probe row. - for (int row_idx = 0; row_idx < _row_count_from_last_probe; ++row_idx) { - filter_map.emplace_back(false); - } - start_row_idx += _row_count_from_last_probe; - if (_row_count_from_last_probe < row_count) { - filter_map.emplace_back(filter_column_ptr[_row_count_from_last_probe]); - } - } else { - filter_map.emplace_back(filter_column_ptr[0]); - } - } else { - filter_map.emplace_back(filter_column_ptr[0]); - } - for (size_t i = start_row_idx; i < row_count; ++i) { - if (filter_column_ptr[i] || (_same_to_prev[i] && filter_map[i - 1])) { - // Only last same element is true, output last one - filter_map.push_back(true); - filter_map[i - 1] = !_same_to_prev[i] && filter_map[i - 1]; - } else { - filter_map.push_back(false); - } - } - // It contains the first sub block of splited equal-conjuncts-matched tuples of the current probe row - if (multi_matched_output_row_count > 0) { - // If a matched row is output, all the equal-matched tuples in - // the following sub blocks should be ignored - _parent->_is_any_probe_match_row_output = filter_map[row_count - 1]; - } else if (_row_count_from_last_probe > 0 && !_parent->_is_any_probe_match_row_output) { - // We are handling euqual-conjuncts matched tuples that are splitted into multiple blocks, - // and no matched tuple has been output in all previous run. - // If a tuple is output in this run, all the following mathced tuples should be ignored - if (filter_map[_row_count_from_last_probe - 1]) { - _parent->_is_any_probe_match_row_output = true; - } - } - - /// FIXME: incorrect result of semi mark join with other conjuncts(null value missed). 
if (is_mark_join) { auto mark_column = output_block->get_by_position(orig_columns - 1).column->assume_mutable(); ColumnFilterHelper helper(*mark_column); - // For mark join, we only filter rows which have duplicate join keys. - // And then, we set matched_map to the join result to do the mark join's filtering. - for (size_t i = 1; i < row_count; ++i) { - if (!_same_to_prev[i]) { - helper.insert_value(filter_map[i - 1]); - filter_map[i - 1] = true; - } - } - helper.insert_value(filter_map[filter_map.size() - 1]); - filter_map[filter_map.size() - 1] = true; - } - - output_block->get_by_position(result_column_id).column = std::move(new_filter_column); - } else if constexpr (JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { - auto new_filter_column = ColumnVector::create(row_count); - auto* __restrict filter_map = new_filter_column->get_data().data(); - - // for left anti join, the probe side is output only when - // there are no matched tuples for the probe row. - - // If multiple equal-conjuncts-matched tuples is splitted into several - // sub blocks, just filter out all the other-conjuncts-NOT-matched tuples at first, - // and when processing the last sub block, check whether there are any - // equal-conjuncts-matched tuple is output in all sub blocks, - // if there are none, just pick a tuple and output. - - size_t start_row_idx = 1; - // We are handling euqual-conjuncts matched tuples that are splitted into multiple blocks - if (_row_count_from_last_probe > 0 && _parent->_is_any_probe_match_row_output) { - // if any matched tuple for this probe row is output, - // ignore all the following tuples for this probe row. 
- for (int row_idx = 0; row_idx < _row_count_from_last_probe; ++row_idx) { - filter_map[row_idx] = false; - } - start_row_idx += _row_count_from_last_probe; - if (_row_count_from_last_probe < row_count) { - filter_map[_row_count_from_last_probe] = - filter_column_ptr[_row_count_from_last_probe] && - _visited_map[_row_count_from_last_probe]; - } - } else { - // Both equal conjuncts and other conjuncts are true - filter_map[0] = filter_column_ptr[0] && _visited_map[0]; - } - - for (size_t i = start_row_idx; i < row_count; ++i) { - if ((_visited_map[i] && filter_column_ptr[i]) || - (_same_to_prev[i] && filter_map[i - 1])) { - // When either of two conditions is meet: - // 1. Both equal conjuncts and other conjuncts are true or same_to_prev - // 2. This row is joined from the same build side row as the previous row - // Set filter_map[i] to true and filter_map[i - 1] to false if same_to_prev[i] - // is true. + for (size_t i = 0; i < row_count; ++i) { filter_map[i] = true; - filter_map[i - 1] = !_same_to_prev[i] && filter_map[i - 1]; - } else { - filter_map[i] = false; - } - } - - if (is_mark_join) { - auto& matched_map = assert_cast&>( - *(output_block->get_by_position(orig_columns - 1) - .column->assume_mutable())) - .get_data(); - for (int i = 1; i < row_count; ++i) { - if (!_same_to_prev[i]) { - matched_map.push_back(!filter_map[i - 1]); - filter_map[i - 1] = true; + if (has_null_in_build_side && + (_build_indexs[i] != 0) ^ (JoinOpType == TJoinOp::LEFT_SEMI_JOIN)) { + helper.insert_null(); + } else { + helper.insert_value(filter_column_ptr[i]); } } - matched_map.push_back(!filter_map[row_count - 1]); - filter_map[row_count - 1] = true; } else { - int end_row_idx = 0; - if (_row_count_from_last_probe > 0) { - end_row_idx = row_count - multi_matched_output_row_count; - if (!_parent->_is_any_probe_match_row_output) { - // We are handling euqual-conjuncts matched tuples that are splitted into multiple blocks, - // and no matched tuple has been output in all previous 
run. - // If a tuple is output in this run, all the following mathced tuples should be ignored - if (filter_map[_row_count_from_last_probe - 1]) { - _parent->_is_any_probe_match_row_output = true; - filter_map[_row_count_from_last_probe - 1] = false; - } - if (is_the_last_sub_block && !_parent->_is_any_probe_match_row_output) { - // This is the last sub block of splitted block, and no equal-conjuncts-matched tuple - // is output in all sub blocks, output a tuple for this probe row - filter_map[0] = true; + if constexpr (JoinOpType == TJoinOp::LEFT_SEMI_JOIN) { + for (size_t i = 0; i < row_count; ++i) { + if (filter_column_ptr[i]) { + filter_map[i] = _parent->_last_probe_match != _probe_indexs[i]; + _parent->_last_probe_match = _probe_indexs[i]; + } else { + filter_map[i] = false; } } - if (multi_matched_output_row_count > 0) { - // It contains the first sub block of splited equal-conjuncts-matched tuples of the current probe row - // If a matched row is output, all the equal-matched tuples in - // the following sub blocks should be ignored - _parent->_is_any_probe_match_row_output = filter_map[row_count - 1]; - filter_map[row_count - 1] = false; - } - } else if (multi_matched_output_row_count > 0) { - end_row_idx = row_count - multi_matched_output_row_count; - // It contains the first sub block of splited equal-conjuncts-matched tuples of the current probe row - // If a matched row is output, all the equal-matched tuples in - // the following sub blocks should be ignored - _parent->_is_any_probe_match_row_output = filter_map[row_count - 1]; - filter_map[row_count - 1] = false; } else { - end_row_idx = row_count; - } - - // Same to the semi join, but change the last value to opposite value - for (int i = 1 + _row_count_from_last_probe; i < end_row_idx; ++i) { - if (!_same_to_prev[i]) { - filter_map[i - 1] = !filter_map[i - 1]; + for (size_t i = 0; i < row_count; ++i) { + if (_build_indexs[i]) { + filter_map[i] = false; + if (filter_column_ptr[i]) { + 
_parent->_last_probe_match = _probe_indexs[i]; + } + } else { + filter_map[i] = _parent->_last_probe_match != _probe_indexs[i]; + } } } - auto non_sub_blocks_matched_row_count = - row_count - _row_count_from_last_probe - multi_matched_output_row_count; - if (non_sub_blocks_matched_row_count > 0) { - filter_map[end_row_idx - 1] = !filter_map[end_row_idx - 1]; - } } output_block->get_by_position(result_column_id).column = std::move(new_filter_column); } else if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || JoinOpType == TJoinOp::RIGHT_ANTI_JOIN) { for (int i = 0; i < row_count; ++i) { - DCHECK(_visited_map[i]); - *_visited_map[i] |= filter_column_ptr[i]; + visited[_build_indexs[i]] |= filter_column_ptr[i]; } } else if constexpr (JoinOpType == TJoinOp::RIGHT_OUTER_JOIN) { auto filter_size = 0; for (int i = 0; i < row_count; ++i) { - DCHECK(_visited_map[i]); - auto result = filter_column_ptr[i]; - *_visited_map[i] |= result; - filter_size += result; + visited[_build_indexs[i]] |= filter_column_ptr[i]; + filter_size += filter_column_ptr[i]; } _tuple_is_null_left_flags->resize_fill(filter_size, 0); } @@ -771,205 +340,54 @@ Status ProcessHashTableProbe::do_other_join_conjuncts( JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { orig_columns = _right_col_idx; } - static_cast( - Block::filter_block(output_block, result_column_id, - is_mark_join ? output_block->columns() : orig_columns)); + RETURN_IF_ERROR(Block::filter_block(output_block, result_column_id, + is_mark_join ? output_block->columns() : orig_columns)); } return Status::OK(); } -// For left or full outer join with other conjuncts. -// If multiple equal-conjuncts-matched tuples is splitted into several -// sub blocks, just filter out all the other-conjuncts-NOT-matched tuples at first, -// and when processing the last sub block, check whether there are any -// equal-conjuncts-matched tuple is output in all sub blocks, -// if not, just pick a tuple and output. 
-template -void ProcessHashTableProbe::_process_splited_equal_matched_tuples( - int start_row_idx, int row_count, const UInt8* __restrict other_hit_column, - UInt8* __restrict null_map_data, UInt8* __restrict filter_map, Block* output_block) { - int end_row_idx = start_row_idx + row_count; - for (int i = start_row_idx; i < end_row_idx; ++i) { - auto join_hit = _visited_map[i] != nullptr; - auto other_hit = other_hit_column[i]; - - if (!other_hit) { - for (size_t j = 0; j < _right_col_len; ++j) { - typeid_cast( - std::move(*output_block->get_by_position(j + _right_col_idx).column) - .assume_mutable() - .get()) - ->get_null_map_data()[i] = true; - } - } - - null_map_data[i] = !join_hit || !other_hit; - filter_map[i] = other_hit; - - if (join_hit) { - *_visited_map[i] |= other_hit; - } - } - _parent->_is_any_probe_match_row_output |= - simd::contain_byte(filter_map + start_row_idx, row_count, 1); -} - template template Status ProcessHashTableProbe::process_data_in_hashtable( HashTableType& hash_table_ctx, MutableBlock& mutable_block, Block* output_block, bool* eos) { - using Mapped = typename HashTableType::Mapped; SCOPED_TIMER(_probe_process_hashtable_timer); - if constexpr (std::is_same_v || - std::is_same_v) { - hash_table_ctx.init_iterator(); - auto& mcol = mutable_block.mutable_columns(); + auto& mcol = mutable_block.mutable_columns(); + *eos = hash_table_ctx.hash_table->template iterate_map(_build_indexs); + auto block_size = _build_indexs.size(); - bool right_semi_anti_without_other = _is_right_semi_anti && !_have_other_join_conjunct; - int right_col_idx = - right_semi_anti_without_other ? 
0 : _parent->left_table_data_types().size(); - int right_col_len = _parent->right_table_data_types().size(); - - auto& iter = hash_table_ctx.iterator; - auto block_size = 0; - auto& visited_iter = - std::get>(_parent->_outer_join_pull_visited_iter); - _build_blocks_locs.resize(_batch_size); - auto register_build_loc = [&](int8_t offset, int32_t row_nums) { - _build_blocks_locs[block_size++] = std::pair(offset, row_nums); - }; - - if (visited_iter.ok()) { - if constexpr (std::is_same_v) { - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } else { - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { - if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { - if (visited_iter->visited) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } else { - if (!visited_iter->visited) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } - } - } - if (!visited_iter.ok()) { - ++iter; - } + if (block_size) { + if (mcol.size() < _right_col_len + _right_col_idx) { + return Status::InternalError( + "output block invalid, mcol.size()={}, _right_col_len={}, _right_col_idx={}", + mcol.size(), _right_col_len, _right_col_idx); } - - for (; iter != hash_table_ctx.hash_table->end() && block_size < _batch_size; ++iter) { - auto& mapped = iter->get_second(); - if constexpr (std::is_same_v) { - if (mapped.visited) { - if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { - visited_iter = mapped.begin(); - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - if (visited_iter.ok()) { - // block_size >= _batch_size, quit for loop - break; - } - } - } else { - if constexpr (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN) { - visited_iter = mapped.begin(); - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { 
- register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - if (visited_iter.ok()) { - // block_size >= _batch_size, quit for loop - break; - } - } - } - } else { - visited_iter = mapped.begin(); - for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) { - if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) { - if (visited_iter->visited) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } else { - if (!visited_iter->visited) { - register_build_loc(visited_iter->block_offset, visited_iter->row_num); - } - } - } - if (visited_iter.ok()) { - // block_size >= _batch_size, quit for loop - break; - } - } - } - _build_blocks_locs.resize(block_size); - - auto insert_build_rows = [&](int8_t offset) { - for (size_t j = 0; j < right_col_len; ++j) { - auto& column = *(*_build_blocks)[offset].get_by_position(j).column; - mcol[j + right_col_idx]->insert_indices_from( - column, _build_block_rows.data(), - _build_block_rows.data() + _build_block_rows.size()); - } - }; - if (_build_blocks->size() > 1) { - std::sort(_build_blocks_locs.begin(), _build_blocks_locs.end(), - [](const auto a, const auto b) { return a.first > b.first; }); - auto start = 0, end = 0; - while (start < _build_blocks_locs.size()) { - while (end < _build_blocks_locs.size() && - _build_blocks_locs[start].first == _build_blocks_locs[end].first) { - end++; - } - auto offset = _build_blocks_locs[start].first; - _build_block_rows.resize(end - start); - for (int i = 0; start + i < end; i++) { - _build_block_rows[i] = _build_blocks_locs[start + i].second; - } - start = end; - insert_build_rows(offset); - } - } else if (_build_blocks->size() == 1) { - const auto size = _build_blocks_locs.size(); - _build_block_rows.resize(_build_blocks_locs.size()); - for (int i = 0; i < size; i++) { - _build_block_rows[i] = _build_blocks_locs[i].second; - } - insert_build_rows(0); + for (size_t j = 0; j < _right_col_len; ++j) { + const auto& column = 
*_build_block->safe_get_by_position(j).column; + mcol[j + _right_col_idx]->insert_indices_from_join(column, _build_indexs.data(), + _build_indexs.data() + block_size); } // just resize the left table column in case with other conjunct to make block size is not zero if (_is_right_semi_anti && _have_other_join_conjunct) { - auto target_size = mcol[right_col_idx]->size(); - for (int i = 0; i < right_col_idx; ++i) { - mcol[i]->resize(target_size); + for (int i = 0; i < _right_col_idx; ++i) { + mcol[i]->resize(block_size); } } // right outer join / full join need insert data of left table if constexpr (JoinOpType == TJoinOp::RIGHT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN) { - for (int i = 0; i < right_col_idx; ++i) { + for (int i = 0; i < _right_col_idx; ++i) { assert_cast(mcol[i].get())->insert_many_defaults(block_size); } _tuple_is_null_left_flags->resize_fill(block_size, 1); } - *eos = iter == hash_table_ctx.hash_table->end(); - output_block->swap( - mutable_block.to_block(right_semi_anti_without_other ? 
right_col_idx : 0)); + output_block->swap(mutable_block.to_block(0)); DCHECK(block_size <= _batch_size); - return Status::OK(); - } else { - LOG(FATAL) << "Invalid RowRefList"; - return Status::InvalidArgument("Invalid RowRefList"); } + return Status::OK(); } template diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp index f422f7919b..aaa591c110 100644 --- a/be/src/vec/exec/join/vhash_join_node.cpp +++ b/be/src/vec/exec/join/vhash_join_node.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,7 @@ #include "vec/common/assert_cast.h" #include "vec/common/hash_table/hash_map.h" #include "vec/common/uint128.h" +#include "vec/core/block.h" #include "vec/core/column_with_type_and_name.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_nullable.h" @@ -71,6 +73,8 @@ namespace doris::vectorized { +constexpr uint32_t JOIN_BUILD_SIZE_LIMIT = std::numeric_limits::max(); + template Status HashJoinNode::_extract_join_column( Block&, COW::mutable_ptr>&, std::vector>&, @@ -88,19 +92,12 @@ HashJoinNode::HashJoinNode(ObjectPool* pool, const TPlanNode& tnode, const Descr _hash_output_slot_ids(tnode.hash_join_node.__isset.hash_output_slot_ids ? tnode.hash_join_node.hash_output_slot_ids : std::vector {}), - _build_block_idx(0), _build_side_mem_used(0), _build_side_last_mem_used(0) { _runtime_filter_descs = tnode.runtime_filters; _arena = std::make_shared(); _hash_table_variants = std::make_shared(); _process_hashtable_ctx_variants = std::make_unique(); - _build_blocks.reset(new std::vector()); - - // avoid vector expand change block address. - // one block can store 4g data, _build_blocks can store 128*4g data. - // if probe data bigger than 512g, runtime filter maybe will core dump when insert data. 
- _build_blocks->reserve(HASH_JOIN_MAX_BUILD_BLOCK_COUNT); } Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { @@ -228,10 +225,7 @@ Status HashJoinNode::prepare(RuntimeState* state) { ADD_CHILD_TIMER(record_profile, "BuildSideMergeBlockTime", "BuildTime"); _build_table_insert_timer = ADD_TIMER(record_profile, "BuildTableInsertTime"); _build_expr_call_timer = ADD_TIMER(record_profile, "BuildExprCallTime"); - _build_table_expanse_timer = ADD_TIMER(record_profile, "BuildTableExpanseTime"); - _build_table_convert_timer = ADD_TIMER(record_profile, "BuildTableConvertToPartitionedTime"); _build_side_compute_hash_timer = ADD_TIMER(record_profile, "BuildSideHashComputingTime"); - _build_runtime_filter_timer = ADD_TIMER(record_profile, "BuildRuntimeFilterTime"); // Probe phase auto probe_phase_profile = _probe_phase_profile; @@ -249,11 +243,6 @@ Status HashJoinNode::prepare(RuntimeState* state) { _allocate_resource_timer = ADD_TIMER(runtime_profile(), "AllocateResourceTime"); _process_other_join_conjunct_timer = ADD_TIMER(runtime_profile(), "OtherJoinConjunctTime"); - _build_buckets_counter = ADD_COUNTER(runtime_profile(), "BuildBuckets", TUnit::UNIT); - _build_buckets_fill_counter = ADD_COUNTER(runtime_profile(), "FilledBuckets", TUnit::UNIT); - - _build_collisions_counter = ADD_COUNTER(runtime_profile(), "BuildCollisions", TUnit::UNIT); - RETURN_IF_ERROR(VExpr::prepare(_build_expr_ctxs, state, child(1)->row_desc())); RETURN_IF_ERROR(VExpr::prepare(_probe_expr_ctxs, state, child(0)->row_desc())); @@ -308,7 +297,9 @@ bool HashJoinNode::need_more_input_data() const { void HashJoinNode::prepare_for_next() { _probe_index = 0; + _build_index = 0; _ready_probe = false; + _last_probe_match = -1; _prepare_probe_block(); } @@ -462,7 +453,7 @@ Status HashJoinNode::pull(doris::RuntimeState* state, vectorized::Block* output_ if (!st) { return st; } - RETURN_IF_ERROR(_filter_data_and_build_output(state, output_block, eos, &temp_block)); + 
RETURN_IF_ERROR(_filter_data_and_build_output(state, output_block, eos, &temp_block, false)); // Here make _join_block release the columns' ptr _join_block.set_columns(_join_block.clone_empty_columns()); mutable_join_block.clear(); @@ -725,52 +716,36 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc SCOPED_TIMER(_exec_timer); SCOPED_TIMER(_build_timer); - // make one block for each 4 gigabytes - constexpr static auto BUILD_BLOCK_MAX_SIZE = 4 * 1024UL * 1024UL * 1024UL; - if (_should_build_hash_table) { // If eos or have already met a null value using short-circuit strategy, we do not need to pull // data from probe side. _build_side_mem_used += in_block->allocated_bytes(); + if (_build_side_mutable_block.empty()) { + auto tmp_build_block = + VectorizedUtils::create_empty_columnswithtypename(child(1)->row_desc()); + _build_side_mutable_block = MutableBlock::build_mutable_block(&tmp_build_block); + RETURN_IF_ERROR(_build_side_mutable_block.merge( + *(tmp_build_block.create_same_struct_block(1, false)))); + } + if (in_block->rows() != 0) { SCOPED_TIMER(_build_side_merge_block_timer); RETURN_IF_ERROR(_build_side_mutable_block.merge(*in_block)); - } - - if (UNLIKELY(_build_side_mem_used - _build_side_last_mem_used > BUILD_BLOCK_MAX_SIZE)) { - if (_build_blocks->size() == HASH_JOIN_MAX_BUILD_BLOCK_COUNT) { - return Status::NotSupported(strings::Substitute( - "data size of right table in hash join > $0", - BUILD_BLOCK_MAX_SIZE * HASH_JOIN_MAX_BUILD_BLOCK_COUNT)); + if (_build_side_mutable_block.rows() > JOIN_BUILD_SIZE_LIMIT) { + return Status::NotSupported( + "Hash join do not support build table rows" + " over:" + + std::to_string(JOIN_BUILD_SIZE_LIMIT)); } - _build_blocks->emplace_back(_build_side_mutable_block.to_block()); - - COUNTER_UPDATE(_build_blocks_memory_usage, (*_build_blocks)[_build_block_idx].bytes()); - - // TODO:: Rethink may we should do the process after we receive all build blocks ? - // which is better. 
- RETURN_IF_ERROR(_process_build_block(state, (*_build_blocks)[_build_block_idx], - _build_block_idx)); - - _build_side_mutable_block = MutableBlock(); - ++_build_block_idx; - _build_side_last_mem_used = _build_side_mem_used; } } if (_should_build_hash_table && eos) { - if (!_build_side_mutable_block.empty()) { - if (_build_blocks->size() == HASH_JOIN_MAX_BUILD_BLOCK_COUNT) { - return Status::NotSupported(strings::Substitute( - "data size of right table in hash join > $0", - BUILD_BLOCK_MAX_SIZE * HASH_JOIN_MAX_BUILD_BLOCK_COUNT)); - } - _build_blocks->emplace_back(_build_side_mutable_block.to_block()); - COUNTER_UPDATE(_build_blocks_memory_usage, (*_build_blocks)[_build_block_idx].bytes()); - RETURN_IF_ERROR(_process_build_block(state, (*_build_blocks)[_build_block_idx], - _build_block_idx)); - } + DCHECK(!_build_side_mutable_block.empty()); + _build_block = std::make_shared(_build_side_mutable_block.to_block()); + COUNTER_UPDATE(_build_blocks_memory_usage, _build_block->bytes()); + RETURN_IF_ERROR(_process_build_block(state, *_build_block)); auto ret = std::visit(Overload {[&](std::monostate&) -> Status { LOG(FATAL) << "FATAL: uninited hash table"; __builtin_unreachable(); @@ -791,7 +766,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc _shared_hash_table_context->status = Status::OK(); // arena will be shared with other instances. 
_shared_hash_table_context->arena = _arena; - _shared_hash_table_context->blocks = _build_blocks; + _shared_hash_table_context->block = _build_block; _shared_hash_table_context->hash_table_variants = _hash_table_variants; _shared_hash_table_context->short_circuit_for_null_in_probe_side = _has_null_in_build_side; @@ -823,7 +798,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc *_hash_table_variants, *std::static_pointer_cast( _shared_hash_table_context->hash_table_variants)); - _build_blocks = _shared_hash_table_context->blocks; + _build_block = _shared_hash_table_context->block; if (!_shared_hash_table_context->runtime_filters.empty()) { auto ret = std::visit( @@ -839,7 +814,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc _build_expr_ctxs, _runtime_filter_descs); RETURN_IF_ERROR(_runtime_filter_slots->init( - state, arg.hash_table->size(), 0)); + state, arg.hash_table->size())); RETURN_IF_ERROR(_runtime_filter_slots->copy_from_shared_context( _shared_hash_table_context)); RETURN_IF_ERROR(_runtime_filter_slots->publish()); @@ -856,7 +831,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc // Since the comparison of null values is meaningless, null aware left anti join should not output null // when the build side is not empty. 
- if (!_build_blocks->empty() && _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + if (_build_block && _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { _probe_ignore_null = true; } _init_short_circuit_for_probe(); @@ -955,7 +930,7 @@ void HashJoinNode::_set_build_ignore_flag(Block& block, const std::vector& } } -Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block, uint8_t offset) { +Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block) { SCOPED_TIMER(_build_table_timer); size_t rows = block.rows(); if (UNLIKELY(rows == 0)) { @@ -970,6 +945,14 @@ Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block, uin RETURN_IF_ERROR(_do_evaluate(block, _build_expr_ctxs, *_build_expr_call_timer, res_col_ids)); if (_join_op == TJoinOp::LEFT_OUTER_JOIN || _join_op == TJoinOp::FULL_OUTER_JOIN) { _convert_block_to_null(block); + // first row is mocked + for (int i = 0; i < block.columns(); i++) { + auto [column, is_const] = unpack_if_const(block.safe_get_by_position(i).column); + assert_cast(column->assume_mutable().get()) + ->get_null_map_column() + .get_data() + .data()[0] = 1; + } } // TODO: Now we are not sure whether a column is nullable only by ExecNode's `row_desc` // so we have to initialize this flag by the first build block. 
@@ -986,28 +969,30 @@ Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block, uin Status st = _extract_join_column(block, null_map_val, raw_ptrs, res_col_ids); st = std::visit( - Overload { - [&](std::monostate& arg, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - return Status::OK(); - }, - [&](auto&& arg, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { - using HashTableCtxType = std::decay_t; - ProcessHashTableBuild - hash_table_build_process(rows, block, raw_ptrs, this, - state->batch_size(), offset, state); - return hash_table_build_process - .template run( - arg, - has_null_value || short_circuit_for_null_in_build_side - ? &null_map_val->get_data() - : nullptr, - &_has_null_in_build_side); - }}, - *_hash_table_variants, make_bool_variant(_build_side_ignore_null), + Overload {[&](std::monostate& arg, auto join_op, auto has_null_value, + auto short_circuit_for_null_in_build_side) -> Status { + LOG(FATAL) << "FATAL: uninited hash table"; + __builtin_unreachable(); + return Status::OK(); + }, + [&](auto&& arg, auto&& join_op, auto has_null_value, + auto short_circuit_for_null_in_build_side) -> Status { + using HashTableCtxType = std::decay_t; + using JoinOpType = std::decay_t; + + ProcessHashTableBuild + hash_table_build_process(rows, block, raw_ptrs, this, + state->batch_size(), state); + return hash_table_build_process + .template run( + arg, + has_null_value || short_circuit_for_null_in_build_side + ? 
&null_map_val->get_data() + : nullptr, + &_has_null_in_build_side); + }}, + *_hash_table_variants, _join_op_variants, make_bool_variant(_build_side_ignore_null), make_bool_variant(_short_circuit_for_null_in_build_side)); return st; @@ -1076,7 +1061,7 @@ void HashJoinNode::_hash_table_init(RuntimeState* state) { return; } - if (!try_get_hash_map_context_fixed( + if (!try_get_hash_map_context_fixed( *_hash_table_variants, _build_expr_ctxs)) { _hash_table_variants->emplace>(); } @@ -1084,16 +1069,6 @@ void HashJoinNode::_hash_table_init(RuntimeState* state) { _join_op_variants, make_bool_variant(_have_other_join_conjunct)); DCHECK(!std::holds_alternative(*_hash_table_variants)); - - std::visit(Overload {[&](std::monostate& arg) { - LOG(FATAL) << "FATAL: uninited hash table"; - __builtin_unreachable(); - }, - [&](auto&& arg) { - arg.hash_table->set_partitioned_threshold( - state->partitioned_hash_join_rows_threshold()); - }}, - *_hash_table_variants); } void HashJoinNode::_process_hashtable_ctx_variants_init(RuntimeState* state) { diff --git a/be/src/vec/exec/join/vhash_join_node.h b/be/src/vec/exec/join/vhash_join_node.h index 53cb247d0a..7913c49b0c 100644 --- a/be/src/vec/exec/join/vhash_join_node.h +++ b/be/src/vec/exec/join/vhash_join_node.h @@ -83,17 +83,17 @@ struct ProcessRuntimeFilterBuild { parent->_runtime_filter_slots = std::make_shared( parent->_build_expr_ctxs, parent->runtime_filter_descs()); - RETURN_IF_ERROR(parent->_runtime_filter_slots->init( - state, hash_table_ctx.hash_table->size(), parent->_build_rf_cardinality)); + RETURN_IF_ERROR( + parent->_runtime_filter_slots->init(state, hash_table_ctx.hash_table->size())); - if (!parent->_runtime_filter_slots->empty() && !parent->_inserted_rows.empty()) { + if (!parent->_runtime_filter_slots->empty() && !parent->_inserted_blocks.empty()) { { - SCOPED_TIMER(parent->_push_compute_timer); - parent->_runtime_filter_slots->insert(parent->_inserted_rows); + SCOPED_TIMER(parent->_runtime_filter_compute_timer); + 
parent->_runtime_filter_slots->insert(parent->_inserted_blocks); } } { - SCOPED_TIMER(parent->_push_down_timer); + SCOPED_TIMER(parent->_publish_runtime_filter_timer); RETURN_IF_ERROR(parent->_runtime_filter_slots->publish()); } @@ -106,143 +106,56 @@ using ProfileCounter = RuntimeProfile::Counter; template struct ProcessHashTableBuild { ProcessHashTableBuild(int rows, Block& acquired_block, ColumnRawPtrs& build_raw_ptrs, - Parent* parent, int batch_size, uint8_t offset, RuntimeState* state) + Parent* parent, int batch_size, RuntimeState* state) : _rows(rows), - _skip_rows(0), _acquired_block(acquired_block), _build_raw_ptrs(build_raw_ptrs), _parent(parent), _batch_size(batch_size), - _offset(offset), - _state(state), - _build_side_compute_hash_timer(parent->_build_side_compute_hash_timer) {} + _state(state) {} - template + template Status run(HashTableContext& hash_table_ctx, ConstNullMapPtr null_map, bool* has_null_key) { - using KeyGetter = typename HashTableContext::State; - using Mapped = typename HashTableContext::Mapped; - - Defer defer {[&]() { - int64_t bucket_size = hash_table_ctx.hash_table->get_buffer_size_in_cells(); - int64_t filled_bucket_size = hash_table_ctx.hash_table->size(); - int64_t bucket_bytes = hash_table_ctx.hash_table->get_buffer_size_in_bytes(); - COUNTER_SET(_parent->_hash_table_memory_usage, bucket_bytes); - COUNTER_SET(_parent->_build_buckets_counter, bucket_size); - COUNTER_SET(_parent->_build_collisions_counter, - hash_table_ctx.hash_table->get_collisions()); - COUNTER_SET(_parent->_build_buckets_fill_counter, filled_bucket_size); - - auto hash_table_buckets = hash_table_ctx.hash_table->get_buffer_sizes_in_cells(); - std::string hash_table_buckets_info; - for (auto bucket_count : hash_table_buckets) { - hash_table_buckets_info += std::to_string(bucket_count) + ", "; + if (short_circuit_for_null || ignore_null) { + // first row is mocked and is null + for (uint32_t i = 1; i < _rows; i++) { + if ((*null_map)[i]) { + *has_null_key = 
true; + } } - _parent->add_hash_buckets_info(hash_table_buckets_info); - - auto hash_table_sizes = hash_table_ctx.hash_table->sizes(); - hash_table_buckets_info.clear(); - for (auto table_size : hash_table_sizes) { - hash_table_buckets_info += std::to_string(table_size) + ", "; + if (short_circuit_for_null && *has_null_key) { + return Status::OK(); } - _parent->add_hash_buckets_filled_info(hash_table_buckets_info); - }}; + } - KeyGetter key_getter(_build_raw_ptrs); + if (!_parent->runtime_filter_descs().empty()) { + _parent->_inserted_blocks.insert(&_acquired_block); + } SCOPED_TIMER(_parent->_build_table_insert_timer); - hash_table_ctx.hash_table->reset_resize_timer(); - - // only not build_unique, we need expanse hash table before insert data - // 1. There are fewer duplicate keys, reducing the number of resize hash tables - // can improve performance to a certain extent, about 2%-5% - // 2. There are many duplicate keys, and the hash table filled bucket is far less than - // the hash table build bucket, which may waste a lot of memory. - // TODO, use the NDV expansion of the key column in the optimizer statistics - if (!_parent->build_unique()) { - RETURN_IF_CATCH_EXCEPTION(hash_table_ctx.hash_table->expanse_for_add_elem( - std::min(_rows, config::hash_table_pre_expanse_max_rows))); - } - - vector& inserted_rows = _parent->_inserted_rows[&_acquired_block]; - bool has_runtime_filter = !_parent->runtime_filter_descs().empty(); - if (has_runtime_filter) { - inserted_rows.reserve(_batch_size); - } + hash_table_ctx.hash_table->template prepare_build(_rows, _batch_size, + *has_null_key); hash_table_ctx.init_serialized_keys(_build_raw_ptrs, _rows, - null_map ? 
null_map->data() : nullptr); - - auto& arena = *_parent->arena(); - auto old_build_arena_memory = arena.size(); - - size_t k = 0; - bool inserted = false; - auto creator = [&](const auto& ctor, auto& key, auto& origin) { - HashTableContext::try_presis_key(key, origin, arena); - inserted = true; - ctor(key, Mapped {k, _offset}); - }; - - bool build_unique = _parent->build_unique(); -#define EMPLACE_IMPL(stmt) \ - for (; k < _rows; ++k) { \ - if (k % CHECK_FRECUENCY == 0) { \ - RETURN_IF_CANCELLED(_state); \ - } \ - if constexpr (short_circuit_for_null) { \ - if ((*null_map)[k]) { \ - *has_null_key = true; \ - return Status::OK(); \ - } \ - } else if constexpr (ignore_null) { \ - if ((*null_map)[k]) { \ - *has_null_key = true; \ - continue; \ - } \ - } \ - inserted = false; \ - [[maybe_unused]] auto& mapped = \ - hash_table_ctx.lazy_emplace(key_getter, k, creator, nullptr); \ - stmt; \ - } - - if (has_runtime_filter && build_unique) { - EMPLACE_IMPL( - if (inserted) { inserted_rows.push_back(k); } else { _skip_rows++; }); - } else if (has_runtime_filter && !build_unique) { - EMPLACE_IMPL( - if (inserted) { inserted_rows.push_back(k); } else { - mapped.insert({k, _offset}, *_parent->arena()); - inserted_rows.push_back(k); - }); - } else if (!has_runtime_filter && build_unique) { - EMPLACE_IMPL(if (!inserted) { _skip_rows++; }); - } else { - EMPLACE_IMPL(if (!inserted) { mapped.insert({k, _offset}, *_parent->arena()); }); - } - _parent->_build_rf_cardinality += inserted_rows.size(); - - _parent->_build_arena_memory_usage->add(arena.size() - old_build_arena_memory); - - COUNTER_UPDATE(_parent->_build_table_expanse_timer, - hash_table_ctx.hash_table->get_resize_timer_value()); - COUNTER_UPDATE(_parent->_build_table_convert_timer, - hash_table_ctx.hash_table->get_convert_timer_value()); + null_map ? 
null_map->data() : nullptr, true, true, + hash_table_ctx.hash_table->get_bucket_size()); + hash_table_ctx.hash_table->build(hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), + _rows); + hash_table_ctx.bucket_nums.resize(_batch_size); + hash_table_ctx.bucket_nums.shrink_to_fit(); + COUNTER_UPDATE(_parent->_hash_table_memory_usage, + hash_table_ctx.hash_table->get_byte_size()); return Status::OK(); } private: - const int _rows; - int _skip_rows; + const uint32_t _rows; Block& _acquired_block; ColumnRawPtrs& _build_raw_ptrs; Parent* _parent = nullptr; int _batch_size; - uint8_t _offset; RuntimeState* _state = nullptr; - - ProfileCounter* _build_side_compute_hash_timer = nullptr; }; template @@ -325,8 +238,6 @@ using HashTableIteratorVariants = std::variant, ForwardIterator, ForwardIterator>; -static constexpr auto HASH_JOIN_MAX_BUILD_BLOCK_COUNT = 128; - class HashJoinNode final : public VJoinNodeBase { public: HashJoinNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); @@ -369,7 +280,7 @@ public: bool have_other_join_conjunct() const { return _have_other_join_conjunct; } bool is_right_semi_anti() const { return _is_right_semi_anti; } bool is_outer_join() const { return _is_outer_join; } - std::shared_ptr> build_blocks() const { return _build_blocks; } + const std::shared_ptr& build_block() const { return _build_block; } std::vector* left_output_slot_flags() { return &_left_output_slot_flags; } std::vector* right_output_slot_flags() { return &_right_output_slot_flags; } bool* has_null_in_build_side() { return &_has_null_in_build_side; } @@ -387,19 +298,20 @@ private: friend struct ProcessHashTableProbe; void _init_short_circuit_for_probe() { + bool empty_block = !_build_block; _short_circuit_for_probe = (_has_null_in_build_side && _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && !_is_mark_join) || - (_build_blocks->empty() && _join_op == TJoinOp::INNER_JOIN && !_is_mark_join) || - (_build_blocks->empty() && _join_op == 
TJoinOp::LEFT_SEMI_JOIN && !_is_mark_join) || - (_build_blocks->empty() && _join_op == TJoinOp::RIGHT_OUTER_JOIN) || - (_build_blocks->empty() && _join_op == TJoinOp::RIGHT_SEMI_JOIN) || - (_build_blocks->empty() && _join_op == TJoinOp::RIGHT_ANTI_JOIN); + (empty_block && _join_op == TJoinOp::INNER_JOIN && !_is_mark_join) || + (empty_block && _join_op == TJoinOp::LEFT_SEMI_JOIN && !_is_mark_join) || + (empty_block && _join_op == TJoinOp::RIGHT_OUTER_JOIN) || + (empty_block && _join_op == TJoinOp::RIGHT_SEMI_JOIN) || + (empty_block && _join_op == TJoinOp::RIGHT_ANTI_JOIN); //when build table rows is 0 and not have other_join_conjunct and not _is_mark_join and join type is one of LEFT_OUTER_JOIN/FULL_OUTER_JOIN/LEFT_ANTI_JOIN //we could get the result is probe table + null-column(if need output) _empty_right_table_need_probe_dispose = - (_build_blocks->empty() && !_have_other_join_conjunct && !_is_mark_join) && + (empty_block && !_have_other_join_conjunct && !_is_mark_join) && (_join_op == TJoinOp::LEFT_OUTER_JOIN || _join_op == TJoinOp::FULL_OUTER_JOIN || _join_op == TJoinOp::LEFT_ANTI_JOIN); } @@ -430,21 +342,14 @@ private: RuntimeProfile::Counter* _build_table_timer = nullptr; RuntimeProfile::Counter* _build_expr_call_timer = nullptr; RuntimeProfile::Counter* _build_table_insert_timer = nullptr; - RuntimeProfile::Counter* _build_table_expanse_timer = nullptr; - RuntimeProfile::Counter* _build_table_convert_timer = nullptr; RuntimeProfile::Counter* _probe_expr_call_timer = nullptr; RuntimeProfile::Counter* _probe_next_timer = nullptr; - RuntimeProfile::Counter* _build_buckets_counter = nullptr; - RuntimeProfile::Counter* _build_buckets_fill_counter = nullptr; RuntimeProfile::Counter* _search_hashtable_timer = nullptr; RuntimeProfile::Counter* _build_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_side_output_timer = nullptr; RuntimeProfile::Counter* _probe_process_hashtable_timer = nullptr; RuntimeProfile::Counter* _build_side_compute_hash_timer = 
nullptr; RuntimeProfile::Counter* _build_side_merge_block_timer = nullptr; - RuntimeProfile::Counter* _build_runtime_filter_timer = nullptr; - - RuntimeProfile::Counter* _build_collisions_counter = nullptr; RuntimeProfile::Counter* _open_timer = nullptr; RuntimeProfile::Counter* _allocate_resource_timer = nullptr; @@ -467,7 +372,7 @@ private: HashTableIteratorVariants _outer_join_pull_visited_iter; HashTableIteratorVariants _probe_row_match_iter; - std::shared_ptr> _build_blocks; + std::shared_ptr _build_block; Block _probe_block; ColumnRawPtrs _probe_columns; ColumnUInt8::MutablePtr _null_map_column; @@ -476,8 +381,10 @@ private: bool _has_set_need_null_map_for_build = false; bool _probe_ignore_null = false; int _probe_index = -1; + uint32_t _build_index = 0; bool _ready_probe = false; bool _probe_eos = false; + int _last_probe_match; bool _build_side_ignore_null = false; @@ -490,9 +397,6 @@ private: std::vector _left_output_slot_flags; std::vector _right_output_slot_flags; - // for cases when a probe row matches more than batch size build rows. 
- bool _is_any_probe_match_row_output = false; - uint8_t _build_block_idx = 0; int64_t _build_side_mem_used = 0; int64_t _build_side_last_mem_used = 0; MutableBlock _build_side_mutable_block; @@ -501,7 +405,7 @@ private: Status _materialize_build_side(RuntimeState* state) override; - Status _process_build_block(RuntimeState* state, Block& block, uint8_t offset); + Status _process_build_block(RuntimeState* state, Block& block); Status _do_evaluate(Block& block, VExprContextSPtrs& exprs, RuntimeProfile::Counter& expr_call_timer, std::vector& res_col_ids); @@ -539,10 +443,9 @@ private: friend struct ProcessRuntimeFilterBuild; std::vector _runtime_filter_descs; - std::unordered_map> _inserted_rows; + std::unordered_set _inserted_blocks; std::vector _runtime_filters; - size_t _build_rf_cardinality = 0; std::atomic_bool _probe_open_finish = false; }; } // namespace vectorized diff --git a/be/src/vec/exec/join/vjoin_node_base.cpp b/be/src/vec/exec/join/vjoin_node_base.cpp index 2401993563..0077fe2a7b 100644 --- a/be/src/vec/exec/join/vjoin_node_base.cpp +++ b/be/src/vec/exec/join/vjoin_node_base.cpp @@ -120,8 +120,8 @@ Status VJoinNodeBase::prepare(RuntimeState* state) { ADD_CHILD_TIMER(_probe_phase_profile, "BuildOutputBlock", "ProbeTime"); _probe_rows_counter = ADD_COUNTER_WITH_LEVEL(_probe_phase_profile, "ProbeRows", TUnit::UNIT, 1); - _push_down_timer = ADD_TIMER(runtime_profile(), "PublishRuntimeFilterTime"); - _push_compute_timer = ADD_TIMER(runtime_profile(), "PushDownComputeTime"); + _publish_runtime_filter_timer = ADD_TIMER(runtime_profile(), "PublishRuntimeFilterTime"); + _runtime_filter_compute_timer = ADD_TIMER(runtime_profile(), "RuntimeFilterComputeTime"); return Status::OK(); } diff --git a/be/src/vec/exec/join/vjoin_node_base.h b/be/src/vec/exec/join/vjoin_node_base.h index 8d26db22ea..a44bc5513a 100644 --- a/be/src/vec/exec/join/vjoin_node_base.h +++ b/be/src/vec/exec/join/vjoin_node_base.h @@ -142,8 +142,8 @@ protected: RuntimeProfile* 
_probe_phase_profile = nullptr; RuntimeProfile::Counter* _probe_timer = nullptr; RuntimeProfile::Counter* _probe_rows_counter = nullptr; - RuntimeProfile::Counter* _push_down_timer = nullptr; - RuntimeProfile::Counter* _push_compute_timer = nullptr; + RuntimeProfile::Counter* _publish_runtime_filter_timer = nullptr; + RuntimeProfile::Counter* _runtime_filter_compute_timer = nullptr; RuntimeProfile::Counter* _join_filter_timer = nullptr; RuntimeProfile::Counter* _build_output_block_timer = nullptr; }; diff --git a/be/src/vec/exec/join/vnested_loop_join_node.cpp b/be/src/vec/exec/join/vnested_loop_join_node.cpp index 04d0b6a2c3..a5305a4b53 100644 --- a/be/src/vec/exec/join/vnested_loop_join_node.cpp +++ b/be/src/vec/exec/join/vnested_loop_join_node.cpp @@ -74,13 +74,13 @@ Status RuntimeFilterBuild::operator()(RuntimeState* state) { RETURN_IF_ERROR(runtime_filter_slots.init(state)); if (!runtime_filter_slots.empty() && !_parent->build_blocks().empty()) { - SCOPED_TIMER(_parent->push_compute_timer()); + SCOPED_TIMER(_parent->runtime_filter_compute_timer()); for (auto& build_block : _parent->build_blocks()) { RETURN_IF_ERROR(runtime_filter_slots.insert(&build_block)); } } { - SCOPED_TIMER(_parent->push_down_timer()); + SCOPED_TIMER(_parent->publish_runtime_filter_timer()); RETURN_IF_ERROR(runtime_filter_slots.publish()); } diff --git a/be/src/vec/exec/join/vnested_loop_join_node.h b/be/src/vec/exec/join/vnested_loop_join_node.h index 7dd63b498f..b309485db5 100644 --- a/be/src/vec/exec/join/vnested_loop_join_node.h +++ b/be/src/vec/exec/join/vnested_loop_join_node.h @@ -99,9 +99,13 @@ public: std::vector& runtime_filter_descs() { return _runtime_filter_descs; } VExprContextSPtrs& filter_src_expr_ctxs() { return _filter_src_expr_ctxs; } - RuntimeProfile::Counter* push_compute_timer() { return _push_compute_timer; } + RuntimeProfile::Counter* runtime_filter_compute_timer() { + return _runtime_filter_compute_timer; + } Blocks& build_blocks() { return _build_blocks; } - 
RuntimeProfile::Counter* push_down_timer() { return _push_down_timer; } + RuntimeProfile::Counter* publish_runtime_filter_timer() { + return _publish_runtime_filter_timer; + } private: template diff --git a/be/src/vec/exec/vset_operation_node.cpp b/be/src/vec/exec/vset_operation_node.cpp index 7cc025b607..28dfd23ec7 100644 --- a/be/src/vec/exec/vset_operation_node.cpp +++ b/be/src/vec/exec/vset_operation_node.cpp @@ -58,7 +58,6 @@ VSetOperationNode::VSetOperationNode(ObjectPool* pool, const TPlan : ExecNode(pool, tnode, descs), _valid_element_in_hash_tbl(0), _mem_used(0), - _build_block_index(0), _build_finished(false) { _hash_table_variants = std::make_unique(); } @@ -219,7 +218,7 @@ void VSetOperationNode::hash_table_init() { } return; } - if (!try_get_hash_map_context_fixed( + if (!try_get_hash_map_context_fixed( *_hash_table_variants, _child_expr_lists[0])) { _hash_table_variants->emplace>(); } @@ -228,36 +227,46 @@ void VSetOperationNode::hash_table_init() { template Status VSetOperationNode::sink(RuntimeState* state, Block* block, bool eos) { SCOPED_TIMER(_exec_timer); - constexpr static auto BUILD_BLOCK_MAX_SIZE = 4 * 1024UL * 1024UL * 1024UL; if (block->rows() != 0) { _mem_used += block->allocated_bytes(); RETURN_IF_ERROR(_mutable_block.merge(*block)); } - if (eos || _mutable_block.allocated_bytes() >= BUILD_BLOCK_MAX_SIZE) { - _build_blocks.emplace_back(_mutable_block.to_block()); - RETURN_IF_ERROR( - process_build_block(_build_blocks[_build_block_index], _build_block_index, state)); - _mutable_block.clear(); - ++_build_block_index; - - if (eos) { - if constexpr (is_intersect) { - _valid_element_in_hash_tbl = 0; - } else { - std::visit( - [&](auto&& arg) { - using HashTableCtxType = std::decay_t; - if constexpr (!std::is_same_v) { - _valid_element_in_hash_tbl = arg.hash_table->size(); - } - }, - *_hash_table_variants); - } - _build_finished = true; - _can_read = _children.size() == 1; + if (block->rows() != 0) { + if (_build_block.empty()) { + 
RETURN_IF_ERROR(_mutable_block.merge(*(block->create_same_struct_block(0, false)))); } + RETURN_IF_ERROR(_mutable_block.merge(*block)); + if (_mutable_block.rows() > std::numeric_limits::max()) { + return Status::NotSupported( + "Hash join does not support build table rows" + " over:" + + std::to_string(std::numeric_limits::max())); + } + } + + if (eos) { + if (!_mutable_block.empty()) { + _build_block = _mutable_block.to_block(); + } + RETURN_IF_ERROR(process_build_block(_build_block, state)); + _mutable_block.clear(); + + if constexpr (is_intersect) { + _valid_element_in_hash_tbl = 0; + } else { + std::visit( + [&](auto&& arg) { + using HashTableCtxType = std::decay_t; + if constexpr (!std::is_same_v) { + _valid_element_in_hash_tbl = arg.hash_table->size(); + } + }, + *_hash_table_variants); + } + _build_finished = true; + _can_read = _children.size() == 1; } return Status::OK(); } @@ -310,8 +319,7 @@ Status VSetOperationNode::hash_table_build(RuntimeState* state) { } template -Status VSetOperationNode::process_build_block(Block& block, uint8_t offset, - RuntimeState* state) { +Status VSetOperationNode::process_build_block(Block& block, RuntimeState* state) { size_t rows = block.rows(); if (rows == 0) { return Status::OK(); } @@ -326,7 +334,7 @@ Status VSetOperationNode::process_build_block(Block& block, uint8_ using HashTableCtxType = std::decay_t; if constexpr (!std::is_same_v) { HashTableBuild hash_table_build_process( - this, rows, raw_ptrs, offset, state); + this, rows, raw_ptrs, state); st = hash_table_build_process(arg, _arena); } else { LOG(FATAL) << "FATAL: uninited hash table"; @@ -342,8 +350,8 @@ void VSetOperationNode::add_result_columns(RowRefListWithFlags& va int& block_size) { auto it = value.begin(); for (auto idx = _build_col_idx.begin(); idx != _build_col_idx.end(); ++idx) { - auto& column = *_build_blocks[it->block_offset].get_by_position(idx->first).column; - if (_mutable_cols[idx->second]->is_nullable() xor column.is_nullable()) { + const auto& 
column = *_build_block.get_by_position(idx->first).column; + if (_mutable_cols[idx->second]->is_nullable() ^ column.is_nullable()) { DCHECK(_mutable_cols[idx->second]->is_nullable()); ((ColumnNullable*)(_mutable_cols[idx->second].get())) ->insert_from_not_nullable(column, it->row_num); @@ -512,10 +520,6 @@ void VSetOperationNode::debug_string(int indentation_level, template void VSetOperationNode::release_mem() { _hash_table_variants = nullptr; - - std::vector tmp_build_blocks; - _build_blocks.swap(tmp_build_blocks); - _probe_block.clear(); } diff --git a/be/src/vec/exec/vset_operation_node.h b/be/src/vec/exec/vset_operation_node.h index 4e68965a04..070ad381f4 100644 --- a/be/src/vec/exec/vset_operation_node.h +++ b/be/src/vec/exec/vset_operation_node.h @@ -82,7 +82,7 @@ private: //It's time to abstract out the same methods and provide them directly to others; void hash_table_init(); Status hash_table_build(RuntimeState* state); - Status process_build_block(Block& block, uint8_t offset, RuntimeState* state); + Status process_build_block(Block& block, RuntimeState* state); Status extract_build_column(Block& block, ColumnRawPtrs& raw_ptrs); Status extract_probe_column(Block& block, ColumnRawPtrs& raw_ptrs, int child_id); void refresh_hash_table(); @@ -115,11 +115,10 @@ private: //record insert column id during probe std::vector _probe_column_inserted_id; - std::vector _build_blocks; + Block _build_block; Block _probe_block; ColumnRawPtrs _probe_columns; std::vector _mutable_cols; - int _build_block_index; bool _build_finished; std::vector _probe_finished_children_index; MutableBlock _mutable_block; diff --git a/be/src/vec/exprs/vbloom_predicate.cpp b/be/src/vec/exprs/vbloom_predicate.cpp index 06bd21a6eb..176ecb219c 100644 --- a/be/src/vec/exprs/vbloom_predicate.cpp +++ b/be/src/vec/exprs/vbloom_predicate.cpp @@ -88,41 +88,16 @@ Status VBloomPredicate::execute(VExprContext* context, Block* block, int* result 
block->get_by_position(arguments[0]).column->convert_to_full_column_if_const(); size_t sz = argument_column->size(); res_data_column->resize(sz); - auto ptr = ((ColumnVector*)res_data_column.get())->get_data().data(); + auto* ptr = ((ColumnVector*)res_data_column.get())->get_data().data(); auto type = WhichDataType(remove_nullable(block->get_by_position(arguments[0]).type)); if (type.is_string_or_fixed_string()) { - // When _be_exec_version is equal to or greater than 2, we use the new hash method. - // This is only to be used if the be_exec_version may be less than 2. If updated, please delete it. - if (_be_exec_version >= 2) { - for (size_t i = 0; i < sz; i++) { - /// TODO: remove virtual function call in get_data_at to improve performance - auto ele = argument_column->get_data_at(i); - const StringRef v(ele.data, ele.size); - ptr[i] = _filter->find_crc32_hash(reinterpret_cast(&v)); - } - } else { - for (size_t i = 0; i < sz; i++) { - auto ele = argument_column->get_data_at(i); - const StringRef v(ele.data, ele.size); - ptr[i] = _filter->find(reinterpret_cast(&v)); - } - } - } else if (_be_exec_version > 0 && (type.is_int_or_uint() || type.is_float())) { - if (argument_column->is_nullable()) { - auto column_nested = reinterpret_cast(argument_column.get()) - ->get_nested_column_ptr(); - auto column_nullmap = reinterpret_cast(argument_column.get()) - ->get_null_map_column_ptr(); - _filter->find_fixed_len(column_nested->get_raw_data().data, - (uint8*)column_nullmap->get_raw_data().data, sz, ptr); - } else { - _filter->find_fixed_len(argument_column->get_raw_data().data, nullptr, sz, ptr); + for (size_t i = 0; i < sz; i++) { + auto ele = argument_column->get_data_at(i); + const StringRef v(ele.data, ele.size); + ptr[i] = _filter->find(reinterpret_cast(&v)); } } else { - for (size_t i = 0; i < sz; i++) { - ptr[i] = _filter->find( - reinterpret_cast(argument_column->get_data_at(i).data)); - } + _filter->find_fixed_len(argument_column, ptr); } if 
(_data_type->is_nullable()) { diff --git a/be/src/vec/runtime/shared_hash_table_controller.h b/be/src/vec/runtime/shared_hash_table_controller.h index 6b31cf07ec..e1c0170904 100644 --- a/be/src/vec/runtime/shared_hash_table_controller.h +++ b/be/src/vec/runtime/shared_hash_table_controller.h @@ -53,18 +53,15 @@ struct SharedRuntimeFilterContext { struct SharedHashTableContext { SharedHashTableContext() - : hash_table_variants(nullptr), - blocks(new std::vector()), - signaled(false), - short_circuit_for_null_in_probe_side(false) {} + : hash_table_variants(nullptr), block(std::make_shared()) {} Status status; std::shared_ptr arena; std::shared_ptr hash_table_variants; - std::shared_ptr> blocks; + std::shared_ptr block; std::map runtime_filters; - bool signaled; - bool short_circuit_for_null_in_probe_side; + bool signaled {}; + bool short_circuit_for_null_in_probe_side {}; }; using SharedHashTableContextPtr = std::shared_ptr; diff --git a/be/test/exprs/bloom_filter_predicate_test.cpp b/be/test/exprs/bloom_filter_predicate_test.cpp index 4f4ecd7c87..8c33ed13a6 100644 --- a/be/test/exprs/bloom_filter_predicate_test.cpp +++ b/be/test/exprs/bloom_filter_predicate_test.cpp @@ -53,9 +53,6 @@ TEST_F(BloomFilterPredicateTest, bloom_filter_func_int_test) { // test not exist val int not_exist_val = 0x3355ff; EXPECT_FALSE(func->find((const void*)¬_exist_val)); - // TEST null value - func->insert(nullptr); - func->find(nullptr); } TEST_F(BloomFilterPredicateTest, bloom_filter_func_stringval_test) { diff --git a/regression-test/data/query_p0/join/mark_join/mark_join.out b/regression-test/data/query_p0/join/mark_join/mark_join.out new file mode 100644 index 0000000000..ed3575d0e1 --- /dev/null +++ b/regression-test/data/query_p0/join/mark_join/mark_join.out @@ -0,0 +1,19 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !test -- +1 1 1 a +2 2 2 b +3 -3 \N c +3 3 \N c + +-- !test -- +1 1 1 a +2 2 2 b +3 -3 \N c +3 3 \N c + +-- !test -- +1 1 1 a +2 2 2 b +3 -3 \N c +3 3 \N c + diff --git a/regression-test/suites/nereids_syntax_p0/sub_query_alias.groovy b/regression-test/suites/nereids_syntax_p0/sub_query_alias.groovy index 5a5987ec43..ada61beccb 100644 --- a/regression-test/suites/nereids_syntax_p0/sub_query_alias.groovy +++ b/regression-test/suites/nereids_syntax_p0/sub_query_alias.groovy @@ -77,7 +77,7 @@ suite("sub_query_alias") { select * from lineorder l ) t on c.c_custkey = t.lo_custkey - order by c.c_custkey + order by c.c_custkey,lo_tax """ } diff --git a/regression-test/suites/nereids_syntax_p0/sub_query_correlated.groovy b/regression-test/suites/nereids_syntax_p0/sub_query_correlated.groovy index 0e98510e96..ac87bbc813 100644 --- a/regression-test/suites/nereids_syntax_p0/sub_query_correlated.groovy +++ b/regression-test/suites/nereids_syntax_p0/sub_query_correlated.groovy @@ -428,27 +428,27 @@ suite ("sub_query_correlated") { """ order_qt_hash_join_with_other_conjuncts1 """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 > sub_query_correlated_subquery3.k3) OR k1 < 10 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 > sub_query_correlated_subquery3.k3) OR k1 < 10 ORDER BY k1,k2; """ order_qt_hash_join_with_other_conjuncts2 """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 < sub_query_correlated_subquery3.k3) OR k1 < 10 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 < sub_query_correlated_subquery3.k3) 
OR k1 < 10 ORDER BY k1,k2; """ order_qt_hash_join_with_other_conjuncts3 """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 > sub_query_correlated_subquery3.k3) OR k1 < 11 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 > sub_query_correlated_subquery3.k3) OR k1 < 11 ORDER BY k1,k2; """ order_qt_hash_join_with_other_conjuncts4 """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 < sub_query_correlated_subquery3.k3) OR k1 < 11 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 < sub_query_correlated_subquery3.k3) OR k1 < 11 ORDER BY k1,k2; """ order_qt_same_subquery_in_conjuncts """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 < 10 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 < 10 ORDER BY k1,k2; """ order_qt_two_subquery_in_one_conjuncts """ - SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 IN (SELECT k3 FROM sub_query_correlated_subquery3) OR k1 < 10 ORDER BY k1; + SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 IN (SELECT k3 FROM sub_query_correlated_subquery3) OR k1 < 10 ORDER BY k1,k2; """ order_qt_multi_subquery_in_and_scalry """ diff --git a/regression-test/suites/nereids_syntax_p0/view.groovy b/regression-test/suites/nereids_syntax_p0/view.groovy index 
c694c37bbe..48e0ca3752 100644 --- a/regression-test/suites/nereids_syntax_p0/view.groovy +++ b/regression-test/suites/nereids_syntax_p0/view.groovy @@ -63,7 +63,7 @@ suite("view") { qt_select_3 """ select * from v3 - order by v3.c_custkey, v3.lo_orderkey + order by v3.c_custkey, v3.lo_orderkey,lo_tax """ qt_select_4 """ @@ -83,7 +83,7 @@ suite("view") { from v2 ) t on l.lo_custkey = t.lo_custkey - order by l.lo_custkey, t.lo_custkey, l.lo_linenumber, l.lo_tax + order by l.lo_custkey, t.lo_custkey, l.lo_linenumber, t.lo_linenumber, t.lo_shipmode,t.lo_tax """ qt_select_6 """ diff --git a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy new file mode 100644 index 0000000000..9759a0e9b4 --- /dev/null +++ b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("mark_join") { + sql "drop table if exists t1;" + sql "drop table if exists t2;" + sql """ + create table t1 ( + k1 int null, + k2 int null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + + sql """ + create table t2 ( + k1 int null, + k2 int null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + + sql "insert into t1 select 1,1,1,'a';" + sql "insert into t1 select 2,2,2,'b';" + sql "insert into t1 select 3,-3,null,'c';" + sql "insert into t1 select 3,3,null,'c';" + + sql "insert into t2 select 1,1,1,'a';" + sql "insert into t2 select 2,2,2,'b';" + sql "insert into t2 select 3,-3,null,'c';" + sql "insert into t2 select 3,3,null,'c';" + + qt_test """ + select * from t1 where exists (select t2.k3 from t2 where t1.k2 = t2.k2) or k1 < 10 order by k1, k2; + """ + qt_test """ + select * from t1 where not exists (select t2.k3 from t2 where t1.k2 = t2.k2) or k1 < 10 order by k1, k2; + """ + qt_test """ + select * from t1 where t1.k1 not in (select t2.k3 from t2 where t2.k2 = t1.k2) or k1 < 10 order by k1, k2; + """ +}