diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index c51f670ab6..eb55bb284d 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -41,7 +41,6 @@ set(VEC_FILES aggregate_functions/aggregate_function_simple_factory.cpp aggregate_functions/aggregate_function_java_udaf.h aggregate_functions/aggregate_function_orthogonal_bitmap.cpp - columns/collator.cpp columns/column.cpp columns/column_array.cpp columns/column_const.cpp diff --git a/be/src/vec/columns/collator.cpp b/be/src/vec/columns/collator.cpp deleted file mode 100644 index a3fa2790ac..0000000000 --- a/be/src/vec/columns/collator.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/Collator.cpp -// and modified by Doris - -#include "vec/columns/collator.h" - -#if USE_ICU -#include -#else -#ifdef __clang__ -#pragma clang diagnostic ignored "-Wunused-private-field" -#pragma clang diagnostic ignored "-Wmissing-noreturn" -#endif -#endif - -#include - -#include "common/logging.h" -#include "vec/common/exception.h" - -Collator::Collator(const std::string& locale_) : locale(boost::algorithm::to_lower_copy(locale_)) { -#if USE_ICU - UErrorCode status = U_ZERO_ERROR; - - collator = ucol_open(locale.c_str(), &status); - if (status != U_ZERO_ERROR) { - ucol_close(collator); - LOG(FATAL) << "Unsupported collation locale: " << locale; - } -#else - LOG(FATAL) << "Collations support is disabled, In Doris"; -#endif -} - -Collator::~Collator() { -#if USE_ICU - ucol_close(collator); -#endif -} - -int Collator::compare(const char* str1, size_t length1, const char* str2, size_t length2) const { -#if USE_ICU - UCharIterator iter1, iter2; - uiter_setUTF8(&iter1, str1, length1); - uiter_setUTF8(&iter2, str2, length2); - - UErrorCode status = U_ZERO_ERROR; - UCollationResult compare_result = ucol_strcollIter(collator, &iter1, &iter2, &status); - - if (status != U_ZERO_ERROR) { - LOG(FATAL) << "ICU collation comparison failed with error code: " - << doris::vectorized::toString(status); - } - - /** Values of enum UCollationResult are equals to what exactly we need: - * UCOL_EQUAL = 0 - * UCOL_GREATER = 1 - * UCOL_LESS = -1 - */ - return compare_result; -#else - (void)str1; - (void)length1; - (void)str2; - (void)length2; - return 0; -#endif -} - -const std::string& Collator::get_locale() const { - return locale; -} - -std::vector Collator::get_available_collations() { - std::vector result; -#if USE_ICU - size_t available_locales_count = ucol_countAvailable(); - for (size_t i = 0; i < available_locales_count; ++i) result.push_back(ucol_getAvailable(i)); -#endif - return result; -} diff --git a/be/src/vec/columns/collator.h b/be/src/vec/columns/collator.h deleted file mode 100644 index 27b9cd54b6..0000000000 --- a/be/src/vec/columns/collator.h +++ /dev/null @@ -1,43 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/Collator.h -// and modified by Doris - -#pragma once - -#include -#include -#include - -struct UCollator; - -class Collator : private boost::noncopyable { -public: - explicit Collator(const std::string& locale_); - ~Collator(); - - int compare(const char* str1, size_t length1, const char* str2, size_t length2) const; - - const std::string& get_locale() const; - - static std::vector get_available_collations(); - -private: - std::string locale; - UCollator* collator; -}; diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index c18d1a55b3..0a7496ba60 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -20,7 +20,6 @@ #include "vec/columns/column_array.h" -#include "vec/columns/collator.h" #include "vec/columns/column_const.h" #include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 12701ab5ed..ad5873d4f1 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -20,7 +20,6 @@ #include "vec/columns/column_string.h" -#include "vec/columns/collator.h" #include "vec/columns/columns_common.h" #include "vec/common/arena.h" #include "vec/common/assert_cast.h" @@ -364,57 +363,6 @@ void ColumnString::get_extremes(Field& min, Field& max) const { get(max_idx, max); } -int ColumnString::compare_at_with_collation(size_t n, size_t m, const IColumn& rhs_, - const Collator& collator) const { - const ColumnString& rhs = assert_cast(rhs_); - - return collator.compare(reinterpret_cast(&chars[offset_at(n)]), size_at(n), - reinterpret_cast(&rhs.chars[rhs.offset_at(m)]), - rhs.size_at(m)); -} - -template -struct ColumnString::lessWithCollation { - const ColumnString& parent; - const Collator& collator; - - lessWithCollation(const ColumnString& parent_, const Collator& collator_) - : parent(parent_), collator(collator_) {} - - bool operator()(size_t lhs, size_t rhs) const { - int res = collator.compare( - reinterpret_cast(&parent.chars[parent.offset_at(lhs)]), - parent.size_at(lhs), - reinterpret_cast(&parent.chars[parent.offset_at(rhs)]), - parent.size_at(rhs)); - - return positive ? (res < 0) : (res > 0); - } -}; - -void ColumnString::get_permutation_with_collation(const Collator& collator, bool reverse, - size_t limit, Permutation& res) const { - size_t s = offsets.size(); - res.resize(s); - for (size_t i = 0; i < s; ++i) res[i] = i; - - if (limit >= s) limit = 0; - - if (limit) { - if (reverse) - std::partial_sort(res.begin(), res.begin() + limit, res.end(), - lessWithCollation(*this, collator)); - else - std::partial_sort(res.begin(), res.begin() + limit, res.end(), - lessWithCollation(*this, collator)); - } else { - if (reverse) - std::sort(res.begin(), res.end(), lessWithCollation(*this, collator)); - else - std::sort(res.begin(), res.end(), lessWithCollation(*this, collator)); - } -} - void ColumnString::protect() { get_chars().protect(); get_offsets().protect(); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 9b92890ed4..913ccc2312 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -32,8 +32,6 @@ #include "vec/common/sip_hash.h" #include "vec/core/field.h" -class Collator; - namespace doris::vectorized { /** Column for String values. @@ -153,7 +151,9 @@ public: const size_t new_size = old_size + length + 1; chars.resize(new_size); - if (length) memcpy(chars.data() + old_size, pos, length); + if (length) { + memcpy(chars.data() + old_size, pos, length); + } chars[old_size + length] = 0; offsets.push_back(new_size); } @@ -257,17 +257,9 @@ public: rhs.size_at(m) - 1); } - /// Variant of compare_at for string comparison with respect of collation. - int compare_at_with_collation(size_t n, size_t m, const IColumn& rhs_, - const Collator& collator) const; - void get_permutation(bool reverse, size_t limit, int nan_direction_hint, Permutation& res) const override; - /// Sorting with respect of collation. - void get_permutation_with_collation(const Collator& collator, bool reverse, size_t limit, - Permutation& res) const; - ColumnPtr replicate(const Offsets& replicate_offsets) const override; void replicate(const uint32_t* counts, size_t target_size, IColumn& column) const override; diff --git a/be/src/vec/core/sort_block.cpp b/be/src/vec/core/sort_block.cpp index 2192a4cfaa..d26566e1f9 100644 --- a/be/src/vec/core/sort_block.cpp +++ b/be/src/vec/core/sort_block.cpp @@ -27,16 +27,6 @@ namespace doris::vectorized { -static inline bool needCollation(const IColumn* column, const SortColumnDescription& description) { - if (!description.collator) return false; - - if (!typeid_cast(column)) { /// TODO Nullable(String) - LOG(FATAL) << "Collations could be specified only for String columns."; - } - - return true; -} - ColumnsWithSortDescriptions get_columns_with_sort_description(const Block& block, const SortDescription& description) { size_t size = description.size(); @@ -65,44 +55,20 @@ struct PartialSortingLess { ++it) { int res = it->second.direction * it->first->compare_at(a, b, *it->first, it->second.nulls_direction); - if (res < 0) + if (res < 0) { return true; - else if (res > 0) - return false; - } - return false; - } -}; - -struct PartialSortingLessWithCollation { - const ColumnsWithSortDescriptions& columns; - - explicit PartialSortingLessWithCollation(const ColumnsWithSortDescriptions& columns_) - : columns(columns_) {} - - bool operator()(size_t a, size_t b) const { - for (ColumnsWithSortDescriptions::const_iterator it = columns.begin(); it != columns.end(); - ++it) { - int res; - if (needCollation(it->first, it->second)) { - const ColumnString& column_string = typeid_cast(*it->first); - res = column_string.compare_at_with_collation(a, b, *it->first, - *it->second.collator); - } else - res = it->first->compare_at(a, b, *it->first, it->second.nulls_direction); - - res *= it->second.direction; - if (res < 0) - return true; - else if (res > 0) + } else if (res > 0) { return false; + } } return false; } }; void sort_block(Block& block, const SortDescription& description, UInt64 limit) { - if (!block) return; + if (!block) { + return; + } /// If only one column to sort by if (description.size() == 1) { @@ -117,39 +83,50 @@ void sort_block(Block& block, const SortDescription& description, UInt64 limit) column->get_permutation(reverse, limit, description[0].nulls_direction, perm); size_t columns = block.columns(); - for (size_t i = 0; i < columns; ++i) + for (size_t i = 0; i < columns; ++i) { block.get_by_position(i).column = block.get_by_position(i).column->permute(perm, limit); + } } else { size_t size = block.rows(); IColumn::Permutation perm(size); - for (size_t i = 0; i < size; ++i) perm[i] = i; + for (size_t i = 0; i < size; ++i) { + perm[i] = i; + } - if (limit >= size) limit = 0; + if (limit >= size) { + limit = 0; + } ColumnsWithSortDescriptions columns_with_sort_desc = get_columns_with_sort_description(block, description); { PartialSortingLess less(columns_with_sort_desc); - if (limit) + if (limit) { std::partial_sort(perm.begin(), perm.begin() + limit, perm.end(), less); - else + } else { pdqsort(perm.begin(), perm.end(), less); + } } size_t columns = block.columns(); - for (size_t i = 0; i < columns; ++i) + for (size_t i = 0; i < columns; ++i) { block.get_by_position(i).column = block.get_by_position(i).column->permute(perm, limit); + } } } void stable_get_permutation(const Block& block, const SortDescription& description, IColumn::Permutation& out_permutation) { - if (!block) return; + if (!block) { + return; + } size_t size = block.rows(); out_permutation.resize(size); - for (size_t i = 0; i < size; ++i) out_permutation[i] = i; + for (size_t i = 0; i < size; ++i) { + out_permutation[i] = i; + } ColumnsWithSortDescriptions columns_with_sort_desc = get_columns_with_sort_description(block, description); @@ -159,7 +136,9 @@ void stable_get_permutation(const Block& block, const SortDescription& descripti } bool is_already_sorted(const Block& block, const SortDescription& description) { - if (!block) return true; + if (!block) { + return true; + } size_t rows = block.rows(); @@ -177,26 +156,34 @@ bool is_already_sorted(const Block& block, const SortDescription& description) { size_t prev_position = rows * (i - 1) / num_rows_to_try; size_t curr_position = rows * i / num_rows_to_try; - if (less(curr_position, prev_position)) return false; + if (less(curr_position, prev_position)) { + return false; + } } } - for (size_t i = 1; i < rows; ++i) - if (less(i, i - 1)) return false; + for (size_t i = 1; i < rows; ++i) { + if (less(i, i - 1)) { + return false; + } + } return true; } void stable_sort_block(Block& block, const SortDescription& description) { - if (!block) return; + if (!block) { + return; + } IColumn::Permutation perm; stable_get_permutation(block, description, perm); size_t columns = block.columns(); - for (size_t i = 0; i < columns; ++i) + for (size_t i = 0; i < columns; ++i) { block.safe_get_by_position(i).column = block.safe_get_by_position(i).column->permute(perm, 0); + } } } // namespace doris::vectorized diff --git a/be/src/vec/core/sort_cursor.h b/be/src/vec/core/sort_cursor.h index 0dffb7454b..d6c8613bb1 100644 --- a/be/src/vec/core/sort_cursor.h +++ b/be/src/vec/core/sort_cursor.h @@ -44,24 +44,16 @@ struct SortCursorImpl { size_t pos = 0; size_t rows = 0; - using NeedCollationFlags = std::vector; - - /** Should we use Collator to sort a column? */ - NeedCollationFlags need_collation; - - /** Is there at least one column with Collator. */ - bool has_collation = false; - SortCursorImpl() = default; virtual ~SortCursorImpl() = default; SortCursorImpl(const Block& block, const SortDescription& desc_) - : desc(desc_), sort_columns_size(desc.size()), need_collation(desc.size()) { + : desc(desc_), sort_columns_size(desc.size()) { reset(block); } SortCursorImpl(const Columns& columns, const SortDescription& desc_) - : desc(desc_), sort_columns_size(desc.size()), need_collation(desc.size()) { + : desc(desc_), sort_columns_size(desc.size()) { for (auto& column_desc : desc) { if (!column_desc.column_name.empty()) { LOG(FATAL) << "SortDesctiption should contain column position if SortCursor was " @@ -83,7 +75,9 @@ struct SortCursorImpl { size_t num_columns = columns.size(); - for (size_t j = 0; j < num_columns; ++j) all_columns.push_back(columns[j].get()); + for (size_t j = 0; j < num_columns; ++j) { + all_columns.push_back(columns[j].get()); + } for (size_t j = 0, size = desc.size(); j < size; ++j) { auto& column_desc = desc[j]; @@ -98,7 +92,7 @@ struct SortCursorImpl { } bool isFirst() const { return pos == 0; } - bool isLast() { return pos + 1 >= rows; } + bool isLast() const { return pos + 1 >= rows; } void next() { ++pos; } virtual bool has_next_block() { return false; } @@ -112,7 +106,7 @@ struct ReceiveQueueSortCursorImpl : public SortCursorImpl { const std::vector& ordering_expr, const std::vector& is_asc_order, const std::vector& nulls_first) - : SortCursorImpl(), _ordering_expr(ordering_expr), _block_supplier(block_supplier) { + : _ordering_expr(ordering_expr), _block_supplier(block_supplier) { sort_columns_size = ordering_expr.size(); desc.resize(ordering_expr.size()); @@ -143,7 +137,9 @@ struct ReceiveQueueSortCursorImpl : public SortCursorImpl { Block create_empty_blocks() const { size_t num_columns = columns_num(); MutableColumns columns(num_columns); - for (size_t i = 0; i < num_columns; ++i) columns[i] = all_columns[i]->clone_empty(); + for (size_t i = 0; i < num_columns; ++i) { + columns[i] = all_columns[i]->clone_empty(); + } return _block_ptr->clone_with_columns(std::move(columns)); } @@ -158,8 +154,7 @@ struct SortCursor { SortCursorImpl* impl; SortCursor(SortCursorImpl* impl_) : impl(impl_) {} - SortCursorImpl* operator->() { return impl; } - const SortCursorImpl* operator->() const { return impl; } + SortCursorImpl* operator->() const { return impl; } /// The specified row of this cursor is greater than the specified row of another cursor. int8_t greater_at(const SortCursor& rhs, size_t lhs_pos, size_t rhs_pos) const { @@ -169,15 +164,21 @@ struct SortCursor { int res = direction * impl->sort_columns[i]->compare_at(lhs_pos, rhs_pos, *(rhs.impl->sort_columns[i]), nulls_direction); - if (res > 0) return 1; - if (res < 0) return -1; + if (res > 0) { + return 1; + } + if (res < 0) { + return -1; + } } return 0; } /// Checks that all rows in the current block of this cursor are less than or equal to all the rows of the current block of another cursor. bool totally_less(const SortCursor& rhs) const { - if (impl->rows == 0 || rhs.impl->rows == 0) return false; + if (impl->rows == 0 || rhs.impl->rows == 0) { + return false; + } /// The last row of this cursor is no larger than the first row of the another cursor. return greater_at(rhs, impl->rows - 1, 0) == -1; @@ -196,8 +197,7 @@ struct SortBlockCursor { SortCursorImpl* impl; SortBlockCursor(SortCursorImpl* impl_) : impl(impl_) {} - SortCursorImpl* operator->() { return impl; } - const SortCursorImpl* operator->() const { return impl; } + SortCursorImpl* operator->() const { return impl; } /// The specified row of this cursor is greater than the specified row of another cursor. int8_t less_at(const SortBlockCursor& rhs, int rows) const { @@ -207,15 +207,21 @@ struct SortBlockCursor { int res = direction * impl->sort_columns[i]->compare_at(rows, rhs->rows - 1, *(rhs.impl->sort_columns[i]), nulls_direction); - if (res < 0) return 1; - if (res > 0) return -1; + if (res < 0) { + return 1; + } + if (res > 0) { + return -1; + } } return 0; } /// Checks that all rows in the current block of this cursor are less than or equal to all the rows of the current block of another cursor. bool totally_greater(const SortBlockCursor& rhs) const { - if (impl->rows == 0 || rhs.impl->rows == 0) return false; + if (impl->rows == 0 || rhs.impl->rows == 0) { + return false; + } /// The last row of this cursor is no larger than the first row of the another cursor. return less_at(rhs, 0) == -1; diff --git a/be/src/vec/core/sort_description.h b/be/src/vec/core/sort_description.h index cf4b820575..6a29e95a59 100644 --- a/be/src/vec/core/sort_description.h +++ b/be/src/vec/core/sort_description.h @@ -26,8 +26,6 @@ #include "vec/core/field.h" #include "vector" -class Collator; - namespace doris::vectorized { struct FillColumnDescription { @@ -45,23 +43,19 @@ struct SortColumnDescription { int direction; /// 1 - ascending, -1 - descending. int nulls_direction; /// 1 - NULLs and NaNs are greater, -1 - less. /// To achieve NULLS LAST, set it equal to direction, to achieve NULLS FIRST, set it opposite. - std::shared_ptr collator = - nullptr; /// Collator for locale-specific comparison of strings bool with_fill = false; FillColumnDescription fill_description = {}; SortColumnDescription(int column_number_, int direction_, int nulls_direction_, - const std::shared_ptr& collator_ = nullptr, bool with_fill_ = false, const FillColumnDescription& fill_description_ = {}) : column_number(column_number_), direction(direction_), nulls_direction(nulls_direction_), - collator(collator_), with_fill(with_fill_), fill_description(fill_description_) {} - SortColumnDescription() {} + SortColumnDescription() = default; bool operator==(const SortColumnDescription& other) const { return column_name == other.column_name && column_number == other.column_number &&