[refactor] remove collator (#10518)

This commit is contained in:
Pxl
2022-07-01 10:35:32 +08:00
committed by GitHub
parent 06e436b7cc
commit a9d23ce337
9 changed files with 75 additions and 290 deletions

View File

@ -41,7 +41,6 @@ set(VEC_FILES
aggregate_functions/aggregate_function_simple_factory.cpp
aggregate_functions/aggregate_function_java_udaf.h
aggregate_functions/aggregate_function_orthogonal_bitmap.cpp
columns/collator.cpp
columns/column.cpp
columns/column_array.cpp
columns/column_const.cpp

View File

@ -1,97 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/Collator.cpp
// and modified by Doris
#include "vec/columns/collator.h"
#if USE_ICU
#include <unicode/ucol.h>
#else
#ifdef __clang__
#pragma clang diagnostic ignored "-Wunused-private-field"
#pragma clang diagnostic ignored "-Wmissing-noreturn"
#endif
#endif
#include <boost/algorithm/string/case_conv.hpp>
#include "common/logging.h"
#include "vec/common/exception.h"
/// Constructs a collator for the given locale; the locale name is stored lower-cased.
///
/// With USE_ICU, an ICU collator is opened through ucol_open() and a fatal error is logged
/// when the resulting status is anything other than U_ZERO_ERROR.
/// Without USE_ICU, construction always logs a fatal error.
Collator::Collator(const std::string& locale_) : locale(boost::algorithm::to_lower_copy(locale_)) {
#if USE_ICU
UErrorCode status = U_ZERO_ERROR;
collator = ucol_open(locale.c_str(), &status);
// NOTE(review): every status other than U_ZERO_ERROR is rejected here, which presumably also
// covers ICU warning codes (e.g. a fallback to a default locale) -- confirm this is intended.
if (status != U_ZERO_ERROR) {
// Release whatever ucol_open() returned before reporting the failure.
ucol_close(collator);
LOG(FATAL) << "Unsupported collation locale: " << locale;
}
#else
// Built without ICU: collations cannot be used at all.
LOG(FATAL) << "Collations support is disabled, In Doris";
#endif
}
/// Closes the ICU collator handle when built with USE_ICU; otherwise does nothing.
Collator::~Collator() {
#if USE_ICU
ucol_close(collator);
#endif
}
/// Compares two UTF-8 byte ranges using the collation rules of this collator's locale.
///
/// @param str1     pointer to the first string
/// @param length1  length of the first string in bytes
/// @param str2     pointer to the second string
/// @param length2  length of the second string in bytes
/// @return with USE_ICU: -1, 0 or 1 (the UCollationResult of the comparison);
///         without USE_ICU: always 0.
int Collator::compare(const char* str1, size_t length1, const char* str2, size_t length2) const {
#if USE_ICU
    // ICU iterators take 32-bit lengths; make the conversion explicit.
    UCharIterator iter1, iter2;
    uiter_setUTF8(&iter1, str1, static_cast<int32_t>(length1));
    uiter_setUTF8(&iter2, str2, static_cast<int32_t>(length2));

    UErrorCode status = U_ZERO_ERROR;
    const UCollationResult compare_result = ucol_strcollIter(collator, &iter1, &iter2, &status);

    // Only genuine failures abort. ICU also uses non-zero status values for informational
    // warnings, which must not be treated as a failed comparison.
    if (U_FAILURE(status)) {
        LOG(FATAL) << "ICU collation comparison failed with error code: "
                   << doris::vectorized::toString<int>(status);
    }

    /** Values of enum UCollationResult are equals to what exactly we need:
      *     UCOL_EQUAL = 0
      *     UCOL_GREATER = 1
      *     UCOL_LESS = -1
      */
    return compare_result;
#else
    // Collation support is compiled out; the arguments are intentionally unused.
    (void)str1;
    (void)length1;
    (void)str2;
    (void)length2;
    return 0;
#endif
}
/// Returns the locale name held by this collator.
const std::string& Collator::get_locale() const {
return locale;
}
/// Lists the collation locales known to ICU.
///
/// @return the names reported by ucol_getAvailable() when built with USE_ICU;
///         an empty vector otherwise.
std::vector<std::string> Collator::get_available_collations() {
    std::vector<std::string> result;
#if USE_ICU
    // The ICU API counts and indexes locales with int32_t; keep that type to avoid
    // signed/unsigned round trips.
    const int32_t available_locales_count = ucol_countAvailable();
    if (available_locales_count > 0) {
        result.reserve(static_cast<size_t>(available_locales_count));
    }
    for (int32_t i = 0; i < available_locales_count; ++i) {
        result.emplace_back(ucol_getAvailable(i));
    }
#endif
    return result;
}

View File

@ -1,43 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/Collator.h
// and modified by Doris
#pragma once
#include <boost/noncopyable.hpp>
#include <string>
#include <vector>
struct UCollator;
/// Locale-aware string comparator; non-copyable because it holds a raw collator handle.
class Collator : private boost::noncopyable {
public:
    /// Creates a collator for `locale_`.
    explicit Collator(const std::string& locale_);
    ~Collator();

    /// Compares two byte ranges given as (pointer, length) pairs and returns an int result.
    int compare(const char* str1, size_t length1, const char* str2, size_t length2) const;

    /// Returns the locale name held by this collator.
    const std::string& get_locale() const;

    /// Returns the names of the collations that can be requested.
    static std::vector<std::string> get_available_collations();

private:
    std::string locale;
    /// Collator handle; initialised to nullptr so it never holds an indeterminate value
    /// on code paths that do not assign it.
    UCollator* collator = nullptr;
};

View File

@ -20,7 +20,6 @@
#include "vec/columns/column_array.h"
#include "vec/columns/collator.h"
#include "vec/columns/column_const.h"
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_string.h"

View File

@ -20,7 +20,6 @@
#include "vec/columns/column_string.h"
#include "vec/columns/collator.h"
#include "vec/columns/columns_common.h"
#include "vec/common/arena.h"
#include "vec/common/assert_cast.h"
@ -364,57 +363,6 @@ void ColumnString::get_extremes(Field& min, Field& max) const {
get(max_idx, max);
}
int ColumnString::compare_at_with_collation(size_t n, size_t m, const IColumn& rhs_,
const Collator& collator) const {
const ColumnString& rhs = assert_cast<const ColumnString&>(rhs_);
return collator.compare(reinterpret_cast<const char*>(&chars[offset_at(n)]), size_at(n),
reinterpret_cast<const char*>(&rhs.chars[rhs.offset_at(m)]),
rhs.size_at(m));
}
/// Ordering predicate over row indices of a ColumnString that delegates to a Collator.
/// With `positive == true` it orders ascending (lhs before rhs when the collator result is
/// negative); with `positive == false` the order is reversed.
template <bool positive>
struct ColumnString::lessWithCollation {
    const ColumnString& parent;
    const Collator& collator;

    lessWithCollation(const ColumnString& parent_, const Collator& collator_)
            : parent(parent_), collator(collator_) {}

    bool operator()(size_t lhs, size_t rhs) const {
        const char* left_data =
                reinterpret_cast<const char*>(&parent.chars[parent.offset_at(lhs)]);
        const char* right_data =
                reinterpret_cast<const char*>(&parent.chars[parent.offset_at(rhs)]);

        const int order = collator.compare(left_data, parent.size_at(lhs), right_data,
                                           parent.size_at(rhs));
        if (positive) {
            return order < 0;
        }
        return order > 0;
    }
};
/// Fills `res` with a permutation of row indices ordered according to `collator`.
///
/// @param collator  comparator used for the rows
/// @param reverse   when true the order is descending
/// @param limit     when non-zero and smaller than the row count, only the first `limit`
///                  positions of `res` are guaranteed to be ordered (partial sort)
/// @param res       output permutation, resized to the number of rows
void ColumnString::get_permutation_with_collation(const Collator& collator, bool reverse,
                                                  size_t limit, Permutation& res) const {
    const size_t row_count = offsets.size();

    // Start from the identity permutation.
    res.resize(row_count);
    for (size_t row = 0; row != row_count; ++row) {
        res[row] = row;
    }

    // A limit of zero, or one covering every row, means a full sort.
    const bool partial = (limit != 0) && (limit < row_count);

    auto order_rows = [&](auto comparator) {
        if (partial) {
            std::partial_sort(res.begin(), res.begin() + limit, res.end(), comparator);
        } else {
            std::sort(res.begin(), res.end(), comparator);
        }
    };

    if (reverse) {
        order_rows(lessWithCollation<false>(*this, collator));
    } else {
        order_rows(lessWithCollation<true>(*this, collator));
    }
}
void ColumnString::protect() {
get_chars().protect();
get_offsets().protect();

View File

@ -32,8 +32,6 @@
#include "vec/common/sip_hash.h"
#include "vec/core/field.h"
class Collator;
namespace doris::vectorized {
/** Column for String values.
@ -153,7 +151,9 @@ public:
const size_t new_size = old_size + length + 1;
chars.resize(new_size);
if (length) memcpy(chars.data() + old_size, pos, length);
if (length) {
memcpy(chars.data() + old_size, pos, length);
}
chars[old_size + length] = 0;
offsets.push_back(new_size);
}
@ -257,17 +257,9 @@ public:
rhs.size_at(m) - 1);
}
/// Variant of compare_at for string comparison with respect of collation.
int compare_at_with_collation(size_t n, size_t m, const IColumn& rhs_,
const Collator& collator) const;
void get_permutation(bool reverse, size_t limit, int nan_direction_hint,
Permutation& res) const override;
/// Sorting with respect of collation.
void get_permutation_with_collation(const Collator& collator, bool reverse, size_t limit,
Permutation& res) const;
ColumnPtr replicate(const Offsets& replicate_offsets) const override;
void replicate(const uint32_t* counts, size_t target_size, IColumn& column) const override;

View File

@ -27,16 +27,6 @@
namespace doris::vectorized {
/// Tells whether `column` has to be compared through the collator of `description`.
/// Returns false when no collator is set; logs a fatal error when a collator is set for a
/// column that is not a ColumnString.
static inline bool needCollation(const IColumn* column, const SortColumnDescription& description) {
    if (description.collator) {
        /// TODO Nullable(String)
        if (typeid_cast<const ColumnString*>(column) == nullptr) {
            LOG(FATAL) << "Collations could be specified only for String columns.";
        }
        return true;
    }
    return false;
}
ColumnsWithSortDescriptions get_columns_with_sort_description(const Block& block,
const SortDescription& description) {
size_t size = description.size();
@ -65,44 +55,20 @@ struct PartialSortingLess {
++it) {
int res = it->second.direction *
it->first->compare_at(a, b, *it->first, it->second.nulls_direction);
if (res < 0)
if (res < 0) {
return true;
else if (res > 0)
return false;
}
return false;
}
};
struct PartialSortingLessWithCollation {
const ColumnsWithSortDescriptions& columns;
explicit PartialSortingLessWithCollation(const ColumnsWithSortDescriptions& columns_)
: columns(columns_) {}
bool operator()(size_t a, size_t b) const {
for (ColumnsWithSortDescriptions::const_iterator it = columns.begin(); it != columns.end();
++it) {
int res;
if (needCollation(it->first, it->second)) {
const ColumnString& column_string = typeid_cast<const ColumnString&>(*it->first);
res = column_string.compare_at_with_collation(a, b, *it->first,
*it->second.collator);
} else
res = it->first->compare_at(a, b, *it->first, it->second.nulls_direction);
res *= it->second.direction;
if (res < 0)
return true;
else if (res > 0)
} else if (res > 0) {
return false;
}
}
return false;
}
};
void sort_block(Block& block, const SortDescription& description, UInt64 limit) {
if (!block) return;
if (!block) {
return;
}
/// If only one column to sort by
if (description.size() == 1) {
@ -117,39 +83,50 @@ void sort_block(Block& block, const SortDescription& description, UInt64 limit)
column->get_permutation(reverse, limit, description[0].nulls_direction, perm);
size_t columns = block.columns();
for (size_t i = 0; i < columns; ++i)
for (size_t i = 0; i < columns; ++i) {
block.get_by_position(i).column = block.get_by_position(i).column->permute(perm, limit);
}
} else {
size_t size = block.rows();
IColumn::Permutation perm(size);
for (size_t i = 0; i < size; ++i) perm[i] = i;
for (size_t i = 0; i < size; ++i) {
perm[i] = i;
}
if (limit >= size) limit = 0;
if (limit >= size) {
limit = 0;
}
ColumnsWithSortDescriptions columns_with_sort_desc =
get_columns_with_sort_description(block, description);
{
PartialSortingLess less(columns_with_sort_desc);
if (limit)
if (limit) {
std::partial_sort(perm.begin(), perm.begin() + limit, perm.end(), less);
else
} else {
pdqsort(perm.begin(), perm.end(), less);
}
}
size_t columns = block.columns();
for (size_t i = 0; i < columns; ++i)
for (size_t i = 0; i < columns; ++i) {
block.get_by_position(i).column = block.get_by_position(i).column->permute(perm, limit);
}
}
}
void stable_get_permutation(const Block& block, const SortDescription& description,
IColumn::Permutation& out_permutation) {
if (!block) return;
if (!block) {
return;
}
size_t size = block.rows();
out_permutation.resize(size);
for (size_t i = 0; i < size; ++i) out_permutation[i] = i;
for (size_t i = 0; i < size; ++i) {
out_permutation[i] = i;
}
ColumnsWithSortDescriptions columns_with_sort_desc =
get_columns_with_sort_description(block, description);
@ -159,7 +136,9 @@ void stable_get_permutation(const Block& block, const SortDescription& descripti
}
bool is_already_sorted(const Block& block, const SortDescription& description) {
if (!block) return true;
if (!block) {
return true;
}
size_t rows = block.rows();
@ -177,26 +156,34 @@ bool is_already_sorted(const Block& block, const SortDescription& description) {
size_t prev_position = rows * (i - 1) / num_rows_to_try;
size_t curr_position = rows * i / num_rows_to_try;
if (less(curr_position, prev_position)) return false;
if (less(curr_position, prev_position)) {
return false;
}
}
}
for (size_t i = 1; i < rows; ++i)
if (less(i, i - 1)) return false;
for (size_t i = 1; i < rows; ++i) {
if (less(i, i - 1)) {
return false;
}
}
return true;
}
void stable_sort_block(Block& block, const SortDescription& description) {
if (!block) return;
if (!block) {
return;
}
IColumn::Permutation perm;
stable_get_permutation(block, description, perm);
size_t columns = block.columns();
for (size_t i = 0; i < columns; ++i)
for (size_t i = 0; i < columns; ++i) {
block.safe_get_by_position(i).column =
block.safe_get_by_position(i).column->permute(perm, 0);
}
}
} // namespace doris::vectorized

View File

@ -44,24 +44,16 @@ struct SortCursorImpl {
size_t pos = 0;
size_t rows = 0;
using NeedCollationFlags = std::vector<UInt8>;
/** Should we use Collator to sort a column? */
NeedCollationFlags need_collation;
/** Is there at least one column with Collator. */
bool has_collation = false;
SortCursorImpl() = default;
virtual ~SortCursorImpl() = default;
SortCursorImpl(const Block& block, const SortDescription& desc_)
: desc(desc_), sort_columns_size(desc.size()), need_collation(desc.size()) {
: desc(desc_), sort_columns_size(desc.size()) {
reset(block);
}
SortCursorImpl(const Columns& columns, const SortDescription& desc_)
: desc(desc_), sort_columns_size(desc.size()), need_collation(desc.size()) {
: desc(desc_), sort_columns_size(desc.size()) {
for (auto& column_desc : desc) {
if (!column_desc.column_name.empty()) {
LOG(FATAL) << "SortDesctiption should contain column position if SortCursor was "
@ -83,7 +75,9 @@ struct SortCursorImpl {
size_t num_columns = columns.size();
for (size_t j = 0; j < num_columns; ++j) all_columns.push_back(columns[j].get());
for (size_t j = 0; j < num_columns; ++j) {
all_columns.push_back(columns[j].get());
}
for (size_t j = 0, size = desc.size(); j < size; ++j) {
auto& column_desc = desc[j];
@ -98,7 +92,7 @@ struct SortCursorImpl {
}
bool isFirst() const { return pos == 0; }
bool isLast() { return pos + 1 >= rows; }
bool isLast() const { return pos + 1 >= rows; }
void next() { ++pos; }
virtual bool has_next_block() { return false; }
@ -112,7 +106,7 @@ struct ReceiveQueueSortCursorImpl : public SortCursorImpl {
const std::vector<VExprContext*>& ordering_expr,
const std::vector<bool>& is_asc_order,
const std::vector<bool>& nulls_first)
: SortCursorImpl(), _ordering_expr(ordering_expr), _block_supplier(block_supplier) {
: _ordering_expr(ordering_expr), _block_supplier(block_supplier) {
sort_columns_size = ordering_expr.size();
desc.resize(ordering_expr.size());
@ -143,7 +137,9 @@ struct ReceiveQueueSortCursorImpl : public SortCursorImpl {
Block create_empty_blocks() const {
size_t num_columns = columns_num();
MutableColumns columns(num_columns);
for (size_t i = 0; i < num_columns; ++i) columns[i] = all_columns[i]->clone_empty();
for (size_t i = 0; i < num_columns; ++i) {
columns[i] = all_columns[i]->clone_empty();
}
return _block_ptr->clone_with_columns(std::move(columns));
}
@ -158,8 +154,7 @@ struct SortCursor {
SortCursorImpl* impl;
SortCursor(SortCursorImpl* impl_) : impl(impl_) {}
SortCursorImpl* operator->() { return impl; }
const SortCursorImpl* operator->() const { return impl; }
SortCursorImpl* operator->() const { return impl; }
/// The specified row of this cursor is greater than the specified row of another cursor.
int8_t greater_at(const SortCursor& rhs, size_t lhs_pos, size_t rhs_pos) const {
@ -169,15 +164,21 @@ struct SortCursor {
int res = direction * impl->sort_columns[i]->compare_at(lhs_pos, rhs_pos,
*(rhs.impl->sort_columns[i]),
nulls_direction);
if (res > 0) return 1;
if (res < 0) return -1;
if (res > 0) {
return 1;
}
if (res < 0) {
return -1;
}
}
return 0;
}
/// Checks that all rows in the current block of this cursor are less than or equal to all the rows of the current block of another cursor.
bool totally_less(const SortCursor& rhs) const {
if (impl->rows == 0 || rhs.impl->rows == 0) return false;
if (impl->rows == 0 || rhs.impl->rows == 0) {
return false;
}
/// The last row of this cursor is no larger than the first row of the another cursor.
return greater_at(rhs, impl->rows - 1, 0) == -1;
@ -196,8 +197,7 @@ struct SortBlockCursor {
SortCursorImpl* impl;
SortBlockCursor(SortCursorImpl* impl_) : impl(impl_) {}
SortCursorImpl* operator->() { return impl; }
const SortCursorImpl* operator->() const { return impl; }
SortCursorImpl* operator->() const { return impl; }
/// Compares row `rows` of this cursor with the last row of another cursor: returns 1 if this row sorts before it, -1 if it sorts after it, 0 if they are equal.
int8_t less_at(const SortBlockCursor& rhs, int rows) const {
@ -207,15 +207,21 @@ struct SortBlockCursor {
int res = direction * impl->sort_columns[i]->compare_at(rows, rhs->rows - 1,
*(rhs.impl->sort_columns[i]),
nulls_direction);
if (res < 0) return 1;
if (res > 0) return -1;
if (res < 0) {
return 1;
}
if (res > 0) {
return -1;
}
}
return 0;
}
/// Checks that all rows in the current block of this cursor are greater than all the rows of the current block of another cursor.
bool totally_greater(const SortBlockCursor& rhs) const {
if (impl->rows == 0 || rhs.impl->rows == 0) return false;
if (impl->rows == 0 || rhs.impl->rows == 0) {
return false;
}
/// The first row of this cursor is greater than the last row of the other cursor.
return less_at(rhs, 0) == -1;

View File

@ -26,8 +26,6 @@
#include "vec/core/field.h"
#include "vector"
class Collator;
namespace doris::vectorized {
struct FillColumnDescription {
@ -45,23 +43,19 @@ struct SortColumnDescription {
int direction; /// 1 - ascending, -1 - descending.
int nulls_direction; /// 1 - NULLs and NaNs are greater, -1 - less.
/// To achieve NULLS LAST, set it equal to direction, to achieve NULLS FIRST, set it opposite.
std::shared_ptr<Collator> collator =
nullptr; /// Collator for locale-specific comparison of strings
bool with_fill = false;
FillColumnDescription fill_description = {};
SortColumnDescription(int column_number_, int direction_, int nulls_direction_,
const std::shared_ptr<Collator>& collator_ = nullptr,
bool with_fill_ = false,
const FillColumnDescription& fill_description_ = {})
: column_number(column_number_),
direction(direction_),
nulls_direction(nulls_direction_),
collator(collator_),
with_fill(with_fill_),
fill_description(fill_description_) {}
SortColumnDescription() {}
SortColumnDescription() = default;
bool operator==(const SortColumnDescription& other) const {
return column_name == other.column_name && column_number == other.column_number &&