[Improvement](string function) optimize substring and the IN string set (#19257)

* [Improvement](string function) optimize substring and the IN string set

* update
This commit is contained in:
Gabriel
2023-05-17 14:09:52 +08:00
committed by GitHub
parent 4607a3408e
commit 56809230d1
2 changed files with 114 additions and 54 deletions

View File

@ -60,7 +60,7 @@ public:
}
// Using '|' instead of '||' gives better performance in testing.
bool find(const T& value) const {
ALWAYS_INLINE bool find(const T& value) const {
if constexpr (N == 1) {
return (value == _data[0]);
}
@ -585,21 +585,26 @@ public:
const doris::vectorized::NullMap* null_map,
doris::vectorized::ColumnUInt8::Container& results) {
auto& col = assert_cast<const doris::vectorized::ColumnString&>(column);
const uint32_t* __restrict offset = col.get_offsets().data();
const uint8_t* __restrict data = col.get_chars().data();
uint8_t* __restrict cursor = const_cast<uint8_t*>(data);
const uint8_t* __restrict null_map_data;
if constexpr (is_nullable) {
null_map_data = null_map->data();
}
auto* __restrict result_data = results.data();
for (size_t i = 0; i < rows; ++i) {
uint32_t len = offset[i] - offset[i - 1];
if constexpr (!is_nullable && !is_negative) {
result_data[i] = _set.find(col.get_data_at(i));
result_data[i] = _set.find(StringRef(cursor, len));
} else if constexpr (!is_nullable && is_negative) {
result_data[i] = !_set.find(col.get_data_at(i));
result_data[i] = !_set.find(StringRef(cursor, len));
} else if constexpr (is_nullable && !is_negative) {
result_data[i] = _set.find(col.get_data_at(i)) & (!null_map_data[i]);
result_data[i] = (!null_map_data[i]) & _set.find(StringRef(cursor, len));
} else { // (is_nullable && is_negative)
result_data[i] = !(_set.find(col.get_data_at(i)) & (!null_map_data[i]));
result_data[i] = !((!null_map_data[i]) & _set.find(StringRef(cursor, len)));
}
cursor += len;
}
}

View File

@ -170,7 +170,7 @@ private:
const PaddedPODArray<Int32>& start, const PaddedPODArray<Int32>& len,
NullMap& null_map, ColumnString::Chars& res_chars,
ColumnString::Offsets& res_offsets) {
int size = offsets.size();
size_t size = offsets.size();
res_offsets.resize(size);
res_chars.reserve(chars.size());
@ -178,63 +178,118 @@ private:
PMR::monotonic_buffer_resource pool {buf.data(), buf.size()};
PMR::vector<size_t> index {&pool};
PMR::vector<std::pair<const unsigned char*, int>> strs(&pool);
strs.resize(size);
auto* __restrict data_ptr = chars.data();
auto* __restrict offset_ptr = offsets.data();
for (int i = 0; i < size; ++i) {
strs[i].first = data_ptr + offset_ptr[i - 1];
strs[i].second = offset_ptr[i] - offset_ptr[i - 1];
}
for (int i = 0; i < size; ++i) {
auto [raw_str, str_size] = strs[i];
const auto& start_value = start[index_check_const(i, Const)];
const auto& len_value = len[index_check_const(i, Const)];
if constexpr (Const) {
const auto start_value = start[0];
const auto len_value = len[0];
if (start_value == 0 || len_value <= 0) {
for (size_t i = 0; i < size; ++i) {
StringOP::push_empty_string(i, res_chars, res_offsets);
}
} else {
for (size_t i = 0; i < size; ++i) {
const int str_size = offset_ptr[i] - offset_ptr[i - 1];
const uint8_t* raw_str = data_ptr + offset_ptr[i - 1];
// return empty string if start > src.length
if (start_value > str_size || start_value < -str_size || str_size == 0) {
StringOP::push_empty_string(i, res_chars, res_offsets);
continue;
}
// reference to string_function.cpp: substring
size_t byte_pos = 0;
index.clear();
for (size_t j = 0, char_size = 0;
j < str_size &&
(start_value <= 0 || index.size() <= start_value + len_value);
j += char_size) {
char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]];
index.push_back(j);
}
// return empty string if start > src.length
if (start_value > str_size || str_size == 0 || start_value == 0 || len_value <= 0) {
StringOP::push_empty_string(i, res_chars, res_offsets);
continue;
}
// reference to string_function.cpp: substring
size_t byte_pos = 0;
index.clear();
for (size_t j = 0, char_size = 0; j < str_size; j += char_size) {
char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]];
index.push_back(j);
if (start_value > 0 && index.size() > start_value + len_value) {
break;
int fixed_pos = start_value;
if (fixed_pos < 0) {
fixed_pos = str_size + fixed_pos + 1;
} else if (fixed_pos > index.size()) {
StringOP::push_null_string(i, res_chars, res_offsets, null_map);
continue;
}
byte_pos = index[fixed_pos - 1];
int fixed_len = str_size - byte_pos;
if (fixed_pos + len_value <= index.size()) {
fixed_len = index[fixed_pos + len_value - 1] - byte_pos;
}
if (byte_pos <= str_size && fixed_len > 0) {
// return StringRef(str.data + byte_pos, fixed_len);
StringOP::push_value_string(
std::string_view {reinterpret_cast<const char*>(raw_str + byte_pos),
(size_t)fixed_len},
i, res_chars, res_offsets);
} else {
StringOP::push_empty_string(i, res_chars, res_offsets);
}
}
}
int fixed_pos = start_value;
if (fixed_pos < -(int)index.size()) {
StringOP::push_empty_string(i, res_chars, res_offsets);
continue;
}
if (fixed_pos < 0) {
fixed_pos = index.size() + fixed_pos + 1;
}
if (fixed_pos > index.size()) {
StringOP::push_null_string(i, res_chars, res_offsets, null_map);
continue;
} else {
PMR::vector<std::pair<const unsigned char*, int>> strs(&pool);
strs.resize(size);
for (int i = 0; i < size; ++i) {
strs[i].first = data_ptr + offset_ptr[i - 1];
strs[i].second = offset_ptr[i] - offset_ptr[i - 1];
}
byte_pos = index[fixed_pos - 1];
int fixed_len = str_size - byte_pos;
if (fixed_pos + len_value <= index.size()) {
fixed_len = index[fixed_pos + len_value - 1] - byte_pos;
}
for (size_t i = 0; i < size; ++i) {
auto [raw_str, str_size] = strs[i];
const auto& start_value = start[i];
const auto& len_value = len[i];
if (byte_pos <= str_size && fixed_len > 0) {
// return StringRef(str.data + byte_pos, fixed_len);
StringOP::push_value_string(
std::string_view {reinterpret_cast<const char*>(raw_str + byte_pos),
(size_t)fixed_len},
i, res_chars, res_offsets);
} else {
StringOP::push_empty_string(i, res_chars, res_offsets);
// return empty string if start > src.length
if (start_value > str_size || str_size == 0 || start_value == 0 || len_value <= 0) {
StringOP::push_empty_string(i, res_chars, res_offsets);
continue;
}
// reference to string_function.cpp: substring
size_t byte_pos = 0;
index.clear();
for (size_t j = 0, char_size = 0; j < str_size; j += char_size) {
char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]];
index.push_back(j);
if (start_value > 0 && index.size() > start_value + len_value) {
break;
}
}
int fixed_pos = start_value;
if (fixed_pos < -(int)index.size()) {
StringOP::push_empty_string(i, res_chars, res_offsets);
continue;
}
if (fixed_pos < 0) {
fixed_pos = index.size() + fixed_pos + 1;
}
if (fixed_pos > index.size()) {
StringOP::push_null_string(i, res_chars, res_offsets, null_map);
continue;
}
byte_pos = index[fixed_pos - 1];
int fixed_len = str_size - byte_pos;
if (fixed_pos + len_value <= index.size()) {
fixed_len = index[fixed_pos + len_value - 1] - byte_pos;
}
if (byte_pos <= str_size && fixed_len > 0) {
// return StringRef(str.data + byte_pos, fixed_len);
StringOP::push_value_string(
std::string_view {reinterpret_cast<const char*>(raw_str + byte_pos),
(size_t)fixed_len},
i, res_chars, res_offsets);
} else {
StringOP::push_empty_string(i, res_chars, res_offsets);
}
}
}
}