[Optimize](function) Optimize locate function by compare across strings (#20290)

Optimize locate function by compare across strings. about 90% speed up test by sum()
This commit is contained in:
ZhangYu0123
2023-06-05 12:43:14 +08:00
committed by GitHub
parent 3c28a71378
commit d03bb4ba7b
3 changed files with 61 additions and 27 deletions

View File

@ -205,28 +205,46 @@ struct StringInStrImpl {
res.resize(size);
if (rdata.size == 0) {
for (int i = 0; i < size; ++i) {
res[i] = 1;
}
std::fill(res.begin(), res.end(), 1);
return Status::OK();
}
const UInt8* begin = ldata.data();
const UInt8* end = begin + ldata.size();
const UInt8* pos = begin;
/// Current index in the array of strings.
size_t i = 0;
std::fill(res.begin(), res.end(), 0);
StringRef rstr_ref(rdata.data, rdata.size);
StringSearch search(&rstr_ref);
for (int i = 0; i < size; ++i) {
const char* l_raw_str = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]);
int l_str_size = loffsets[i] - loffsets[i - 1];
StringRef lstr_ref(l_raw_str, l_str_size);
// Hive returns positions starting from 1.
int loc = search.search(&lstr_ref);
if (loc > 0) {
size_t len = std::min(lstr_ref.size, (size_t)loc);
loc = simd::VStringFunctions::get_char_len(lstr_ref.data, len);
while (pos < end) {
// search return matched substring start offset
pos = (UInt8*)search.search((char*)pos, end - pos);
if (pos >= end) {
break;
}
res[i] = loc + 1;
/// Determine which index it refers to.
/// begin + value_offsets[i] is the start offset of string at i+1
while (begin + loffsets[i] < pos) {
++i;
}
/// We check that the entry does not pass through the boundaries of strings.
if (pos + rdata.size <= begin + loffsets[i]) {
int loc = pos - begin - loffsets[i - 1];
int l_str_size = loffsets[i] - loffsets[i - 1];
size_t len = std::min(l_str_size, loc);
loc = simd::VStringFunctions::get_char_len((char*)(begin + loffsets[i - 1]), len);
res[i] = loc + 1;
}
// move to next string offset
pos = begin + loffsets[i];
++i;
}
return Status::OK();