[Optimize](function) Optimize locate function by compare across strings (#20290)
Optimize locate function by compare across strings. about 90% speed up test by sum()
This commit is contained in:
@ -205,28 +205,46 @@ struct StringInStrImpl {
|
||||
res.resize(size);
|
||||
|
||||
if (rdata.size == 0) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
res[i] = 1;
|
||||
}
|
||||
std::fill(res.begin(), res.end(), 1);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
const UInt8* begin = ldata.data();
|
||||
const UInt8* end = begin + ldata.size();
|
||||
const UInt8* pos = begin;
|
||||
|
||||
/// Current index in the array of strings.
|
||||
size_t i = 0;
|
||||
std::fill(res.begin(), res.end(), 0);
|
||||
|
||||
StringRef rstr_ref(rdata.data, rdata.size);
|
||||
StringSearch search(&rstr_ref);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const char* l_raw_str = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]);
|
||||
int l_str_size = loffsets[i] - loffsets[i - 1];
|
||||
|
||||
StringRef lstr_ref(l_raw_str, l_str_size);
|
||||
|
||||
// Hive returns positions starting from 1.
|
||||
int loc = search.search(&lstr_ref);
|
||||
if (loc > 0) {
|
||||
size_t len = std::min(lstr_ref.size, (size_t)loc);
|
||||
loc = simd::VStringFunctions::get_char_len(lstr_ref.data, len);
|
||||
while (pos < end) {
|
||||
// search return matched substring start offset
|
||||
pos = (UInt8*)search.search((char*)pos, end - pos);
|
||||
if (pos >= end) {
|
||||
break;
|
||||
}
|
||||
res[i] = loc + 1;
|
||||
|
||||
/// Determine which index it refers to.
|
||||
/// begin + value_offsets[i] is the start offset of string at i+1
|
||||
while (begin + loffsets[i] < pos) {
|
||||
++i;
|
||||
}
|
||||
|
||||
/// We check that the entry does not pass through the boundaries of strings.
|
||||
if (pos + rdata.size <= begin + loffsets[i]) {
|
||||
int loc = pos - begin - loffsets[i - 1];
|
||||
int l_str_size = loffsets[i] - loffsets[i - 1];
|
||||
size_t len = std::min(l_str_size, loc);
|
||||
loc = simd::VStringFunctions::get_char_len((char*)(begin + loffsets[i - 1]), len);
|
||||
res[i] = loc + 1;
|
||||
}
|
||||
|
||||
// move to next string offset
|
||||
pos = begin + loffsets[i];
|
||||
++i;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
||||
Reference in New Issue
Block a user