[opt](function) refactor extract_url to use StringValue (#13508)
Change extract_url to use StringValue instead of std::string to speed it up.
This commit is contained in:
@ -173,6 +173,9 @@ struct StringValue {
|
||||
// Trims leading and trailing spaces.
|
||||
StringValue trim() const;
|
||||
|
||||
// Returns the position of the first occurrence of c, or -1 if it is not found.
|
||||
int64_t find_first_of(char c) const;
|
||||
|
||||
// Stores into *sv a doris_udf::StringVal constructed from this value's
// ptr (reinterpreted as uint8_t*) and len.
void to_string_val(doris_udf::StringVal* sv) const {
    auto* bytes = reinterpret_cast<uint8_t*>(ptr);
    *sv = doris_udf::StringVal(bytes, len);
}
|
||||
|
||||
@ -51,4 +51,9 @@ inline StringValue StringValue::trim() const {
|
||||
return StringValue(ptr + begin, end - begin + 1);
|
||||
}
|
||||
|
||||
inline int64_t StringValue::find_first_of(char c) const {
|
||||
const char* p = static_cast<const char*>(memchr(ptr, c, len));
|
||||
return p == nullptr ? -1 : p - ptr;
|
||||
}
|
||||
|
||||
} // namespace doris
|
||||
|
||||
@ -346,9 +346,8 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
|
||||
}
|
||||
}
|
||||
|
||||
std::string UrlParser::extract_url(const StringValue& url, const StringValue& name) {
|
||||
std::string result;
|
||||
std::string str_name = name.to_string();
|
||||
StringValue UrlParser::extract_url(StringValue url, StringValue name) {
|
||||
StringValue result("", 0);
|
||||
// Remove leading and trailing spaces.
|
||||
StringValue trimmed_url = url.trim();
|
||||
// find '?'
|
||||
@ -358,45 +357,45 @@ std::string UrlParser::extract_url(const StringValue& url, const StringValue& na
|
||||
// Example: https://doris.apache.org/
|
||||
return result;
|
||||
}
|
||||
|
||||
// find '#'
|
||||
int32_t hash_pos = _s_hash_search.search(&trimmed_url);
|
||||
std::string sub_url = "";
|
||||
StringValue sub_url;
|
||||
if (hash_pos < 0) {
|
||||
sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.len - question_pos - 1)
|
||||
.to_string();
|
||||
sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.len - question_pos - 1);
|
||||
} else {
|
||||
sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1).to_string();
|
||||
sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1);
|
||||
}
|
||||
|
||||
// find '&' and '=', and extract target parameter
|
||||
// Example: k1=aa&k2=bb&k3=cc&test=dd
|
||||
std::string::size_type and_pod;
|
||||
std::string::size_type len = sub_url.length();
|
||||
std::string key_url;
|
||||
int64_t and_pod;
|
||||
auto len = sub_url.len;
|
||||
StringValue key_url;
|
||||
while (true) {
|
||||
if (len <= 0) {
|
||||
break;
|
||||
}
|
||||
and_pod = sub_url.find_first_of('&');
|
||||
if (and_pod != std::string::npos) {
|
||||
key_url = sub_url.substr(0, and_pod);
|
||||
sub_url = sub_url.substr(and_pod + 1, len - and_pod);
|
||||
if (and_pod != -1) {
|
||||
key_url = sub_url.substring(0, and_pod);
|
||||
sub_url = sub_url.substring(and_pod + 1, len - and_pod);
|
||||
} else {
|
||||
key_url = sub_url;
|
||||
sub_url = "";
|
||||
auto end_pos = sub_url.find_first_of('#');
|
||||
key_url = end_pos == -1 ? sub_url : sub_url.substring(0, end_pos);
|
||||
sub_url = result;
|
||||
}
|
||||
len = sub_url.length();
|
||||
len = sub_url.len;
|
||||
|
||||
std::string::size_type eq_pod = key_url.find_first_of('=');
|
||||
if (eq_pod == std::string::npos) {
|
||||
auto eq_pod = key_url.find_first_of('=');
|
||||
if (eq_pod == -1) {
|
||||
// invalid url. like: k1&k2=bb
|
||||
continue;
|
||||
}
|
||||
int32_t key_len = key_url.length();
|
||||
std::string key = key_url.substr(0, eq_pod);
|
||||
if (str_name == key) {
|
||||
result = key_url.substr(eq_pod + 1, key_len - eq_pod);
|
||||
return result;
|
||||
int32_t key_len = key_url.len;
|
||||
auto key = key_url.substring(0, eq_pod);
|
||||
if (name == key) {
|
||||
return key_url.substring(eq_pod + 1, key_len - eq_pod - 1);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
||||
@ -63,7 +63,7 @@ public:
|
||||
// Extract parameter value from url
|
||||
// Example for url:
|
||||
// http://doris.apache.org?k1=aa&k2=bb&k3=cc&test=dd#999
|
||||
static std::string extract_url(const StringValue& url, const StringValue& name);
|
||||
static StringValue extract_url(StringValue url, StringValue name);
|
||||
|
||||
private:
|
||||
// Constants representing parts of a URL.
|
||||
|
||||
@ -1276,12 +1276,9 @@ public:
|
||||
for (int i = 0; i < input_rows_count; ++i) {
|
||||
auto source = url_col->get_data_at(i);
|
||||
auto param = parameter_col->get_data_at(i);
|
||||
StringValue url_str(const_cast<char*>(source.data), source.size);
|
||||
StringValue parameter_str(const_cast<char*>(param.data), param.size);
|
||||
auto res = extract_url(source, param);
|
||||
|
||||
std::string result = extract_url(url_str, parameter_str);
|
||||
|
||||
col_res->insert_data(result.data(), result.length());
|
||||
col_res->insert_data(res.ptr, res.len);
|
||||
}
|
||||
|
||||
block.replace_by_position(result, std::move(col_res));
|
||||
@ -1289,11 +1286,11 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
std::string extract_url(StringValue url, StringValue parameter) {
|
||||
if (url.len == 0 || parameter.len == 0) {
|
||||
return "";
|
||||
StringValue extract_url(StringRef url, StringRef parameter) {
|
||||
if (url.size == 0 || parameter.size == 0) {
|
||||
return StringValue("", 0);
|
||||
}
|
||||
return UrlParser::extract_url(url, parameter);
|
||||
return UrlParser::extract_url(StringValue(url), StringValue(parameter));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -388,6 +388,7 @@
|
||||
"sql-manual/sql-functions/string-functions/split_part",
|
||||
"sql-manual/sql-functions/string-functions/money_format",
|
||||
"sql-manual/sql-functions/string-functions/parse_url",
|
||||
"sql-manual/sql-functions/string-functions/extract_url_parameter",
|
||||
{
|
||||
"type": "category",
|
||||
"label": "Fuzzy Match",
|
||||
|
||||
Reference in New Issue
Block a user