diff --git a/be/src/runtime/string_value.h b/be/src/runtime/string_value.h
index 13b3852a5d..4878ca31e4 100644
--- a/be/src/runtime/string_value.h
+++ b/be/src/runtime/string_value.h
@@ -173,6 +173,9 @@ struct StringValue {
     // Trims leading and trailing spaces.
     StringValue trim() const;
 
+    // Returns the index of the first occurrence of `c`, or -1 if `c` does not occur.
+    int64_t find_first_of(char c) const;
+
     void to_string_val(doris_udf::StringVal* sv) const {
         *sv = doris_udf::StringVal(reinterpret_cast<uint8_t*>(ptr), len);
     }
diff --git a/be/src/runtime/string_value.hpp b/be/src/runtime/string_value.hpp
index 961e8b86c5..fb8039e661 100644
--- a/be/src/runtime/string_value.hpp
+++ b/be/src/runtime/string_value.hpp
@@ -51,4 +51,9 @@ inline StringValue StringValue::trim() const {
     return StringValue(ptr + begin, end - begin + 1);
 }
 
+inline int64_t StringValue::find_first_of(char c) const {
+    const char* p = static_cast<const char*>(memchr(ptr, c, len));
+    return p == nullptr ? -1 : p - ptr;
+}
+
 } // namespace doris
diff --git a/be/src/util/url_parser.cpp b/be/src/util/url_parser.cpp
index 00d2783bd6..06ed454251 100644
--- a/be/src/util/url_parser.cpp
+++ b/be/src/util/url_parser.cpp
@@ -346,9 +346,8 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
     }
 }
 
-std::string UrlParser::extract_url(const StringValue& url, const StringValue& name) {
-    std::string result;
-    std::string str_name = name.to_string();
+StringValue UrlParser::extract_url(StringValue url, StringValue name) {
+    StringValue result("", 0);
     // Remove leading and trailing spaces.
     StringValue trimmed_url = url.trim();
     // find '?'
@@ -358,45 +357,46 @@ std::string UrlParser::extract_url(const StringValue& url, const StringValue& na
         // Example: https://doris.apache.org/
         return result;
     }
+
     // find '#'
     int32_t hash_pos = _s_hash_search.search(&trimmed_url);
-    std::string sub_url = "";
+    StringValue sub_url;
     if (hash_pos < 0) {
-        sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.len - question_pos - 1)
-                          .to_string();
+        sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.len - question_pos - 1);
     } else {
-        sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1).to_string();
+        sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1);
     }
 
     // find '&' and '=', and extract target parameter
     // Example: k1=aa&k2=bb&k3=cc&test=dd
-    std::string::size_type and_pod;
-    std::string::size_type len = sub_url.length();
-    std::string key_url;
+    int64_t and_pod;
+    auto len = sub_url.len;
+    StringValue key_url;
     while (true) {
         if (len <= 0) {
             break;
         }
         and_pod = sub_url.find_first_of('&');
-        if (and_pod != std::string::npos) {
-            key_url = sub_url.substr(0, and_pod);
-            sub_url = sub_url.substr(and_pod + 1, len - and_pod);
+        if (and_pod != -1) {
+            key_url = sub_url.substring(0, and_pod);
+            // Exactly len - and_pod - 1 bytes follow the '&'; one more would run past sub_url.
+            sub_url = sub_url.substring(and_pod + 1, len - and_pod - 1);
         } else {
-            key_url = sub_url;
-            sub_url = "";
+            auto end_pos = sub_url.find_first_of('#');
+            key_url = end_pos == -1 ? sub_url : sub_url.substring(0, end_pos);
+            sub_url = result;
         }
-        len = sub_url.length();
+        len = sub_url.len;
 
-        std::string::size_type eq_pod = key_url.find_first_of('=');
-        if (eq_pod == std::string::npos) {
+        auto eq_pod = key_url.find_first_of('=');
+        if (eq_pod == -1) {
             // invalid url. like: k1&k2=bb
             continue;
         }
-        int32_t key_len = key_url.length();
-        std::string key = key_url.substr(0, eq_pod);
-        if (str_name == key) {
-            result = key_url.substr(eq_pod + 1, key_len - eq_pod);
-            return result;
+        int32_t key_len = key_url.len;
+        auto key = key_url.substring(0, eq_pod);
+        if (name == key) {
+            return key_url.substring(eq_pod + 1, key_len - eq_pod - 1);
         }
     }
     return result;
diff --git a/be/src/util/url_parser.h b/be/src/util/url_parser.h
index 0d212b1acd..e2a7ca6872 100644
--- a/be/src/util/url_parser.h
+++ b/be/src/util/url_parser.h
@@ -63,7 +63,9 @@ public:
     // Extract parameter value from url
     // Example for url:
     // http://doris.apache.org?k1=aa&k2=bb&k3=cc&test=dd#999
-    static std::string extract_url(const StringValue& url, const StringValue& name);
+    // Returns an empty StringValue when no parameter called `name` is present.
+    // NOTE(review): the result appears to alias `url`'s bytes (no copy) -- confirm lifetime.
+    static StringValue extract_url(StringValue url, StringValue name);
 
 private:
     // Constants representing parts of a URL.
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 2b6fe69764..cc51514b8b 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -1276,12 +1276,9 @@ public:
         for (int i = 0; i < input_rows_count; ++i) {
             auto source = url_col->get_data_at(i);
             auto param = parameter_col->get_data_at(i);
-            StringValue url_str(const_cast<char*>(source.data), source.size);
-            StringValue parameter_str(const_cast<char*>(param.data), param.size);
+            auto res = extract_url(source, param);
 
-            std::string result = extract_url(url_str, parameter_str);
-
-            col_res->insert_data(result.data(), result.length());
+            col_res->insert_data(res.ptr, res.len);
         }
 
         block.replace_by_position(result, std::move(col_res));
@@ -1289,11 +1286,11 @@ public:
     }
 
 private:
-    std::string extract_url(StringValue url, StringValue parameter) {
-        if (url.len == 0 || parameter.len == 0) {
-            return "";
+    StringValue extract_url(StringRef url, StringRef parameter) {
+        if (url.size == 0 || parameter.size == 0) {
+            return StringValue("", 0);
         }
-        return UrlParser::extract_url(url, parameter);
+        return UrlParser::extract_url(StringValue(url), StringValue(parameter));
     }
 };
 
diff --git a/docs/sidebars.json b/docs/sidebars.json
index 31193a7e78..57eca79a93 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -388,6 +388,7 @@
                                 "sql-manual/sql-functions/string-functions/split_part",
                                 "sql-manual/sql-functions/string-functions/money_format",
                                 "sql-manual/sql-functions/string-functions/parse_url",
+                                "sql-manual/sql-functions/string-functions/extract_url_parameter",
                                 {
                                     "type": "category",
                                     "label": "Fuzzy Match",