diff --git a/be/src/exprs/string_functions.cpp b/be/src/exprs/string_functions.cpp
index 1679eca54b..6a5abaa3c6 100644
--- a/be/src/exprs/string_functions.cpp
+++ b/be/src/exprs/string_functions.cpp
@@ -25,6 +25,7 @@
 #include "runtime/string_value.hpp"
 #include "runtime/tuple_row.h"
 #include "util/url_parser.h"
+#include <algorithm>
 
 // NOTE: be careful not to use string::append. It is not performant.
 namespace doris {
@@ -803,7 +804,7 @@ void StringFunctions::parse_url_prepare(
         std::stringstream ss;
         ss << "Invalid URL part: " << AnyValUtil::to_string(*part) << std::endl
            << "(Valid URL parts are 'PROTOCOL', 'HOST', 'PATH', 'REF', 'AUTHORITY', 'FILE', "
-           << "'USERINFO', and 'QUERY')";
+           << "'USERINFO', 'PORT' and 'QUERY')";
         ctx->set_error(ss.str().c_str());
         return;
     }
@@ -815,13 +816,16 @@ StringVal StringFunctions::parse_url(
     if (url.is_null || part.is_null) {
         return StringVal::null();
     }
+    std::string part_str = std::string(reinterpret_cast<char*>(part.ptr), part.len);
+    std::transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper);
+    StringVal newPart = AnyValUtil::from_string_temp(ctx, part_str);
     void* state = ctx->get_function_state(FunctionContext::FRAGMENT_LOCAL);
     UrlParser::UrlPart url_part;
     if (state != NULL) {
         url_part = *reinterpret_cast<UrlParser::UrlPart*>(state);
     } else {
         DCHECK(!ctx->is_arg_constant(1));
-        url_part = UrlParser::get_url_part(StringValue::from_string_val(part));
+        url_part = UrlParser::get_url_part(StringValue::from_string_val(newPart));
     }
 
     StringValue result;
@@ -829,7 +833,7 @@ StringVal StringFunctions::parse_url(
         // url is malformed, or url_part is invalid.
         if (url_part == UrlParser::INVALID) {
             std::stringstream ss;
-            ss << "Invalid URL part: " << AnyValUtil::to_string(part);
+            ss << "Invalid URL part: " << AnyValUtil::to_string(newPart);
             ctx->add_warning(ss.str().c_str());
         } else {
             std::stringstream ss;
diff --git a/be/src/util/url_parser.cpp b/be/src/util/url_parser.cpp
index 0e66b44341..a4021cae30 100644
--- a/be/src/util/url_parser.cpp
+++ b/be/src/util/url_parser.cpp
@@ -24,10 +24,11 @@ const StringValue UrlParser::_s_url_authority(const_cast<char*>("AUTHORITY"), 9)
 const StringValue UrlParser::_s_url_file(const_cast<char*>("FILE"), 4);
 const StringValue UrlParser::_s_url_host(const_cast<char*>("HOST"), 4);
 const StringValue UrlParser::_s_url_path(const_cast<char*>("PATH"), 4);
-const StringValue UrlParser::_s_url_protocol(const_cast<char*>("_s_protocol"), 8);
+const StringValue UrlParser::_s_url_protocol(const_cast<char*>("PROTOCOL"), 8);
 const StringValue UrlParser::_s_url_query(const_cast<char*>("QUERY"), 5);
 const StringValue UrlParser::_s_url_ref(const_cast<char*>("REF"), 3);
 const StringValue UrlParser::_s_url_userinfo(const_cast<char*>("USERINFO"), 8);
+const StringValue UrlParser::_s_url_port(const_cast<char*>("PORT"), 4);
 const StringValue UrlParser::_s_protocol(const_cast<char*>("://"), 3);
 const StringValue UrlParser::_s_at(const_cast<char*>("@"), 1);
 const StringValue UrlParser::_s_slash(const_cast<char*>("/"), 1);
@@ -166,6 +167,36 @@ bool UrlParser::parse_url(const StringValue& url, UrlPart part, StringValue* res
         break;
     }
 
+    case PORT: {
+        // Find '@'.
+        int32_t start_pos = _s_at_search.search(&protocol_end);
+
+        if (start_pos < 0) {
+            // No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
+            start_pos = 0;
+        } else {
+            // Skip '@'.
+            start_pos += _s_at.len;
+        }
+
+        StringValue host_start = protocol_end.substring(start_pos);
+        // Find the ':' that separates the host from the port.
+        int32_t end_pos = _s_colon_search.search(&host_start);
+        // No ':' after the host start, so no port was given.
+        if (end_pos < 0) {
+            return false;
+        }
+        // end_pos is an offset into host_start, so slice host_start (not protocol_end).
+        StringValue port_start_str = host_start.substring(end_pos + _s_colon.len);
+        int32_t port_end_pos = _s_slash_search.search(&port_start_str);
+        // If '/' is not found, try to find '?'.
+        if (port_end_pos < 0) {
+            port_end_pos = _s_question_search.search(&port_start_str);
+        }
+        *result = port_start_str.substring(0, port_end_pos);
+        break;
+    }
+
     case INVALID:
         return false;
     }
@@ -246,9 +277,12 @@ bool UrlParser::parse_url_key(
 UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
     // Quick filter on requested URL part, based on first character.
     // Hive requires the requested URL part to be all upper case.
-    switch (part.ptr[0]) {
+    std::string part_str = part.to_string();
+    std::transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper);
+    StringValue newPart = StringValue(part_str);
+    switch (newPart.ptr[0]) {
     case 'A': {
-        if (!part.eq(_s_url_authority)) {
+        if (!newPart.eq(_s_url_authority)) {
             return INVALID;
         }
 
@@ -256,7 +290,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
     }
 
     case 'F': {
-        if (!part.eq(_s_url_file)) {
+        if (!newPart.eq(_s_url_file)) {
             return INVALID;
         }
 
@@ -264,7 +298,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
     }
 
     case 'H': {
-        if (!part.eq(_s_url_host)) {
+        if (!newPart.eq(_s_url_host)) {
             return INVALID;
         }
 
@@ -272,17 +306,19 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
     }
 
     case 'P': {
-        if (part.eq(_s_url_path)) {
+        if (newPart.eq(_s_url_path)) {
             return PATH;
-        } else if (part.eq(_s_url_protocol)) {
+        } else if (newPart.eq(_s_url_protocol)) {
             return PROTOCOL;
+        } else if (newPart.eq(_s_url_port)) {
+            return PORT;
         } else {
             return INVALID;
         }
     }
 
     case 'Q': {
-        if (!part.eq(_s_url_query)) {
+        if (!newPart.eq(_s_url_query)) {
             return INVALID;
         }
 
@@ -290,7 +326,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
     }
 
     case 'R': {
-        if (!part.eq(_s_url_ref)) {
+        if (!newPart.eq(_s_url_ref)) {
             return INVALID;
         }
 
@@ -298,7 +334,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
     }
 
     case 'U': {
-        if (!part.eq(_s_url_userinfo)) {
+        if (!newPart.eq(_s_url_userinfo)) {
             return INVALID;
         }
 
diff --git a/be/src/util/url_parser.h b/be/src/util/url_parser.h
index e0588ee674..33f698b9d5 100644
--- a/be/src/util/url_parser.h
+++ b/be/src/util/url_parser.h
@@ -52,7 +52,8 @@ public:
         PROTOCOL,
         QUERY,
         REF,
-        USERINFO
+        USERINFO,
+        PORT
     };
 
     // Tries to parse the part from url. Places the result in result.
@@ -84,6 +85,7 @@ private:
     static const StringValue _s_url_query;
     static const StringValue _s_url_ref;
     static const StringValue _s_url_userinfo;
+    static const StringValue _s_url_port;
     // Constants used in searching for URL parts.
     static const StringValue _s_protocol;
     static const StringValue _s_at;
diff --git a/be/test/exprs/string_functions_test.cpp b/be/test/exprs/string_functions_test.cpp
index 3a6e351f60..77217512fb 100644
--- a/be/test/exprs/string_functions_test.cpp
+++ b/be/test/exprs/string_functions_test.cpp
@@ -529,6 +529,60 @@ TEST_F(StringFunctionsTest, replace) {
             StringFunctions::replace(ctx, StringVal("http://中国hello:9090"), StringVal("中国hello"), StringVal("华夏zhongguo")));
 }
 
+TEST_F(StringFunctionsTest, parse_url) {
+    ASSERT_EQ(StringVal("facebook.com"),
+            StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), StringVal("AUTHORITY")));
+    ASSERT_EQ(StringVal("facebook.com"),
+            StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), StringVal("authority")));
+
+    ASSERT_EQ(StringVal("/a/b/c.php"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), StringVal("FILE")));
+    ASSERT_EQ(StringVal("/a/b/c.php"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), StringVal("file")));
+
+    ASSERT_EQ(StringVal("/a/b/c.php"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), StringVal("PATH")));
+    ASSERT_EQ(StringVal("/a/b/c.php"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), StringVal("path")));
+
+    ASSERT_EQ(StringVal("www.baidu.com"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090"), StringVal("HOST")));
+    ASSERT_EQ(StringVal("www.baidu.com"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090"), StringVal("host")));
+
+    ASSERT_EQ(StringVal("http"),
+            StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), StringVal("PROTOCOL")));
+    ASSERT_EQ(StringVal("http"),
+            StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), StringVal("protocol")));
+
+    ASSERT_EQ(StringVal("a=b"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("QUERY")));
+    ASSERT_EQ(StringVal("a=b"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("query")));
+
+    ASSERT_EQ(StringVal::null(),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("REF")));
+    ASSERT_EQ(StringVal::null(),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("ref")));
+
+    ASSERT_EQ(StringVal::null(),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("USERINFO")));
+    ASSERT_EQ(StringVal::null(),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("userinfo")));
+
+    ASSERT_EQ(StringVal("9090"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("PORT")));
+    ASSERT_EQ(StringVal("9090"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c?a=b"), StringVal("PORT")));
+    ASSERT_EQ(StringVal::null(),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com?a=b"), StringVal("PORT")));
+    ASSERT_EQ(StringVal("9090"),
+            StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("port")));
+    ASSERT_EQ(StringVal("9090"),
+            StringFunctions::parse_url(ctx, StringVal("http://user:pass@www.baidu.com:9090/a/b/c?a=b"), StringVal("PORT")));
+
+}
+
 } // namespace doris
 
 int main(int argc, char** argv) {