[SQL Function][Bug] Fix parse_url() bug (#4429)

The 'part' parameter of the parse_url function does not accept lower-case values, and the protocol is not parsed correctly.
The function also does not support parsing 'port'.
This PR makes the parse_url function case-insensitive and adds support for parsing 'port'.

The issue: #4451
This commit is contained in:
xinghuayu007
2020-09-03 17:06:09 +08:00
committed by GitHub
parent c29d41f675
commit 1a30bcbf36
4 changed files with 108 additions and 14 deletions

View File

@ -25,6 +25,7 @@
#include "runtime/string_value.hpp"
#include "runtime/tuple_row.h"
#include "util/url_parser.h"
#include <algorithm>
// NOTE: be careful not to use string::append. It is not performant.
namespace doris {
@ -803,7 +804,7 @@ void StringFunctions::parse_url_prepare(
std::stringstream ss;
ss << "Invalid URL part: " << AnyValUtil::to_string(*part) << std::endl
<< "(Valid URL parts are 'PROTOCOL', 'HOST', 'PATH', 'REF', 'AUTHORITY', 'FILE', "
<< "'USERINFO', and 'QUERY')";
<< "'USERINFO', 'PORT' and 'QUERY')";
ctx->set_error(ss.str().c_str());
return;
}
@ -815,13 +816,16 @@ StringVal StringFunctions::parse_url(
if (url.is_null || part.is_null) {
return StringVal::null();
}
std::string part_str = std::string(reinterpret_cast<const char *>(part.ptr), part.len);
transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper);
StringVal newPart = AnyValUtil::from_string_temp(ctx, part_str);
void* state = ctx->get_function_state(FunctionContext::FRAGMENT_LOCAL);
UrlParser::UrlPart url_part;
if (state != NULL) {
url_part = *reinterpret_cast<UrlParser::UrlPart*>(state);
} else {
DCHECK(!ctx->is_arg_constant(1));
url_part = UrlParser::get_url_part(StringValue::from_string_val(part));
url_part = UrlParser::get_url_part(StringValue::from_string_val(newPart));
}
StringValue result;
@ -829,7 +833,7 @@ StringVal StringFunctions::parse_url(
// url is malformed, or url_part is invalid.
if (url_part == UrlParser::INVALID) {
std::stringstream ss;
ss << "Invalid URL part: " << AnyValUtil::to_string(part);
ss << "Invalid URL part: " << AnyValUtil::to_string(newPart);
ctx->add_warning(ss.str().c_str());
} else {
std::stringstream ss;

View File

@ -24,10 +24,11 @@ const StringValue UrlParser::_s_url_authority(const_cast<char*>("AUTHORITY"), 9)
const StringValue UrlParser::_s_url_file(const_cast<char*>("FILE"), 4);
const StringValue UrlParser::_s_url_host(const_cast<char*>("HOST"), 4);
const StringValue UrlParser::_s_url_path(const_cast<char*>("PATH"), 4);
const StringValue UrlParser::_s_url_protocol(const_cast<char*>("_s_protocol"), 8);
const StringValue UrlParser::_s_url_protocol(const_cast<char*>("PROTOCOL"), 8);
const StringValue UrlParser::_s_url_query(const_cast<char*>("QUERY"), 5);
const StringValue UrlParser::_s_url_ref(const_cast<char*>("REF"), 3);
const StringValue UrlParser::_s_url_userinfo(const_cast<char*>("USERINFO"), 8);
const StringValue UrlParser::_s_url_port(const_cast<char*>("PORT"), 4);
const StringValue UrlParser::_s_protocol(const_cast<char*>("://"), 3);
const StringValue UrlParser::_s_at(const_cast<char*>("@"), 1);
const StringValue UrlParser::_s_slash(const_cast<char*>("/"), 1);
@ -166,6 +167,36 @@ bool UrlParser::parse_url(const StringValue& url, UrlPart part, StringValue* res
break;
}
case PORT: {
    // Extracts the port: the text after the first ':' that follows the optional
    // "user:pass@" prefix, up to the next '/' (or, failing that, the next '?').
    // Returns false when no ':' is found after the host start.

    // Find '@'.
    int32_t start_pos = _s_at_search.search(&protocol_end);
    if (start_pos < 0) {
        // No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
        start_pos = 0;
    } else {
        // Skip '@'.
        start_pos += _s_at.len;
    }
    StringValue host_start = protocol_end.substring(start_pos);
    // Find ':' separating the host from the port.
    // NOTE(review): the search is not limited to the authority, so a ':' that
    // appears later in the path or query would also match -- confirm whether
    // that case needs handling.
    int32_t end_pos = _s_colon_search.search(&host_start);
    // No port found.
    if (end_pos < 0) {
        return false;
    }
    // end_pos is an offset into host_start (not protocol_end), so the port has to
    // be sliced from host_start; otherwise URLs carrying user info yield the wrong
    // substring.
    StringValue port_start_str = host_start.substring(end_pos + _s_colon.len);
    int32_t port_end_pos = _s_slash_search.search(&port_start_str);
    // If '/' is not found, try to find '?'.
    if (port_end_pos < 0) {
        port_end_pos = _s_question_search.search(&port_start_str);
    }
    *result = port_start_str.substring(0, port_end_pos);
    break;
}
case INVALID:
return false;
}
@ -246,9 +277,12 @@ bool UrlParser::parse_url_key(
UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
// Quick filter on requested URL part, based on first character.
// Hive requires the requested URL part to be all upper case; the part is upper-cased below so that matching is case-insensitive.
switch (part.ptr[0]) {
std::string part_str = part.to_string();
transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper);
StringValue newPart = StringValue(part_str);
switch (newPart.ptr[0]) {
case 'A': {
if (!part.eq(_s_url_authority)) {
if (!newPart.eq(_s_url_authority)) {
return INVALID;
}
@ -256,7 +290,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
}
case 'F': {
if (!part.eq(_s_url_file)) {
if (!newPart.eq(_s_url_file)) {
return INVALID;
}
@ -264,7 +298,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
}
case 'H': {
if (!part.eq(_s_url_host)) {
if (!newPart.eq(_s_url_host)) {
return INVALID;
}
@ -272,17 +306,19 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
}
case 'P': {
if (part.eq(_s_url_path)) {
if (newPart.eq(_s_url_path)) {
return PATH;
} else if (part.eq(_s_url_protocol)) {
} else if (newPart.eq(_s_url_protocol)) {
return PROTOCOL;
} else if (newPart.eq(_s_url_port)) {
return PORT;
} else {
return INVALID;
}
}
case 'Q': {
if (!part.eq(_s_url_query)) {
if (!newPart.eq(_s_url_query)) {
return INVALID;
}
@ -290,7 +326,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
}
case 'R': {
if (!part.eq(_s_url_ref)) {
if (!newPart.eq(_s_url_ref)) {
return INVALID;
}
@ -298,7 +334,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
}
case 'U': {
if (!part.eq(_s_url_userinfo)) {
if (!newPart.eq(_s_url_userinfo)) {
return INVALID;
}

View File

@ -52,7 +52,8 @@ public:
PROTOCOL,
QUERY,
REF,
USERINFO
USERINFO,
PORT
};
// Tries to parse the part from url. Places the result in result.
@ -84,6 +85,7 @@ private:
static const StringValue _s_url_query;
static const StringValue _s_url_ref;
static const StringValue _s_url_userinfo;
static const StringValue _s_url_port;
// Constants used in searching for URL parts.
static const StringValue _s_protocol;
static const StringValue _s_at;

View File

@ -529,6 +529,58 @@ TEST_F(StringFunctionsTest, replace) {
StringFunctions::replace(ctx, StringVal("http://中国hello:9090"), StringVal("中国hello"), StringVal("华夏zhongguo")));
}
// Exercises StringFunctions::parse_url() for each URL part, passing the part
// name in both upper and lower case, plus PORT lookups with and without a port.
TEST_F(StringFunctionsTest, parse_url) {
    struct ParseUrlCase {
        const char* expected; // nullptr means a NULL StringVal is expected
        const char* url;
        const char* part;
    };

    const ParseUrlCase cases[] = {
            {"facebook.com", "http://facebook.com/path/p1.php?query=1", "AUTHORITY"},
            {"facebook.com", "http://facebook.com/path/p1.php?query=1", "authority"},

            {"/a/b/c.php", "http://www.baidu.com:9090/a/b/c.php", "FILE"},
            {"/a/b/c.php", "http://www.baidu.com:9090/a/b/c.php", "file"},

            {"/a/b/c.php", "http://www.baidu.com:9090/a/b/c.php", "PATH"},
            {"/a/b/c.php", "http://www.baidu.com:9090/a/b/c.php", "path"},

            {"www.baidu.com", "http://www.baidu.com:9090", "HOST"},
            {"www.baidu.com", "http://www.baidu.com:9090", "host"},

            {"http", "http://facebook.com/path/p1.php?query=1", "PROTOCOL"},
            {"http", "http://facebook.com/path/p1.php?query=1", "protocol"},

            {"a=b", "http://www.baidu.com:9090?a=b", "QUERY"},
            {"a=b", "http://www.baidu.com:9090?a=b", "query"},

            {nullptr, "http://www.baidu.com:9090?a=b", "REF"},
            {nullptr, "http://www.baidu.com:9090?a=b", "ref"},

            {nullptr, "http://www.baidu.com:9090?a=b", "USERINFO"},
            {nullptr, "http://www.baidu.com:9090?a=b", "userinfo"},

            {"9090", "http://www.baidu.com:9090?a=b", "PORT"},
            {"9090", "http://www.baidu.com:9090/a/b/c?a=b", "PORT"},
            {nullptr, "http://www.baidu.com?a=b", "PORT"},
            {"9090", "http://www.baidu.com:9090?a=b", "port"},
    };

    // Cases run in declaration order; the first mismatch aborts the test.
    for (const ParseUrlCase& c : cases) {
        const StringVal expected =
                (c.expected == nullptr) ? StringVal::null() : StringVal(c.expected);
        ASSERT_EQ(expected,
                  StringFunctions::parse_url(ctx, StringVal(c.url), StringVal(c.part)))
                << "url=" << c.url << ", part=" << c.part;
    }
}
} // namespace doris
int main(int argc, char** argv) {