[SQL Function][Bug] Fix parse_url() bug (#4429)
The parameter 'part' of parse_url function does not support lower case, and parse protocol not right. And This function does not support parse 'port'. This PR tries to make parse_url function case insensitive and support parse 'port'. The issue: #4451
This commit is contained in:
@ -25,6 +25,7 @@
|
||||
#include "runtime/string_value.hpp"
|
||||
#include "runtime/tuple_row.h"
|
||||
#include "util/url_parser.h"
|
||||
#include <algorithm>
|
||||
|
||||
// NOTE: be careful not to use string::append. It is not performant.
|
||||
namespace doris {
|
||||
@ -803,7 +804,7 @@ void StringFunctions::parse_url_prepare(
|
||||
std::stringstream ss;
|
||||
ss << "Invalid URL part: " << AnyValUtil::to_string(*part) << std::endl
|
||||
<< "(Valid URL parts are 'PROTOCOL', 'HOST', 'PATH', 'REF', 'AUTHORITY', 'FILE', "
|
||||
<< "'USERINFO', and 'QUERY')";
|
||||
<< "'USERINFO', 'PORT' and 'QUERY')";
|
||||
ctx->set_error(ss.str().c_str());
|
||||
return;
|
||||
}
|
||||
@ -815,13 +816,16 @@ StringVal StringFunctions::parse_url(
|
||||
if (url.is_null || part.is_null) {
|
||||
return StringVal::null();
|
||||
}
|
||||
std::string part_str = std::string(reinterpret_cast<const char *>(part.ptr), part.len);
|
||||
transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper);
|
||||
StringVal newPart = AnyValUtil::from_string_temp(ctx, part_str);
|
||||
void* state = ctx->get_function_state(FunctionContext::FRAGMENT_LOCAL);
|
||||
UrlParser::UrlPart url_part;
|
||||
if (state != NULL) {
|
||||
url_part = *reinterpret_cast<UrlParser::UrlPart*>(state);
|
||||
} else {
|
||||
DCHECK(!ctx->is_arg_constant(1));
|
||||
url_part = UrlParser::get_url_part(StringValue::from_string_val(part));
|
||||
url_part = UrlParser::get_url_part(StringValue::from_string_val(newPart));
|
||||
}
|
||||
|
||||
StringValue result;
|
||||
@ -829,7 +833,7 @@ StringVal StringFunctions::parse_url(
|
||||
// url is malformed, or url_part is invalid.
|
||||
if (url_part == UrlParser::INVALID) {
|
||||
std::stringstream ss;
|
||||
ss << "Invalid URL part: " << AnyValUtil::to_string(part);
|
||||
ss << "Invalid URL part: " << AnyValUtil::to_string(newPart);
|
||||
ctx->add_warning(ss.str().c_str());
|
||||
} else {
|
||||
std::stringstream ss;
|
||||
|
||||
@ -24,10 +24,11 @@ const StringValue UrlParser::_s_url_authority(const_cast<char*>("AUTHORITY"), 9)
|
||||
const StringValue UrlParser::_s_url_file(const_cast<char*>("FILE"), 4);
|
||||
const StringValue UrlParser::_s_url_host(const_cast<char*>("HOST"), 4);
|
||||
const StringValue UrlParser::_s_url_path(const_cast<char*>("PATH"), 4);
|
||||
const StringValue UrlParser::_s_url_protocol(const_cast<char*>("_s_protocol"), 8);
|
||||
const StringValue UrlParser::_s_url_protocol(const_cast<char*>("PROTOCOL"), 8);
|
||||
const StringValue UrlParser::_s_url_query(const_cast<char*>("QUERY"), 5);
|
||||
const StringValue UrlParser::_s_url_ref(const_cast<char*>("REF"), 3);
|
||||
const StringValue UrlParser::_s_url_userinfo(const_cast<char*>("USERINFO"), 8);
|
||||
const StringValue UrlParser::_s_url_port(const_cast<char*>("PORT"), 4);
|
||||
const StringValue UrlParser::_s_protocol(const_cast<char*>("://"), 3);
|
||||
const StringValue UrlParser::_s_at(const_cast<char*>("@"), 1);
|
||||
const StringValue UrlParser::_s_slash(const_cast<char*>("/"), 1);
|
||||
@ -166,6 +167,36 @@ bool UrlParser::parse_url(const StringValue& url, UrlPart part, StringValue* res
|
||||
break;
|
||||
}
|
||||
|
||||
case PORT: {
|
||||
// Find '@'.
|
||||
int32_t start_pos = _s_at_search.search(&protocol_end);
|
||||
|
||||
if (start_pos < 0) {
|
||||
// No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
|
||||
start_pos = 0;
|
||||
} else {
|
||||
// Skip '@'.
|
||||
start_pos += _s_at.len;
|
||||
}
|
||||
|
||||
StringValue host_start = protocol_end.substring(start_pos);
|
||||
// Find ':' to strip out port.
|
||||
int32_t end_pos = _s_colon_search.search(&host_start);
|
||||
//no port found
|
||||
if (end_pos < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
StringValue port_start_str = protocol_end.substring(end_pos + _s_colon.len);
|
||||
int32_t port_end_pos = _s_slash_search.search(&port_start_str);
|
||||
//if '/' not found, try to find '?'
|
||||
if (port_end_pos < 0) {
|
||||
port_end_pos = _s_question_search.search(&port_start_str);
|
||||
}
|
||||
*result = port_start_str.substring(0, port_end_pos);
|
||||
break;
|
||||
}
|
||||
|
||||
case INVALID:
|
||||
return false;
|
||||
}
|
||||
@ -246,9 +277,12 @@ bool UrlParser::parse_url_key(
|
||||
UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
|
||||
// Quick filter on requested URL part, based on first character.
|
||||
// Hive requires the requested URL part to be all upper case.
|
||||
switch (part.ptr[0]) {
|
||||
std::string part_str = part.to_string();
|
||||
transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper);
|
||||
StringValue newPart = StringValue(part_str);
|
||||
switch (newPart.ptr[0]) {
|
||||
case 'A': {
|
||||
if (!part.eq(_s_url_authority)) {
|
||||
if (!newPart.eq(_s_url_authority)) {
|
||||
return INVALID;
|
||||
}
|
||||
|
||||
@ -256,7 +290,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
|
||||
}
|
||||
|
||||
case 'F': {
|
||||
if (!part.eq(_s_url_file)) {
|
||||
if (!newPart.eq(_s_url_file)) {
|
||||
return INVALID;
|
||||
}
|
||||
|
||||
@ -264,7 +298,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
|
||||
}
|
||||
|
||||
case 'H': {
|
||||
if (!part.eq(_s_url_host)) {
|
||||
if (!newPart.eq(_s_url_host)) {
|
||||
return INVALID;
|
||||
}
|
||||
|
||||
@ -272,17 +306,19 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
|
||||
}
|
||||
|
||||
case 'P': {
|
||||
if (part.eq(_s_url_path)) {
|
||||
if (newPart.eq(_s_url_path)) {
|
||||
return PATH;
|
||||
} else if (part.eq(_s_url_protocol)) {
|
||||
} else if (newPart.eq(_s_url_protocol)) {
|
||||
return PROTOCOL;
|
||||
} else if (newPart.eq(_s_url_port)) {
|
||||
return PORT;
|
||||
} else {
|
||||
return INVALID;
|
||||
}
|
||||
}
|
||||
|
||||
case 'Q': {
|
||||
if (!part.eq(_s_url_query)) {
|
||||
if (!newPart.eq(_s_url_query)) {
|
||||
return INVALID;
|
||||
}
|
||||
|
||||
@ -290,7 +326,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
|
||||
}
|
||||
|
||||
case 'R': {
|
||||
if (!part.eq(_s_url_ref)) {
|
||||
if (!newPart.eq(_s_url_ref)) {
|
||||
return INVALID;
|
||||
}
|
||||
|
||||
@ -298,7 +334,7 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
|
||||
}
|
||||
|
||||
case 'U': {
|
||||
if (!part.eq(_s_url_userinfo)) {
|
||||
if (!newPart.eq(_s_url_userinfo)) {
|
||||
return INVALID;
|
||||
}
|
||||
|
||||
|
||||
@ -52,7 +52,8 @@ public:
|
||||
PROTOCOL,
|
||||
QUERY,
|
||||
REF,
|
||||
USERINFO
|
||||
USERINFO,
|
||||
PORT
|
||||
};
|
||||
|
||||
// Tries to parse the part from url. Places the result in result.
|
||||
@ -84,6 +85,7 @@ private:
|
||||
static const StringValue _s_url_query;
|
||||
static const StringValue _s_url_ref;
|
||||
static const StringValue _s_url_userinfo;
|
||||
static const StringValue _s_url_port;
|
||||
// Constants used in searching for URL parts.
|
||||
static const StringValue _s_protocol;
|
||||
static const StringValue _s_at;
|
||||
|
||||
@ -529,6 +529,58 @@ TEST_F(StringFunctionsTest, replace) {
|
||||
StringFunctions::replace(ctx, StringVal("http://中国hello:9090"), StringVal("中国hello"), StringVal("华夏zhongguo")));
|
||||
}
|
||||
|
||||
TEST_F(StringFunctionsTest, parse_url) {
|
||||
ASSERT_EQ(StringVal("facebook.com"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), StringVal("AUTHORITY")));
|
||||
ASSERT_EQ(StringVal("facebook.com"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), StringVal("authority")));
|
||||
|
||||
ASSERT_EQ(StringVal("/a/b/c.php"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), StringVal("FILE")));
|
||||
ASSERT_EQ(StringVal("/a/b/c.php"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), StringVal("file")));
|
||||
|
||||
ASSERT_EQ(StringVal("/a/b/c.php"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), StringVal("PATH")));
|
||||
ASSERT_EQ(StringVal("/a/b/c.php"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c.php"), StringVal("path")));
|
||||
|
||||
ASSERT_EQ(StringVal("www.baidu.com"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090"), StringVal("HOST")));
|
||||
ASSERT_EQ(StringVal("www.baidu.com"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090"), StringVal("host")));
|
||||
|
||||
ASSERT_EQ(StringVal("http"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), StringVal("PROTOCOL")));
|
||||
ASSERT_EQ(StringVal("http"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://facebook.com/path/p1.php?query=1"), StringVal("protocol")));
|
||||
|
||||
ASSERT_EQ(StringVal("a=b"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("QUERY")));
|
||||
ASSERT_EQ(StringVal("a=b"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("query")));
|
||||
|
||||
ASSERT_EQ(StringVal::null(),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("REF")));
|
||||
ASSERT_EQ(StringVal::null(),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("ref")));
|
||||
|
||||
ASSERT_EQ(StringVal::null(),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("USERINFO")));
|
||||
ASSERT_EQ(StringVal::null(),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("userinfo")));
|
||||
|
||||
ASSERT_EQ(StringVal("9090"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("PORT")));
|
||||
ASSERT_EQ(StringVal("9090"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090/a/b/c?a=b"), StringVal("PORT")));
|
||||
ASSERT_EQ(StringVal::null(),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com?a=b"), StringVal("PORT")));
|
||||
ASSERT_EQ(StringVal("9090"),
|
||||
StringFunctions::parse_url(ctx, StringVal("http://www.baidu.com:9090?a=b"), StringVal("port")));
|
||||
|
||||
}
|
||||
|
||||
} // namespace doris
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
Reference in New Issue
Block a user