The parameter 'part' of parse_url function does not support lower case, and parse protocol not right. And This function does not support parse 'port'. This PR tries to make parse_url function case insensitive and support parse 'port'. The issue: #4451
350 lines
10 KiB
C++
350 lines
10 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include "util/url_parser.h"
|
|
#include "runtime/string_value.hpp"
|
|
|
|
namespace doris {
|
|
|
|
const StringValue UrlParser::_s_url_authority(const_cast<char*>("AUTHORITY"), 9);
|
|
const StringValue UrlParser::_s_url_file(const_cast<char*>("FILE"), 4);
|
|
const StringValue UrlParser::_s_url_host(const_cast<char*>("HOST"), 4);
|
|
const StringValue UrlParser::_s_url_path(const_cast<char*>("PATH"), 4);
|
|
const StringValue UrlParser::_s_url_protocol(const_cast<char*>("PROTOCOL"), 8);
|
|
const StringValue UrlParser::_s_url_query(const_cast<char*>("QUERY"), 5);
|
|
const StringValue UrlParser::_s_url_ref(const_cast<char*>("REF"), 3);
|
|
const StringValue UrlParser::_s_url_userinfo(const_cast<char*>("USERINFO"), 8);
|
|
const StringValue UrlParser::_s_url_port(const_cast<char*>("PORT"), 4);
|
|
const StringValue UrlParser::_s_protocol(const_cast<char*>("://"), 3);
|
|
const StringValue UrlParser::_s_at(const_cast<char*>("@"), 1);
|
|
const StringValue UrlParser::_s_slash(const_cast<char*>("/"), 1);
|
|
const StringValue UrlParser::_s_colon(const_cast<char*>(":"), 1);
|
|
const StringValue UrlParser::_s_question(const_cast<char*>("?"), 1);
|
|
const StringValue UrlParser::_s_hash(const_cast<char*>("#"), 1);
|
|
const StringSearch UrlParser::_s_protocol_search(&_s_protocol);
|
|
const StringSearch UrlParser::_s_at_search(&_s_at);
|
|
const StringSearch UrlParser::_s_slash_search(&_s_slash);
|
|
const StringSearch UrlParser::_s_colon_search(&_s_colon);
|
|
const StringSearch UrlParser::_s_question_search(&_s_question);
|
|
const StringSearch UrlParser::_s_hash_search(&_s_hash);
|
|
|
|
bool UrlParser::parse_url(const StringValue& url, UrlPart part, StringValue* result) {
|
|
result->ptr = NULL;
|
|
result->len = 0;
|
|
// Remove leading and trailing spaces.
|
|
StringValue trimmed_url = url.trim();
|
|
|
|
// All parts require checking for the _s_protocol.
|
|
int32_t protocol_pos = _s_protocol_search.search(&trimmed_url);
|
|
if (protocol_pos < 0) {
|
|
return false;
|
|
}
|
|
|
|
// Positioned to first char after '://'.
|
|
StringValue protocol_end = trimmed_url.substring(protocol_pos + _s_protocol.len);
|
|
|
|
switch (part) {
|
|
case AUTHORITY: {
|
|
// Find first '/'.
|
|
int32_t end_pos = _s_slash_search.search(&protocol_end);
|
|
*result = protocol_end.substring(0, end_pos);
|
|
break;
|
|
}
|
|
|
|
case FILE:
|
|
case PATH: {
|
|
// Find first '/'.
|
|
int32_t start_pos = _s_slash_search.search(&protocol_end);
|
|
|
|
if (start_pos < 0) {
|
|
// Return empty string. This is what Hive does.
|
|
return true;
|
|
}
|
|
|
|
StringValue path_start = protocol_end.substring(start_pos);
|
|
int32_t end_pos;
|
|
|
|
if (part == FILE) {
|
|
// End _s_at '#'.
|
|
end_pos = _s_hash_search.search(&path_start);
|
|
} else {
|
|
// End string _s_at next '?' or '#'.
|
|
end_pos = _s_question_search.search(&path_start);
|
|
|
|
if (end_pos < 0) {
|
|
// No '?' was found, look for '#'.
|
|
end_pos = _s_hash_search.search(&path_start);
|
|
}
|
|
}
|
|
|
|
*result = path_start.substring(0, end_pos);
|
|
break;
|
|
}
|
|
|
|
case HOST: {
|
|
// Find '@'.
|
|
int32_t start_pos = _s_at_search.search(&protocol_end);
|
|
|
|
if (start_pos < 0) {
|
|
// No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
|
|
start_pos = 0;
|
|
} else {
|
|
// Skip '@'.
|
|
start_pos += _s_at.len;
|
|
}
|
|
|
|
StringValue host_start = protocol_end.substring(start_pos);
|
|
// Find ':' to strip out port.
|
|
int32_t end_pos = _s_colon_search.search(&host_start);
|
|
|
|
if (end_pos < 0) {
|
|
// No port was given. search for '/' to determine ending position.
|
|
end_pos = _s_slash_search.search(&host_start);
|
|
}
|
|
|
|
*result = host_start.substring(0, end_pos);
|
|
break;
|
|
}
|
|
|
|
case PROTOCOL: {
|
|
*result = trimmed_url.substring(0, protocol_pos);
|
|
break;
|
|
}
|
|
|
|
case QUERY: {
|
|
// Find first '?'.
|
|
int32_t start_pos = _s_question_search.search(&protocol_end);
|
|
|
|
if (start_pos < 0) {
|
|
// Indicate no query was found.
|
|
return false;
|
|
}
|
|
|
|
StringValue query_start = protocol_end.substring(start_pos + _s_question.len);
|
|
// End string _s_at next '#'.
|
|
int32_t end_pos = _s_hash_search.search(&query_start);
|
|
*result = query_start.substring(0, end_pos);
|
|
break;
|
|
}
|
|
|
|
case REF: {
|
|
// Find '#'.
|
|
int32_t start_pos = _s_hash_search.search(&protocol_end);
|
|
|
|
if (start_pos < 0) {
|
|
// Indicate no user and pass were given.
|
|
return false;
|
|
}
|
|
|
|
*result = protocol_end.substring(start_pos + _s_hash.len);
|
|
break;
|
|
}
|
|
|
|
case USERINFO: {
|
|
// Find '@'.
|
|
int32_t end_pos = _s_at_search.search(&protocol_end);
|
|
|
|
if (end_pos < 0) {
|
|
// Indicate no user and pass were given.
|
|
return false;
|
|
}
|
|
|
|
*result = protocol_end.substring(0, end_pos);
|
|
break;
|
|
}
|
|
|
|
case PORT: {
|
|
// Find '@'.
|
|
int32_t start_pos = _s_at_search.search(&protocol_end);
|
|
|
|
if (start_pos < 0) {
|
|
// No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
|
|
start_pos = 0;
|
|
} else {
|
|
// Skip '@'.
|
|
start_pos += _s_at.len;
|
|
}
|
|
|
|
StringValue host_start = protocol_end.substring(start_pos);
|
|
// Find ':' to strip out port.
|
|
int32_t end_pos = _s_colon_search.search(&host_start);
|
|
//no port found
|
|
if (end_pos < 0) {
|
|
return false;
|
|
}
|
|
|
|
StringValue port_start_str = protocol_end.substring(end_pos + _s_colon.len);
|
|
int32_t port_end_pos = _s_slash_search.search(&port_start_str);
|
|
//if '/' not found, try to find '?'
|
|
if (port_end_pos < 0) {
|
|
port_end_pos = _s_question_search.search(&port_start_str);
|
|
}
|
|
*result = port_start_str.substring(0, port_end_pos);
|
|
break;
|
|
}
|
|
|
|
case INVALID:
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool UrlParser::parse_url_key(
|
|
const StringValue& url,
|
|
UrlPart part,
|
|
const StringValue& key,
|
|
StringValue* result) {
|
|
// Part must be query to ask for a specific query key.
|
|
if (part != QUERY) {
|
|
return false;
|
|
}
|
|
|
|
// Remove leading and trailing spaces.
|
|
StringValue trimmed_url = url.trim();
|
|
|
|
// Search for the key in the url, ignoring malformed URLs for now.
|
|
StringSearch key_search(&key);
|
|
|
|
while (trimmed_url.len > 0) {
|
|
// Search for the key in the current substring.
|
|
int32_t key_pos = key_search.search(&trimmed_url);
|
|
bool match = true;
|
|
|
|
if (key_pos < 0) {
|
|
return false;
|
|
}
|
|
|
|
// Key pos must be != 0 because it must be preceded by a '?' or a '&'.
|
|
// Check that the char before key_pos is either '?' or '&'.
|
|
if (key_pos == 0 ||
|
|
(trimmed_url.ptr[key_pos - 1] != '?' && trimmed_url.ptr[key_pos - 1] != '&')) {
|
|
match = false;
|
|
}
|
|
|
|
// Advance substring beyond matching key.
|
|
trimmed_url = trimmed_url.substring(key_pos + key.len);
|
|
|
|
if (!match) {
|
|
continue;
|
|
}
|
|
|
|
if (trimmed_url.len <= 0) {
|
|
break;
|
|
}
|
|
|
|
// Next character must be '=', otherwise the match cannot be a key in the query part.
|
|
if (trimmed_url.ptr[0] != '=') {
|
|
continue;
|
|
}
|
|
|
|
int32_t pos = 1;
|
|
|
|
// Find ending position of key's value by matching '#' or '&'.
|
|
while (pos < trimmed_url.len) {
|
|
switch (trimmed_url.ptr[pos]) {
|
|
case '#':
|
|
case '&':
|
|
*result = trimmed_url.substring(1, pos - 1);
|
|
return true;
|
|
}
|
|
|
|
++pos;
|
|
}
|
|
|
|
// Ending position is end of string.
|
|
*result = trimmed_url.substring(1);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
|
|
// Quick filter on requested URL part, based on first character.
|
|
// Hive requires the requested URL part to be all upper case.
|
|
std::string part_str = part.to_string();
|
|
transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper);
|
|
StringValue newPart = StringValue(part_str);
|
|
switch (newPart.ptr[0]) {
|
|
case 'A': {
|
|
if (!newPart.eq(_s_url_authority)) {
|
|
return INVALID;
|
|
}
|
|
|
|
return AUTHORITY;
|
|
}
|
|
|
|
case 'F': {
|
|
if (!newPart.eq(_s_url_file)) {
|
|
return INVALID;
|
|
}
|
|
|
|
return FILE;
|
|
}
|
|
|
|
case 'H': {
|
|
if (!newPart.eq(_s_url_host)) {
|
|
return INVALID;
|
|
}
|
|
|
|
return HOST;
|
|
}
|
|
|
|
case 'P': {
|
|
if (newPart.eq(_s_url_path)) {
|
|
return PATH;
|
|
} else if (newPart.eq(_s_url_protocol)) {
|
|
return PROTOCOL;
|
|
} else if (newPart.eq(_s_url_port)) {
|
|
return PORT;
|
|
} else {
|
|
return INVALID;
|
|
}
|
|
}
|
|
|
|
case 'Q': {
|
|
if (!newPart.eq(_s_url_query)) {
|
|
return INVALID;
|
|
}
|
|
|
|
return QUERY;
|
|
}
|
|
|
|
case 'R': {
|
|
if (!newPart.eq(_s_url_ref)) {
|
|
return INVALID;
|
|
}
|
|
|
|
return REF;
|
|
}
|
|
|
|
case 'U': {
|
|
if (!newPart.eq(_s_url_userinfo)) {
|
|
return INVALID;
|
|
}
|
|
|
|
return USERINFO;
|
|
}
|
|
|
|
default:
|
|
return INVALID;
|
|
}
|
|
}
|
|
|
|
}
|