diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
index faa5890556..29c3f2700c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
@@ -21,34 +21,70 @@ import org.apache.doris.common.UserException;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
-import org.apache.parquet.glob.GlobExpander;
+import org.apache.commons.lang3.StringUtils;
import java.net.URI;
import java.net.URISyntaxException;
+import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
+import java.util.Optional;
import java.util.Set;
+import java.util.stream.Collectors;
/**
* This class represents a fully qualified location in S3 for input/output
- * operations expressed as as URI. This implementation is provided to
- * ensure compatibility with Hadoop Path implementations that may introduce
- * encoding issues with native URI implementation.
+ * operations expressed as a URI.
+ *
+ * For AWS S3, uri common styles should be:
+ * 1. AWS Client Style(Hadoop S3 Style): s3://my-bucket/path/to/file?versionId=abc123&partNumber=77&partNumber=88
+ * or
+ * 2. Virtual Host Style: https://my-bucket.s3.us-west-1.amazonaws.com/resources/doc.txt?versionId=abc123&partNumber=77&partNumber=88
+ * or
+ * 3. Path Style: https://s3.us-west-1.amazonaws.com/my-bucket/resources/doc.txt?versionId=abc123&partNumber=77&partNumber=88
+ *
+ * Regarding the above-mentioned common styles, we can use isPathStyle to control whether to use path style
+ * or virtual host style.
+ * "Virtual host style" is the currently mainstream and recommended approach to use, so the default value of
+ * isPathStyle is false.
+ *
+ * Other Styles:
+ * 1. Virtual Host AWS Client (Hadoop S3) Mixed Style:
+ * s3://my-bucket.s3.us-west-1.amazonaws.com/resources/doc.txt?versionId=abc123&partNumber=77&partNumber=88
+ * or
+ * 2. Path AWS Client (Hadoop S3) Mixed Style:
+ * s3://s3.us-west-1.amazonaws.com/my-bucket/resources/doc.txt?versionId=abc123&partNumber=77&partNumber=88
+ *
+ * For these two styles, we can use isPathStyle and forceParsingByStandardUri
+ * to control whether to use.
+ * Virtual Host AWS Client (Hadoop S3) Mixed Style: isPathStyle = false && forceParsingByStandardUri = true
+ * Path AWS Client (Hadoop S3) Mixed Style: isPathStyle = true && forceParsingByStandardUri = true
+ *
+ * The location is not URL-decoded while parsing: when the incoming location is URL-encoded,
+ * getKey() and getQueryParams() return the encoded strings as they appear in the location.
*/
public class S3URI {
public static final String SCHEME_DELIM = "://";
public static final String PATH_DELIM = "/";
- private static final String QUERY_DELIM = "\\?";
- private static final String FRAGMENT_DELIM = "#";
private static final Set VALID_SCHEMES = ImmutableSet.of("http", "https", "s3", "s3a", "s3n",
- "bos", "oss", "cos", "obs");
+ "bos", "oss", "cos", "obs");
- private String scheme;
- private final String location;
- private final String virtualBucket;
- private final String bucket;
- private final String key;
- private boolean forceVirtualHosted;
+ private static final Set OS_SCHEMES = ImmutableSet.of("s3", "s3a", "s3n",
+ "bos", "oss", "cos", "obs");
+
+ private URI uri;
+
+ private String bucket;
+ private String key;
+
+ private String endpoint;
+
+ private String region;
+
+ private boolean isStandardURL;
+ private boolean isPathStyle;
+ private Map> queryParams;
/**
* Creates a new S3URI based on the bucket and key parsed from the location as defined in:
@@ -59,92 +95,180 @@ public class S3URI {
*
* @param location fully qualified URI
*/
-
public static S3URI create(String location) throws UserException {
- return create(location, false);
+ // Defaults: isPathStyle = false and forceParsingByStandardUri = false.
+ return create(location, false, false);
}
- public static S3URI create(String location, boolean forceVirtualHosted) throws UserException {
- S3URI s3URI = new S3URI(location, forceVirtualHosted);
- return s3URI;
+ /**
+ * Creates a new S3URI from the location without forcing standard-URI parsing.
+ *
+ * @param location fully qualified URI
+ * @param isPathStyle whether a location parsed as a standard URL is read in path style
+ * instead of virtual host style
+ * @throws UserException if the location is null or empty, or cannot be parsed
+ */
+ public static S3URI create(String location, boolean isPathStyle) throws UserException {
+ return create(location, isPathStyle, false);
}
- private S3URI(String location, boolean forceVirtualHosted) throws UserException {
+ /**
+ * Creates a new S3URI from the location.
+ *
+ * @param location fully qualified URI
+ * @param isPathStyle whether a location parsed as a standard URL is read in path style (bucket is the
+ * first path segment) instead of virtual host style (bucket is the first host label)
+ * @param forceParsingByStandardUri whether to parse the location as a standard URL even when its scheme
+ * is one of the object storage schemes (s3, s3a, s3n, bos, oss, cos, obs)
+ * @throws UserException if the location is null or empty, or cannot be parsed
+ */
+ public static S3URI create(String location, boolean isPathStyle, boolean forceParsingByStandardUri)
+ throws UserException {
+ return new S3URI(location, isPathStyle, forceParsingByStandardUri);
+ }
+
+ /**
+ * Parses the location into its parts (bucket, key, query parameters, endpoint and region).
+ *
+ * @throws UserException if the location is null or empty, or if parsing fails
+ */
+ private S3URI(String location, boolean isPathStyle, boolean forceParsingByStandardUri) throws UserException {
if (Strings.isNullOrEmpty(location)) {
throw new UserException("s3 location can not be null");
}
+ // Only a request: parseAwsCliStyleUri() resets this flag to false for AWS client style locations.
+ this.isPathStyle = isPathStyle;
+ parseUri(location, forceParsingByStandardUri);
+ }
+ /**
+ * Validates the location, dispatches to the matching parser and then derives endpoint and region.
+ * Locations with an object storage scheme (s3, s3a, ...) are read in AWS client style unless standard
+ * parsing is forced; every other valid scheme is read as a standard URL.
+ */
+ private void parseUri(String location, boolean forceParsingStandardUri) throws UserException {
+ validateUri(location);
+
+ boolean parseAsStandardUri = forceParsingStandardUri
+ || !OS_SCHEMES.contains(uri.getScheme().toLowerCase());
+ if (parseAsStandardUri) {
+ parseStandardUri();
+ } else {
+ parseAwsCliStyleUri();
+ }
+ parseEndpointAndRegion();
+ }
+
+ /**
+ * Parses the location into {@code uri} and checks that its scheme is one of VALID_SCHEMES
+ * (compared case-insensitively).
+ *
+ * @throws UserException if the location is not a syntactically valid URI, or the scheme is missing
+ * or not supported
+ */
+ private void validateUri(String location) throws UserException {
try {
- // the location need to be normalized to eliminate double "/", or the hadoop aws api
- // won't handle it correctly.
- this.location = new URI(location).normalize().toString();
+ // NOTE(review): unlike the removed code, the URI is not normalized here, so "//" sequences
+ // stay in the path — confirm this is intended.
+ uri = new URI(location);
} catch (URISyntaxException e) {
- throw new UserException("Invalid s3 uri: " + e.getMessage());
+ throw new UserException(e);
+ }
+ if (uri.getScheme() == null || !VALID_SCHEMES.contains(uri.getScheme().toLowerCase())) {
+ throw new UserException("Invalid scheme: " + this.uri);
+ }
+ }
+
+ /**
+ * Reads an AWS client (Hadoop S3) style location such as s3://my-bucket/path/to/file: the bucket is
+ * the URI authority and the key is the raw (still URL-encoded) path without its leading slash.
+ *
+ * @throws UserException if the authority is missing, or the path is empty or just "/"
+ */
+ private void parseAwsCliStyleUri() throws UserException {
+ bucket = uri.getAuthority();
+ if (bucket == null) {
+ throw new UserException("missing bucket: " + uri);
+ }
+ String path = uri.getRawPath();
+ // A path of "" or "/" holds no key.
+ if (path.length() > 1) {
+ key = path.substring(1);
+ } else {
+ throw new UserException("missing key: " + uri);
}
- this.forceVirtualHosted = forceVirtualHosted;
- String[] schemeSplit = this.location.split(SCHEME_DELIM);
- if (schemeSplit.length != 2) {
- throw new UserException("Invalid s3 uri: " + this.location);
+ addQueryParamsIfNeeded();
+
+ // Not a standard URL, so parseEndpointAndRegion() leaves the endpoint unset; the requested
+ // path-style flag does not apply to this style and is cleared.
+ isStandardURL = false;
+ this.isPathStyle = false;
+ }
+
+ /**
+ * Reads the location as a standard URL: collects the query parameters, then extracts bucket and key
+ * in path style or virtual host style depending on {@code isPathStyle}.
+ *
+ * @throws UserException if the URI has no host, or the bucket or key is missing
+ */
+ private void parseStandardUri() throws UserException {
+ if (uri.getHost() == null) {
+ throw new UserException("Invalid S3 URI: no hostname: " + uri);
}
- this.scheme = schemeSplit[0];
- if (!VALID_SCHEMES.contains(scheme.toLowerCase())) {
- throw new UserException("Invalid scheme: " + this.location);
- }
+ addQueryParamsIfNeeded();
- String[] authoritySplit = schemeSplit[1].split(PATH_DELIM, 2);
- if (authoritySplit.length != 2) {
- throw new UserException("Invalid s3 uri: " + this.location);
- }
- if (authoritySplit[1].trim().isEmpty()) {
- throw new UserException("Invalid s3 key: " + this.location);
+ if (isPathStyle) {
+ parsePathStyleUri();
+ } else {
+ parseVirtualHostedStyleUri();
}
+ // Marks the location as a standard URL so that parseEndpointAndRegion() derives the endpoint from it.
+ isStandardURL = true;
+ }
- // Strip query and fragment if they exist
- String path = authoritySplit[1];
- path = path.split(QUERY_DELIM)[0];
- path = path.split(FRAGMENT_DELIM)[0];
- if (this.forceVirtualHosted) {
- // If forceVirtualHosted is true, the s3 client will NOT automatically convert to virtual-hosted style.
- // So we do some convert manually. Eg:
- // endpoint: http://cos.ap-beijing.myqcloud.com
- // bucket/path: my_bucket/file.txt
- // `virtualBucket` will be "my_bucket"
- // `bucket` will be `file.txt`
- // So that when assembling the real endpoint will be: http://my_bucket.cos.ap-beijing.myqcloud.com/file.txt
- this.virtualBucket = authoritySplit[0];
- String[] paths = path.split("/", 2);
- this.bucket = paths[0];
- if (paths.length > 1) {
- key = paths[1];
+ /**
+ * Parses the raw (still URL-encoded) query string of {@code uri} into {@code queryParams}, collecting
+ * the values of a repeated key into one list in order of appearance. Does nothing when the URI has no
+ * query component.
+ *
+ * Each parameter is split at its first '=' only, so a value that itself contains '=' (for example
+ * base64 padding in "token=abc==") is kept whole, and a bare "=" parameter yields an empty key
+ * instead of an ArrayIndexOutOfBoundsException. A parameter without '=' or with an empty value
+ * maps to a null value.
+ */
+ private void addQueryParamsIfNeeded() {
+ String rawQuery = uri.getRawQuery();
+ if (rawQuery == null) {
+ return;
+ }
+ queryParams = splitQueryString(rawQuery).stream()
+ // limit 2: split at the first '=' only; the result always has at least the key element
+ .map((param) -> param.split("=", 2))
+ .map((pair) -> pair.length == 1 || pair[1].isEmpty() ? new String[] {pair[0], null} : pair)
+ .collect(Collectors.groupingBy((pair) -> pair[0],
+ Collectors.mapping((pair) -> pair[1], Collectors.toList())));
+ }
+
+ /**
+ * Splits a query string into its parameters at every '&' character.
+ * Empty segments are kept, so the result always has at least one element
+ * (a single empty string for an empty query string).
+ *
+ * @param queryString the query string to split; must not be null
+ * @return the parameters in order of appearance
+ */
+ private static List splitQueryString(String queryString) {
+ List results = new ArrayList<>();
+ StringBuilder result = new StringBuilder();
+
+ for (int i = 0; i < queryString.length(); ++i) {
+ char character = queryString.charAt(i);
+ if (character != '&') {
+ result.append(character);
} else {
- key = "";
+ // End of one parameter: emit it and start collecting the next one.
+ String param = result.toString();
+ results.add(param);
+ result.setLength(0);
+ }
+ }
+
+ // Emit whatever follows the last '&' (or the whole string when there is none).
+ String param = result.toString();
+ results.add(param);
+ return results;
+ }
+
+ /**
+ * Reads a path-style URL: the bucket is the first segment of the raw (still URL-encoded) path and
+ * the key is everything after the slash that follows it.
+ *
+ * @throws UserException if the path is empty or just "/" (no bucket), or if nothing follows
+ * the bucket segment (no key)
+ */
+ private void parsePathStyleUri() throws UserException {
+ String path = uri.getRawPath();
+
+ if (!StringUtils.isEmpty(path) && !"/".equals(path)) {
+ // Position of the slash that ends the bucket segment; the search skips the leading slash.
+ int index = path.indexOf('/', 1);
+
+ if (index == -1) {
+ // No trailing slash, e.g., "https://s3.amazonaws.com/bucket"
+ bucket = path.substring(1);
+ throw new UserException("missing key: " + uri);
+ } else {
+ bucket = path.substring(1, index);
+ // The slash after the bucket is the last character when no key follows it.
+ if (index != path.length() - 1) {
+ key = path.substring(index + 1);
+ } else {
+ throw new UserException("missing key: " + uri);
+ }
}
} else {
- // If forceVirtualHosted is false, let the s3 client to determine how to covert endpoint, eg:
- // For s3 endpoint(start with "s3."), it will convert to virtual-hosted style.
- // For others, keep as it is (maybe path-style, maybe virtual-hosted style.)
- this.virtualBucket = "";
- this.bucket = authoritySplit[0];
- key = path;
+ throw new UserException("missing bucket: " + this.uri);
}
}
- public List expand(String path) {
- return GlobExpander.expand(path);
+ /**
+ * Reads a virtual-host-style URL: the bucket is the first dot-separated label of the host and the
+ * key is the raw (still URL-encoded) path without its leading slash.
+ *
+ * @throws UserException if the path is empty or just "/", i.e. there is no key
+ */
+ private void parseVirtualHostedStyleUri() throws UserException {
+ String[] hostLabels = uri.getHost().split("\\.");
+ bucket = hostLabels[0];
+
+ String rawPath = uri.getRawPath();
+ if (StringUtils.isEmpty(rawPath) || "/".equals(rawPath)) {
+ throw new UserException("missing key: " + this.uri);
+ }
+ key = rawPath.substring(1);
}
- public String getScheme() {
- return this.scheme;
- }
+ /**
+ * Derives {@code endpoint} and {@code region} from the URI authority.
+ * The endpoint is only set for standard URLs: the whole authority in path style, or the authority
+ * without its first dot-separated label (the bucket) in virtual host style. The region is then
+ * taken from the endpoint labels; it stays unset when the endpoint is null or has a single label.
+ */
+ private void parseEndpointAndRegion() {
+ // parse endpoint
+ if (isStandardURL) {
+ if (isPathStyle) {
+ endpoint = uri.getAuthority();
+ } else { // virtual_host_style
+ if (uri.getAuthority() == null) {
+ endpoint = null;
+ return;
+ }
+ // Limit 2: the first label is the bucket, the remainder is the endpoint.
+ String[] splits = uri.getAuthority().split("\\.", 2);
+ if (splits.length < 2) {
+ endpoint = null;
+ return;
+ }
+ endpoint = splits[1];
+ }
+ } else {
+ endpoint = null;
+ }
+ if (endpoint == null) {
+ return;
+ }
- public String getBucketScheme() {
- return scheme + "://" + bucket;
- }
-
- public String getVirtualBucket() {
- return virtualBucket;
+ // parse region
+ String[] endpointSplits = endpoint.split("\\.");
+ if (endpointSplits.length < 2) {
+ return;
+ }
+ if (endpointSplits[0].contains("oss-")) {
+ // compatible with the endpoint: oss-cn-beijing.aliyuncs.com
+ region = endpointSplits[0];
+ return;
+ }
+ // NOTE(review): this assumes the second label is the region (e.g. s3.us-west-1.amazonaws.com);
+ // for an endpoint without a region label such as s3.amazonaws.com it yields "amazonaws" — confirm.
+ region = endpointSplits[1];
}
/**
@@ -161,15 +285,30 @@ public class S3URI {
return key;
}
- /*
- * @return original, unmodified location
- */
- public String getLocation() {
- return location;
+ public Optional