[Opt](orc) Optimize merged IO when the ORC reader reads multiple tiny stripes. (#42004) (#44239)

bp #42004

Co-authored-by: kaka11chen <kaka11.chen@gmail.com>
This commit is contained in:
daidai
2024-11-22 11:01:41 +08:00
committed by GitHub
parent 75f8323f45
commit 702abbff0f
15 changed files with 3043 additions and 17 deletions

View File

@ -447,6 +447,12 @@ public class SessionVariable implements Serializable, Writable {
public static final String ENABLE_ORC_LAZY_MAT = "enable_orc_lazy_materialization";
// Session variable names for the ORC tiny-stripe merged-read optimization.
// Each constant is used as the `name` of a @VariableMgr.VarAttr-annotated field in this class.
// Stripe size (bytes) below which a stripe is treated as a tiny stripe.
public static final String ORC_TINY_STRIPE_THRESHOLD_BYTES = "orc_tiny_stripe_threshold_bytes";
// Maximum size (bytes) of one merged IO request.
public static final String ORC_ONCE_MAX_READ_BYTES = "orc_once_max_read_bytes";
// Maximum gap (bytes) between two tiny stripes that may still be merged into one IO.
public static final String ORC_MAX_MERGE_DISTANCE_BYTES = "orc_max_merge_distance_bytes";
public static final String ENABLE_PARQUET_FILTER_BY_MIN_MAX = "enable_parquet_filter_by_min_max";
public static final String ENABLE_ORC_FILTER_BY_MIN_MAX = "enable_orc_filter_by_min_max";
@ -1648,6 +1654,46 @@ public class SessionVariable implements Serializable, Writable {
public boolean enableOrcLazyMat = true;
// Size threshold in bytes below which an ORC stripe is considered "tiny".
// Defaults to 8 * 1024 * 1024 bytes; per the description, 0 turns the optimization off.
// The description refers to the variable by its real name (`orc_tiny_stripe_threshold_bytes`)
// and the English fragments end with a space so the concatenated text reads correctly.
@VariableMgr.VarAttr(
        name = ORC_TINY_STRIPE_THRESHOLD_BYTES,
        description = {"在orc文件中如果一个stripe的字节大小小于`orc_tiny_stripe_threshold_bytes`,"
                + "我们认为该stripe为 tiny stripe。对于多个连续的tiny stripe我们会进行读取优化,即一次性读多个tiny stripe."
                + "如果你不想使用该优化,可以将该值设置为0。默认为 8M。",
                "In an orc file, if the byte size of a stripe is less than `orc_tiny_stripe_threshold_bytes`, "
                + "we consider the stripe to be a tiny stripe. For multiple consecutive tiny stripes, "
                + "we will perform read optimization, that is, read multiple tiny stripes at a time. "
                + "If you do not want to use this optimization, you can set this value to 0. "
                + "The default is 8M."},
        needForward = true,
        setter = "setOrcTinyStripeThresholdBytes")
public long orcTinyStripeThresholdBytes = 8L * 1024L * 1024L;
// Upper bound in bytes for a single merged IO request when tiny stripes are read together.
// Defaults to 8 * 1024 * 1024 bytes. The description advises against setting it below
// `orc_tiny_stripe_threshold_bytes`; the setter visible in this file only rejects negative values.
@VariableMgr.VarAttr(
        name = ORC_ONCE_MAX_READ_BYTES,
        description = {"在使用tiny stripe读取优化的时候,会对多个tiny stripe合并成一次IO,"
                + "该参数用来控制每次IO请求的最大字节大小。你不应该将值设置的小于`orc_tiny_stripe_threshold_bytes`。默认为 8M。",
                "When using tiny stripe read optimization, multiple tiny stripes will be merged into one IO. "
                + "This parameter is used to control the maximum byte size of each IO request. "
                + "You should not set the value less than `orc_tiny_stripe_threshold_bytes`. "
                + "The default is 8M."},
        needForward = true,
        setter = "setOrcOnceMaxReadBytes")
public long orcOnceMaxReadBytes = 8L * 1024L * 1024L;
// Largest gap in bytes between two tiny stripes that may still be merged into one IO request.
// Defaults to 1024 * 1024 bytes. English fragments end with a space so the concatenated
// description does not run words together.
@VariableMgr.VarAttr(
        name = ORC_MAX_MERGE_DISTANCE_BYTES,
        description = {"在使用tiny stripe读取优化的时候,由于tiny stripe并不一定连续。"
                + "当两个tiny stripe之间距离大于该参数时,我们不会将其合并成一次IO。默认为 1M。",
                "When using tiny stripe read optimization, since tiny stripes are not necessarily continuous, "
                + "when the distance between two tiny stripes is greater than this parameter, "
                + "we will not merge them into one IO. The default value is 1M."},
        needForward = true,
        setter = "setOrcMaxMergeDistanceBytes")
public long orcMaxMergeDistanceBytes = 1024L * 1024L;
@VariableMgr.VarAttr(
name = ENABLE_PARQUET_FILTER_BY_MIN_MAX,
description = {"控制 parquet reader 是否启用 min-max 值过滤。默认为 true。",
@ -2662,6 +2708,32 @@ public class SessionVariable implements Serializable, Writable {
this.parallelExecInstanceNum = val;
}
/**
 * Setter named by the {@code VarAttr} annotation of {@code orc_tiny_stripe_threshold_bytes}.
 *
 * @param value textual form of the new threshold in bytes
 * @throws Exception if {@code value} parses to a number below 0
 */
public void setOrcTinyStripeThresholdBytes(String value) throws Exception {
    // The field is only written when validation succeeds; 0 is the smallest accepted value.
    this.orcTinyStripeThresholdBytes = checkFieldLongValue(ORC_TINY_STRIPE_THRESHOLD_BYTES, 0, value);
}
/**
 * Setter named by the {@code VarAttr} annotation of {@code orc_once_max_read_bytes}.
 *
 * @param value textual form of the new per-IO byte limit
 * @throws Exception if {@code value} parses to a number below 0
 */
public void setOrcOnceMaxReadBytes(String value) throws Exception {
    // The field is only written when validation succeeds; 0 is the smallest accepted value.
    this.orcOnceMaxReadBytes = checkFieldLongValue(ORC_ONCE_MAX_READ_BYTES, 0, value);
}
/**
 * Setter named by the {@code VarAttr} annotation of {@code orc_max_merge_distance_bytes}.
 *
 * @param value textual form of the new maximum merge distance in bytes
 * @throws Exception if {@code value} parses to a number below 0
 */
public void setOrcMaxMergeDistanceBytes(String value) throws Exception {
    // The field is only written when validation succeeds; 0 is the smallest accepted value.
    this.orcMaxMergeDistanceBytes = checkFieldLongValue(ORC_MAX_MERGE_DISTANCE_BYTES, 0, value);
}
/**
 * Parses {@code value} as a {@code long} and verifies that it is not below {@code minValue}.
 *
 * @param variableName name of the session variable being set; used only in error messages
 * @param minValue smallest accepted value (inclusive)
 * @param value textual value to parse
 * @return the parsed value
 * @throws NumberFormatException if {@code value} is {@code null} or not a parsable long;
 *         the message names the variable and the original failure is kept as the cause
 * @throws Exception if the parsed value is less than {@code minValue}
 */
private long checkFieldLongValue(String variableName, long minValue, String value) throws Exception {
    final long parsed;
    try {
        parsed = Long.parseLong(value);
    } catch (NumberFormatException e) {
        // Keep the exception type callers already see, but say which variable was being set.
        NumberFormatException described = new NumberFormatException(
                variableName + " value should be a valid long integer, you set value is: " + value);
        described.initCause(e);
        throw described;
    }
    if (parsed < minValue) {
        throw new Exception(
                variableName + " value should greater than or equal " + minValue
                        + ", you set value is: " + value);
    }
    return parsed;
}
private int checkFieldValue(String variableName, int minValue, String value) throws Exception {
int val = Integer.valueOf(value);
if (val < minValue) {
@ -3684,6 +3756,11 @@ public class SessionVariable implements Serializable, Writable {
tResult.setAdaptivePipelineTaskSerialReadOnLimit(adaptivePipelineTaskSerialReadOnLimit);
tResult.setInListValueCountThreshold(inListValueCountThreshold);
tResult.setEnableAutoCreateWhenOverwrite(enableAutoCreateWhenOverwrite);
tResult.setOrcTinyStripeThresholdBytes(orcTinyStripeThresholdBytes);
tResult.setOrcMaxMergeDistanceBytes(orcMaxMergeDistanceBytes);
tResult.setOrcOnceMaxReadBytes(orcOnceMaxReadBytes);
return tResult;
}