Limit the max string length to 1024 while collecting column stats to control BE memory usage. (#32470)
This commit is contained in:
@ -82,6 +82,30 @@ public abstract class BaseAnalysisTask {
|
||||
+ "NOW() "
|
||||
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}";
|
||||
|
||||
/**
 * SQL template for sampled column statistics, selected by the analysis tasks
 * when {@code col.getType().isStringType()} is true.
 *
 * <p>The inner query casts the analyzed column to STRING and keeps only
 * {@code SUBSTRING(..., 1, 1024)} of each value (aliased {@code colValue})
 * before grouping, so the GROUP BY works on the truncated values. The
 * reported {@code min} and {@code max} are truncated the same way. Per the
 * accompanying change description, the 1024 limit exists to control BE
 * memory usage while collecting column stats.
 *
 * <p>All {@code ${...}} placeholders (for example {@code ${colName}},
 * {@code ${scaleFactor}}, {@code ${ndvFunction}}, {@code ${dataSizeFunction}},
 * {@code ${sampleHints}}, {@code ${limit}}) are expected to be filled in by
 * the caller before the statement is executed.
 */
protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT "
|
||||
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
|
||||
+ "${catalogId} AS `catalog_id`, "
|
||||
+ "${dbId} AS `db_id`, "
|
||||
+ "${tblId} AS `tbl_id`, "
|
||||
+ "${idxId} AS `idx_id`, "
|
||||
+ "'${colId}' AS `col_id`, "
|
||||
+ "NULL AS `part_id`, "
|
||||
+ "${rowCount} AS `row_count`, "
|
||||
+ "${ndvFunction} as `ndv`, "
|
||||
+ "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, "
|
||||
+ "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
|
||||
+ "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
|
||||
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
|
||||
+ "NOW() "
|
||||
+ "FROM ( "
|
||||
+ " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` "
|
||||
+ " FROM "
|
||||
+ " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` "
|
||||
+ " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} "
|
||||
+ " ${sampleHints} ${limit}) as `t0` "
|
||||
+ " GROUP BY `t0`.`colValue` "
|
||||
+ ") as `t1` ";
|
||||
|
||||
protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT "
|
||||
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
|
||||
+ "${catalogId} AS `catalog_id`, "
|
||||
|
||||
@ -129,7 +129,11 @@ public class ExternalAnalysisTask extends BaseAnalysisTask {
|
||||
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
|
||||
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
|
||||
} else {
|
||||
sb.append(DUJ1_ANALYZE_TEMPLATE);
|
||||
if (col.getType().isStringType()) {
|
||||
sb.append(DUJ1_ANALYZE_STRING_TEMPLATE);
|
||||
} else {
|
||||
sb.append(DUJ1_ANALYZE_TEMPLATE);
|
||||
}
|
||||
params.put("dataSizeFunction", getDataSizeFunction(col, true));
|
||||
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
|
||||
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
|
||||
|
||||
@ -160,7 +160,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
|
||||
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
|
||||
} else {
|
||||
params.put("dataSizeFunction", getDataSizeFunction(col, true));
|
||||
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
|
||||
if (col.getType().isStringType()) {
|
||||
sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE);
|
||||
} else {
|
||||
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
|
||||
}
|
||||
}
|
||||
LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], "
|
||||
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}], "
|
||||
|
||||
@ -159,11 +159,10 @@ public class OlapAnalysisTaskTest {
|
||||
+ " IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, "
|
||||
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`,"
|
||||
+ " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
|
||||
+ "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() "
|
||||
+ "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() "
|
||||
+ "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) "
|
||||
+ "as `count` FROM (SELECT `${colName}` FROM "
|
||||
+ "`catalogName`.`${dbName}`.`${tblName}` "
|
||||
+ " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
|
||||
+ "as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}`"
|
||||
+ " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
|
||||
return;
|
||||
}
|
||||
};
|
||||
@ -183,7 +182,7 @@ public class OlapAnalysisTaskTest {
|
||||
};
|
||||
|
||||
OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
|
||||
olapAnalysisTask.col = new Column("test", PrimitiveType.STRING);
|
||||
olapAnalysisTask.col = new Column("test", PrimitiveType.INT);
|
||||
olapAnalysisTask.tbl = tableIf;
|
||||
AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder();
|
||||
analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
|
||||
@ -322,7 +321,8 @@ public class OlapAnalysisTaskTest {
|
||||
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, "
|
||||
+ "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
|
||||
+ "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() "
|
||||
+ "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
|
||||
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM "
|
||||
+ "(SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user