Limit the max string length to 1024 while collecting column stats to control BE memory usage. (#32470)

This commit is contained in:
Jibing-Li
2024-03-28 17:22:57 +08:00
committed by yiguolei
parent 0499d4013e
commit dcddd88e01
4 changed files with 40 additions and 8 deletions

View File

@ -82,6 +82,30 @@ public abstract class BaseAnalysisTask {
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}";
/**
 * SQL template for sample-based column statistics collection on string columns.
 *
 * <p>The innermost subquery casts the column to STRING and keeps only its first
 * 1024 characters (exposed as {@code colValue}); the middle query groups by that
 * truncated value and counts occurrences; the outer query aggregates those groups
 * into a single statistics row. Truncating before the GROUP BY bounds the size of
 * each grouping key, which is how this template limits BE memory usage while
 * collecting stats.
 *
 * <p>All {@code ${...}} tokens are placeholders that must be substituted by the
 * caller before the SQL is executed.
 *
 * <p>NOTE(review): because grouping happens on the truncated value, distinct
 * strings that share the same first 1024 characters fall into one group. Any
 * {@code ${ndvFunction}} / {@code ${dataSizeFunction}} expression that reads
 * {@code t1} therefore sees truncated keys -- confirm this approximation is
 * acceptable for the callers.
 */
protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT "
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
+ "${catalogId} AS `catalog_id`, "
+ "${dbId} AS `db_id`, "
+ "${tblId} AS `tbl_id`, "
+ "${idxId} AS `idx_id`, "
+ "'${colId}' AS `col_id`, "
+ "NULL AS `part_id`, "
+ "${rowCount} AS `row_count`, "
+ "${ndvFunction} as `ndv`, "
// NULL keys form their own group in t1; sum their counts and scale up to the full table.
+ "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, "
// min/max are stored as strings and capped at 1024 characters as well.
+ "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
+ " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` "
+ " FROM "
// Truncate each value to 1024 characters BEFORE grouping so grouping keys stay bounded.
+ " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` "
+ " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} "
+ " ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`colValue` "
+ ") as `t1` ";
protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT "
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
+ "${catalogId} AS `catalog_id`, "

View File

@ -129,7 +129,11 @@ public class ExternalAnalysisTask extends BaseAnalysisTask {
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
if (col.getType().isStringType()) {
sb.append(DUJ1_ANALYZE_STRING_TEMPLATE);
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
}
params.put("dataSizeFunction", getDataSizeFunction(col, true));
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");

View File

@ -160,7 +160,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
} else {
params.put("dataSizeFunction", getDataSizeFunction(col, true));
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
if (col.getType().isStringType()) {
sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE);
} else {
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
}
LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], "
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}], "

View File

@ -159,11 +159,10 @@ public class OlapAnalysisTaskTest {
+ " IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, "
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`,"
+ " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() "
+ "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) "
+ "as `count` FROM (SELECT `${colName}` FROM "
+ "`catalogName`.`${dbName}`.`${tblName}` "
+ " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
+ "as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}`"
+ " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
return;
}
};
@ -183,7 +182,7 @@ public class OlapAnalysisTaskTest {
};
OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
olapAnalysisTask.col = new Column("test", PrimitiveType.STRING);
olapAnalysisTask.col = new Column("test", PrimitiveType.INT);
olapAnalysisTask.tbl = tableIf;
AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder();
analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
@ -322,7 +321,8 @@ public class OlapAnalysisTaskTest {
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM "
+ "(SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql);
return;
}
};