Revert "[fix](auto-partition) Fix auto partition concurrent conflict (#26166)" (#26448)

This reverts commit f22611769944e78c28f1b0a1eeb7b7414a16e8db.
This commit is contained in:
zclllyybb
2023-11-06 16:39:19 +08:00
committed by GitHub
parent c0d9a8d53c
commit 1a83a39aec
12 changed files with 41 additions and 259 deletions

View File

@ -84,7 +84,6 @@ void TabletsChannel::_init_profile(RuntimeProfile* profile) {
_slave_replica_timer = ADD_TIMER(_profile, "SlaveReplicaTime");
_add_batch_timer = ADD_TIMER(_profile, "AddBatchTime");
_write_block_timer = ADD_TIMER(_profile, "WriteBlockTime");
_incremental_open_timer = ADD_TIMER(_profile, "IncrementalOpenTabletTime");
_memory_usage_counter = memory_usage->AddHighWaterMarkCounter("Total", TUnit::BYTES);
_write_memory_usage_counter = memory_usage->AddHighWaterMarkCounter("Write", TUnit::BYTES);
_flush_memory_usage_counter = memory_usage->AddHighWaterMarkCounter("Flush", TUnit::BYTES);
@ -121,14 +120,13 @@ Status TabletsChannel::open(const PTabletWriterOpenRequest& request) {
}
Status TabletsChannel::incremental_open(const PTabletWriterOpenRequest& params) {
SCOPED_TIMER(_incremental_open_timer);
if (_state == kInitialized) { // haven't opened
return open(params);
}
std::lock_guard<std::mutex> l(_lock);
std::vector<SlotDescriptor*>* index_slots = nullptr;
int32_t schema_hash = 0;
for (const auto& index : _schema->indexes()) {
for (auto& index : _schema->indexes()) {
if (index->index_id == _index_id) {
index_slots = &index->slots;
schema_hash = index->schema_hash;
@ -139,12 +137,14 @@ Status TabletsChannel::incremental_open(const PTabletWriterOpenRequest& params)
return Status::InternalError("unknown index id, key={}", _key.to_string());
}
// update tablets
std::vector<int64_t> tablet_ids;
tablet_ids.reserve(params.tablets_size());
size_t incremental_tablet_num = 0;
std::stringstream ss;
ss << "LocalTabletsChannel txn_id: " << _txn_id << " load_id: " << print_id(params.id())
<< " incremental open delta writer: ";
for (const auto& tablet : params.tablets()) {
for (auto& tablet : params.tablets()) {
if (_tablet_writers.find(tablet.tablet_id()) != _tablet_writers.end()) {
continue;
}

View File

@ -196,7 +196,6 @@ private:
RuntimeProfile::Counter* _slave_replica_timer = nullptr;
RuntimeProfile::Counter* _add_batch_timer = nullptr;
RuntimeProfile::Counter* _write_block_timer = nullptr;
RuntimeProfile::Counter* _incremental_open_timer = nullptr;
};
template <typename Request>

View File

@ -167,7 +167,7 @@ Default:100
the max txn number which bdbje can rollback when trying to rejoin the group
#### `grpc_threadmgr_threads_nums`
### `grpc_threadmgr_threads_nums`
Default: 4096
@ -2763,9 +2763,3 @@ Forbid LocalDeployManager drop nodes to prevent errors in the cluster.info file
Default: mysql
To ensure compatibility with the MySQL ecosystem, Doris includes a built-in database called mysql. If this database conflicts with a user's own database, please modify this field to replace the name of the Doris built-in MySQL database with a different name.
#### `max_auto_partition_num`
Default value: 2000
For auto-partitioned tables to prevent users from accidentally creating a large number of partitions, the number of partitions allowed per OLAP table is `max_auto_partition_num`. Default 2000.

View File

@ -173,7 +173,7 @@ Doris 元数据将保存在这里。 强烈建议将此目录的存储为:
元数据会同步写入到多个 Follower FE,这个参数用于控制 Master FE 等待 Follower FE 发送 ack 的超时时间。当写入的数据较大时,可能 ack 时间较长,如果超时,会导致写元数据失败,FE 进程退出。此时可以适当调大这个参数。
#### `grpc_threadmgr_threads_nums`
### `grpc_threadmgr_threads_nums`
默认值: 4096
@ -2759,12 +2759,6 @@ show data (其他用法:HELP SHOW DATA)
#### `mysqldb_replace_name`
默认值:mysql
Default: mysql
Doris 为了兼用 mysql 周边工具生态,会内置一个名为 mysql 的数据库,如果该数据库与用户自建数据库冲突,请修改这个字段,为 doris 内置的 mysql database 更换一个名字
#### `max_auto_partition_num`
默认值:2000
对于自动分区表,防止用户意外创建大量分区,每个OLAP表允许的分区数量为`max_auto_partition_num`。默认2000。

View File

@ -2279,11 +2279,4 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, masterOnly = true)
public static int publish_topic_info_interval_ms = 30000; // 30s
@ConfField(mutable = true, masterOnly = true, description = {
"对于自动分区表,防止用户意外创建大量分区,每个OLAP表允许的分区数量为`max_auto_partition_num`。默认2000。",
"For auto-partitioned tables to prevent users from accidentally creating a large number of partitions, "
+ "the number of partitions allowed per OLAP table is `max_auto_partition_num`. Default 2000."
})
public static int max_auto_partition_num = 2000;
}

View File

@ -25,7 +25,6 @@ import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.thrift.TStringLiteral;
import com.github.javaparser.quality.Preconditions;
import com.google.common.collect.Maps;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -114,20 +113,9 @@ public class PartitionExprUtil {
return null;
}
// In one calling, because we have partition values filter, the same partition
// value won't make duplicate AddPartitionClause.
// But if there's same partition values in two calling of this. we may have the
// different partition name because we have timestamp suffix here.
// Should check existence of partitions in this table. so need at least readlock
// first.
// @return <newName, newPartitionClause>
// @return existPartitionIds will save exist partition's id.
public static Map<String, AddPartitionClause> getNonExistPartitionAddClause(OlapTable olapTable,
ArrayList<TStringLiteral> partitionValues, PartitionInfo partitionInfo, ArrayList<Long> existPartitionIds)
public static Map<String, AddPartitionClause> getAddPartitionClauseFromPartitionValues(OlapTable olapTable,
ArrayList<TStringLiteral> partitionValues, PartitionInfo partitionInfo)
throws AnalysisException {
Preconditions.checkArgument(!partitionInfo.isMultiColumnPartition(),
"now dont support multi key columns in auto-partition.");
Map<String, AddPartitionClause> result = Maps.newHashMap();
ArrayList<Expr> partitionExprs = partitionInfo.getPartitionExprs();
PartitionType partitionType = partitionInfo.getType();
@ -144,14 +132,6 @@ public class PartitionExprUtil {
continue;
}
filterPartitionValues.add(value);
// check if this key value has been covered by some partition.
Long id = partitionInfo.contains(partitionValue, partitionType);
if (id != null) { // found
existPartitionIds.add(id);
continue;
}
if (partitionType == PartitionType.RANGE) {
String beginTime = value;
DateLiteral beginDateTime = new DateLiteral(beginTime, partitionColumnType);
@ -167,24 +147,21 @@ public class PartitionExprUtil {
listValues.add(Collections.singletonList(lowerValue));
partitionKeyDesc = PartitionKeyDesc.createIn(
listValues);
// the partition's name can't contain some special characters. so some string
// values(like a*b and ab) will get same partition name. to distingush them, we
// have to add a timestamp.
partitionName += getFormatPartitionValue(lowerValue.getStringValue());
if (partitionColumnType.isStringType()) {
partitionName += "_" + System.currentTimeMillis();
}
} else {
throw new AnalysisException("auto-partition only support range and list partition");
throw new AnalysisException("now only support range and list partition");
}
Map<String, String> partitionProperties = Maps.newHashMap();
DistributionDesc distributionDesc = olapTable.getDefaultDistributionInfo().toDistributionDesc();
SinglePartitionDesc partitionDesc = new SinglePartitionDesc(true, partitionName,
SinglePartitionDesc singleRangePartitionDesc = new SinglePartitionDesc(true, partitionName,
partitionKeyDesc, partitionProperties);
AddPartitionClause addPartitionClause = new AddPartitionClause(partitionDesc,
AddPartitionClause addPartitionClause = new AddPartitionClause(singleRangePartitionDesc,
distributionDesc, partitionProperties, false);
result.put(partitionName, addPartitionClause);
}

View File

@ -322,29 +322,6 @@ public class Database extends MetaObject implements Writable, DatabaseIf<Table>
return Math.max(leftReplicaQuota, 0L);
}
public long getReplicaCountWithoutLock() {
readLock();
try {
long usedReplicaCount = 0;
for (Table table : this.idToTable.values()) {
if (table.getType() != TableType.OLAP) {
continue;
}
OlapTable olapTable = (OlapTable) table;
usedReplicaCount = usedReplicaCount + olapTable.getReplicaCount();
}
return usedReplicaCount;
} finally {
readUnlock();
}
}
public long getReplicaQuotaLeftWithoutLock() {
long leftReplicaQuota = replicaQuotaSize - getReplicaCountWithoutLock();
return Math.max(leftReplicaQuota, 0L);
}
public void checkDataSizeQuota() throws DdlException {
Pair<Double, String> quotaUnitPair = DebugUtil.getByteUint(dataQuotaBytes);
String readableQuota = DebugUtil.DECIMAL_FORMAT_SCALE_3.format(quotaUnitPair.first) + " "

View File

@ -2899,12 +2899,7 @@ public class Env {
}
public void addPartition(Database db, String tableName, AddPartitionClause addPartitionClause) throws DdlException {
getInternalCatalog().addPartition(db, tableName, addPartitionClause, false);
}
public void addPartitionSkipLock(Database db, OlapTable table, AddPartitionClause addPartitionClause)
throws DdlException {
getInternalCatalog().addPartition(db, table.getName(), addPartitionClause, true);
getInternalCatalog().addPartition(db, tableName, addPartitionClause);
}
public void addPartitionLike(Database db, String tableName, AddPartitionLikeClause addPartitionLikeClause)

View File

@ -965,10 +965,6 @@ public class OlapTable extends Table {
return partition;
}
public int getPartitionNum() {
return idToPartition.size();
}
// get all partitions except temp partitions
public Collection<Partition> getPartitions() {
return idToPartition.values();

View File

@ -19,7 +19,6 @@ package org.apache.doris.catalog;
import org.apache.doris.analysis.DateLiteral;
import org.apache.doris.analysis.Expr;
import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.analysis.MaxLiteral;
import org.apache.doris.analysis.PartitionDesc;
import org.apache.doris.analysis.PartitionValue;
@ -30,7 +29,6 @@ import org.apache.doris.common.FeMetaVersion;
import org.apache.doris.common.io.Text;
import org.apache.doris.common.io.Writable;
import org.apache.doris.thrift.TStorageMedium;
import org.apache.doris.thrift.TStringLiteral;
import org.apache.doris.thrift.TTabletType;
import com.google.common.base.Preconditions;
@ -151,99 +149,6 @@ public class PartitionInfo implements Writable {
}
}
// only for auto partition. now only support one column.
// @return: null for not contain. otherwise partition id.
public Long contains(TStringLiteral key, PartitionType partitionType) throws AnalysisException {
if (idToItem.isEmpty() && idToTempItem.isEmpty()) {
return null;
}
if (partitionType == PartitionType.LIST) {
PartitionValue keyValue = new PartitionValue(key.getValue());
PrimitiveType toType;
if (!idToItem.isEmpty()) {
PartitionItem aItem = idToItem.values().iterator().next();
toType = ((ListPartitionItem) aItem).getItems().get(0).getTypes().get(0);
} else {
PartitionItem aItem = idToTempItem.values().iterator().next();
toType = ((ListPartitionItem) aItem).getItems().get(0).getTypes().get(0);
}
LiteralExpr detectExpr = LiteralExpr.create(keyValue.getStringValue(), Type.fromPrimitiveType(toType));
for (Map.Entry<Long, PartitionItem> entry : idToItem.entrySet()) {
Long id = entry.getKey();
ListPartitionItem item = (ListPartitionItem) (entry.getValue()); // a item is a partiton
// in one list partition, there's maybe many acceptable value
for (PartitionKey keysInItem : item.getItems()) {
Preconditions.checkArgument(keysInItem.getKeys().size() == 1,
"only support 1 column in auto partition now");
if (detectExpr.compareLiteral(keysInItem.getKeys().get(0)) == 0) {
return id;
}
}
}
for (Map.Entry<Long, PartitionItem> entry : idToTempItem.entrySet()) {
Long id = entry.getKey();
ListPartitionItem item = (ListPartitionItem) (entry.getValue()); // a item is a partiton
// in one list partition, there's maybe many acceptable value
for (PartitionKey keysInItem : item.getItems()) {
Preconditions.checkArgument(keysInItem.getKeys().size() == 1,
"only support 1 column in auto partition now");
if (detectExpr.compareLiteral(keysInItem.getKeys().get(0)) == 0) {
return id;
}
}
}
} else if (partitionType == PartitionType.RANGE) {
PartitionValue keyValue = new PartitionValue(key.getValue());
PrimitiveType toType;
if (!idToItem.isEmpty()) {
PartitionItem aItem = idToItem.values().iterator().next();
toType = ((RangePartitionItem) aItem).getItems().lowerEndpoint().getTypes().get(0);
} else {
PartitionItem aItem = idToTempItem.values().iterator().next();
toType = ((RangePartitionItem) aItem).getItems().lowerEndpoint().getTypes().get(0);
}
LiteralExpr detectExpr = LiteralExpr.create(keyValue.getStringValue(), Type.fromPrimitiveType(toType));
for (Map.Entry<Long, PartitionItem> entry : idToItem.entrySet()) {
Long id = entry.getKey();
RangePartitionItem item = (RangePartitionItem) (entry.getValue());
// lower/upper for each columns
PartitionKey lower = item.getItems().lowerEndpoint();
PartitionKey upper = item.getItems().lowerEndpoint();
Preconditions.checkArgument(lower.getKeys().size() == 1 && upper.getKeys().size() == 1,
"only support 1 column in auto partition now");
LiteralExpr lowerKey = lower.getKeys().get(0);
LiteralExpr upperKey = lower.getKeys().get(0);
if (detectExpr.compareLiteral(lowerKey) >= 0
&& (detectExpr.compareLiteral(upperKey) < 0 || upperKey instanceof MaxLiteral)) {
return id;
}
}
for (Map.Entry<Long, PartitionItem> entry : idToTempItem.entrySet()) {
Long id = entry.getKey();
RangePartitionItem item = (RangePartitionItem) (entry.getValue());
// lower/upper for each columns
PartitionKey lower = item.getItems().lowerEndpoint();
PartitionKey upper = item.getItems().lowerEndpoint();
Preconditions.checkArgument(lower.getKeys().size() == 1 && upper.getKeys().size() == 1,
"only support 1 column in auto partition now");
LiteralExpr lowerKey = lower.getKeys().get(0);
LiteralExpr upperKey = lower.getKeys().get(0);
if (detectExpr.compareLiteral(lowerKey) >= 0
&& (detectExpr.compareLiteral(upperKey) < 0 || upperKey instanceof MaxLiteral)) {
return id;
}
}
} else {
throw new AnalysisException("Only support List/Range on checking partition's inclusion");
}
return null;
}
public PartitionItem handleNewSinglePartitionDesc(SinglePartitionDesc desc,
long partitionId, boolean isTemp) throws DdlException {
Preconditions.checkArgument(desc.isAnalyzed());

View File

@ -1370,7 +1370,7 @@ public class InternalCatalog implements CatalogIf<Database> {
} finally {
table.readUnlock();
}
addPartition(db, tableName, clause, false);
addPartition(db, tableName, clause);
} catch (UserException e) {
throw new DdlException("Failed to ADD PARTITION " + addPartitionLikeClause.getPartitionName()
@ -1378,10 +1378,7 @@ public class InternalCatalog implements CatalogIf<Database> {
}
}
// if skipLock = true. there's not any lock operation. In generally it means we
// have a relative process outside and under a same huge lock.
public void addPartition(Database db, String tableName, AddPartitionClause addPartitionClause, boolean skipLock)
throws DdlException {
public void addPartition(Database db, String tableName, AddPartitionClause addPartitionClause) throws DdlException {
SinglePartitionDesc singlePartitionDesc = addPartitionClause.getSingeRangePartitionDesc();
DistributionDesc distributionDesc = addPartitionClause.getDistributionDesc();
boolean isTempPartition = addPartitionClause.isTempPartition();
@ -1394,9 +1391,7 @@ public class InternalCatalog implements CatalogIf<Database> {
// check
OlapTable olapTable = db.getOlapTableOrDdlException(tableName);
if (!skipLock) {
olapTable.readLock();
}
olapTable.readLock();
try {
olapTable.checkNormalStateForAlter();
// check partition type
@ -1528,11 +1523,8 @@ public class InternalCatalog implements CatalogIf<Database> {
} catch (AnalysisException e) {
throw new DdlException(e.getMessage());
} finally {
if (!skipLock) {
olapTable.readUnlock();
}
olapTable.readUnlock();
}
// now we still hold the read lock.
Preconditions.checkNotNull(distributionInfo);
Preconditions.checkNotNull(olapTable);
@ -1546,7 +1538,7 @@ public class InternalCatalog implements CatalogIf<Database> {
long bucketNum = distributionInfo.getBucketNum();
long replicaNum = singlePartitionDesc.getReplicaAlloc().getTotalReplicaNum();
long totalReplicaNum = indexNum * bucketNum * replicaNum;
if (totalReplicaNum >= db.getReplicaQuotaLeftWithoutLock()) { // this may have a little risk
if (totalReplicaNum >= db.getReplicaQuotaLeftWithLock()) {
throw new DdlException("Database " + db.getFullName() + " table " + tableName + " add partition increasing "
+ totalReplicaNum + " of replica exceeds quota[" + db.getReplicaQuota() + "]");
}
@ -1574,12 +1566,9 @@ public class InternalCatalog implements CatalogIf<Database> {
olapTable.storeRowColumn(),
binlogConfig, dataProperty.isStorageMediumSpecified());
// check again.
// if we have lock outside, skip the check cuz the table wouldn'tbe delete.
if (!skipLock) {
olapTable = db.getOlapTableOrDdlException(tableName);
olapTable.writeLockOrDdlException();
}
// check again
olapTable = db.getOlapTableOrDdlException(tableName);
olapTable.writeLockOrDdlException();
try {
olapTable.checkNormalStateForAlter();
// check partition name
@ -1634,6 +1623,8 @@ public class InternalCatalog implements CatalogIf<Database> {
}
}
if (metaChanged) {
throw new DdlException("Table[" + tableName + "]'s meta has been changed. try again.");
}
@ -1672,9 +1663,7 @@ public class InternalCatalog implements CatalogIf<Database> {
LOG.info("succeed in creating partition[{}], temp: {}", partitionId, isTempPartition);
} finally {
if (!skipLock) {
olapTable.writeUnlock();
}
olapTable.writeUnlock();
}
} catch (DdlException e) {
for (Long tabletId : tabletIdSet) {

View File

@ -226,7 +226,6 @@ import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Streams;
import org.apache.commons.collections.CollectionUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -3256,7 +3255,8 @@ public class FrontendServiceImpl implements FrontendService.Iface {
return result;
}
// extract request's partitions
OlapTable olapTable = (OlapTable) table;
PartitionInfo partitionInfo = olapTable.getPartitionInfo();
ArrayList<TStringLiteral> partitionValues = new ArrayList<TStringLiteral>();
for (int i = 0; i < request.partitionValues.size(); i++) {
if (request.partitionValues.get(i).size() != 1) {
@ -3267,71 +3267,34 @@ public class FrontendServiceImpl implements FrontendService.Iface {
}
partitionValues.add(request.partitionValues.get(i).get(0));
}
// get the table and its partitions.
OlapTable olapTable = (OlapTable) table;
PartitionInfo partitionInfo = olapTable.getPartitionInfo();
// generate the partitions from value.
Map<String, AddPartitionClause> addPartitionClauseMap; // name to partition. each is one partition.
ArrayList<Long> existPartitionIds = Lists.newArrayList();
Map<String, AddPartitionClause> addPartitionClauseMap;
try {
// Lock from here
olapTable.writeLockOrDdlException();
// won't get duplicate values. If exist, the origin partition will save id in
// existPartitionIds, no go to return ClauseMap
addPartitionClauseMap = PartitionExprUtil.getNonExistPartitionAddClause(olapTable,
partitionValues, partitionInfo, existPartitionIds);
} catch (DdlException ddlEx) {
errorStatus.setErrorMsgs(Lists.newArrayList(ddlEx.getMessage()));
result.setStatus(errorStatus);
return result;
addPartitionClauseMap = PartitionExprUtil.getAddPartitionClauseFromPartitionValues(olapTable,
partitionValues, partitionInfo);
} catch (AnalysisException ex) {
olapTable.writeUnlock();
errorStatus.setErrorMsgs(Lists.newArrayList(ex.getMessage()));
result.setStatus(errorStatus);
return result;
}
// check partition's number limit.
int partitionNum = olapTable.getPartitionNum() + addPartitionClauseMap.size();
if (partitionNum > Config.max_auto_partition_num) {
olapTable.writeUnlock();
String errorMessage = String.format(
"create partition failed. partition numbers %d will exceed limit variable max_auto_partition_num%d",
partitionNum, Config.max_auto_partition_num);
LOG.warn(errorMessage);
errorStatus.setErrorMsgs(Lists.newArrayList(errorMessage));
result.setStatus(errorStatus);
return result;
}
// add partitions to table. will write metadata.
try {
for (AddPartitionClause addPartitionClause : addPartitionClauseMap.values()) {
Env.getCurrentEnv().addPartitionSkipLock(db, olapTable, addPartitionClause);
for (AddPartitionClause addPartitionClause : addPartitionClauseMap.values()) {
try {
// here maybe check and limit created partitions num
Env.getCurrentEnv().addPartition(db, olapTable.getName(), addPartitionClause);
} catch (DdlException e) {
LOG.warn(e);
errorStatus.setErrorMsgs(
Lists.newArrayList(String.format("create partition failed. error:%s", e.getMessage())));
result.setStatus(errorStatus);
return result;
}
} catch (DdlException e) {
LOG.warn(e);
errorStatus.setErrorMsgs(
Lists.newArrayList(String.format("create partition failed. error:%s", e.getMessage())));
result.setStatus(errorStatus);
return result;
} finally {
// read/write metadata finished. free lock.
olapTable.writeUnlock();
}
// build partition & tablets
List<TOlapTablePartition> partitions = Lists.newArrayList();
List<TTabletLocation> tablets = Lists.newArrayList();
// two part: we create + we found others create(before we try to create and after we found loss in BE)
List<Partition> returnPartitions = Streams
.concat(existPartitionIds.stream().map(id -> olapTable.getPartition(id)),
addPartitionClauseMap.keySet().stream().map(str -> olapTable.getPartition(str)))
.collect(Collectors.toList());
for (Partition partition : returnPartitions) {
for (String partitionName : addPartitionClauseMap.keySet()) {
Partition partition = table.getPartition(partitionName);
TOlapTablePartition tPartition = new TOlapTablePartition();
tPartition.setId(partition.getId());
int partColNum = partitionInfo.getPartitionColumns().size();