dss磁盘检测

This commit is contained in:
liuzhanfeng2 2024-04-22 18:42:21 +08:00
parent 0e0bd1c6e1
commit 328c3fe350
5 changed files with 89 additions and 27 deletions

View File

@ -1414,6 +1414,31 @@ void* KerberosStatusCheckMain(void* arg)
}
}
/**
 * @brief Read shared-disk volume group usage from the output of `dsscmd lsvg`.
 *
 * The command output is filtered with awk so that only the last field of output
 * lines 2 and 3 is kept. The first value read is reported as the data usage and
 * the second as the log usage. Both outputs are 0 when the command cannot be
 * started, when a line is missing, or when a line does not start with a number.
 *
 * @param vgdataPathUsage [out] first parsed value, truncated to an integer.
 * @param vglogPathUsage  [out] second parsed value, truncated to an integer.
 */
void CheckSharedDiskUsage(uint32 &vgdataPathUsage, uint32 &vglogPathUsage)
{
    /* Give the outputs a defined value on every path, including early return. */
    vgdataPathUsage = 0;
    vglogPathUsage = 0;

    FILE *fp = popen("dsscmd lsvg | awk 'NR==2 || NR==3 {print $NF}'", "r");
    if (fp == NULL) {
        write_runlog(ERROR, "Failed to exec command(dsscmd lsvg).\n");
        /* Nothing to read and nothing to close: using a NULL stream below would crash. */
        return;
    }

    char result[1024] = {0};
    double percent1 = 0.0;
    double percent2 = 0.0;

    /* fgets already reserves one byte for the terminator, so pass the full buffer size. */
    if (fgets(result, sizeof(result), fp) != NULL && sscanf(result, "%lf", &percent1) != 1) {
        percent1 = 0.0;
    }
    if (fgets(result, sizeof(result), fp) != NULL && sscanf(result, "%lf", &percent2) != 1) {
        percent2 = 0.0;
    }
    (void)pclose(fp);

    vgdataPathUsage = (uint)percent1;
    vglogPathUsage = (uint)percent2;
}
/**
* @brief Get DN node log path disk usage and datapath disk usage, send them to the CMS
*
@ -1428,6 +1453,14 @@ void CheckDiskForDNDataPath()
status.dataPathUsage = GetDiskUsageForPath(g_currentNode->datanode[ii].datanodeLocalDataPath);
status.readOnly = g_dnReadOnly[ii];
status.instanceType = INSTANCE_TYPE_DATANODE;
if (IsCusResExistLocal()) {
CheckSharedDiskUsage(status.vgdataPathUsage, status.vglogPathUsage);
write_runlog(DEBUG1, "vgdataPathUsage:%u, vglogPathUsage:%u.\n",
status.vgdataPathUsage, status.vglogPathUsage);
} else {
status.vgdataPathUsage = 0;
status.vglogPathUsage = 0;
}
write_runlog(DEBUG1, "[%s] msgType:%d, instanceId:%u, logPathUsage:%u, dataPathUsage:%u.\n",
__FUNCTION__, status.msgType, status.instanceId, status.logPathUsage, status.dataPathUsage);

View File

@ -124,6 +124,8 @@ static void InitDnReadOnlyInfo(DataNodeReadOnlyInfo *instance, uint32 i, uint32
{
instance->instanceId = g_node[i].datanode[j].datanodeId;
instance->dataDiskUsage = 0;
instance->vgdataDiskUsage = 0;
instance->vglogDiskUsage = 0;
instance->ddbValue = 0;
instance->node = g_node[i].node;
instance->finalState = false;
@ -219,7 +221,10 @@ static ReadOnlyFsmEvent GetReadOnlyFsmEvent(const DataNodeReadOnlyInfo *instance
{
if (instance->dataDiskUsage == 0) {
return DISK_USAGE_INIT;
} else if (instance->dataDiskUsage >= g_readOnlyThreshold) {
} else if (GetIsSharedStorageMode() && instance->vgdataDiskUsage == 0) {
return DISK_USAGE_INIT;
} else if (instance->dataDiskUsage >= g_readOnlyThreshold ||
instance->vgdataDiskUsage >= g_readOnlyThreshold || instance->vglogDiskUsage >= g_readOnlyThreshold) {
g_allHealth = false;
return DISK_USAGE_EXCEEDS_THRESHOLD;
} else {
@ -326,7 +331,7 @@ static void PreAlarmForNodeThreshold()
DynamicNodeReadOnlyInfo *curNodeInfo = &g_dynamicNodeReadOnlyInfo[i];
/* log usage */
if (curNodeInfo->logDiskUsage >= preAlarmThreshhold) {
write_runlog(LOG, "[%s] [logDisk usage] Pre Alarm threshold reached, node=%u, usage=%u.\n",
write_runlog(LOG, "[%s] [logDisk usage] Pre Alarm threshold reached, node=%u, log_disk_usage=%u.\n",
__FUNCTION__, g_node[i].node, curNodeInfo->logDiskUsage);
ReportLogStorageAlarm(ALM_AT_Fault, curNodeInfo->instanceName, i);
} else {
@ -346,9 +351,12 @@ static void PreAlarmForNodeThreshold()
/* DN */
for (uint32 j = 0; j < curNodeInfo->dataNodeCount; j++) {
DataNodeReadOnlyInfo *curDn = &curNodeInfo->dataNode[j];
if (curDn->dataDiskUsage >= preAlarmThreshhold) {
write_runlog(LOG, "[%s] [dataDisk usage] Pre Alarm threshold reached, instanceId=%u, usage=%u\n",
__FUNCTION__, curDn->instanceId, curDn->dataDiskUsage);
if (curDn->dataDiskUsage >= preAlarmThreshhold || curDn->vgdataDiskUsage >= preAlarmThreshhold ||
curDn->vglogDiskUsage >= preAlarmThreshhold) {
write_runlog(LOG, "[%s] [dataDisk usage] Pre Alarm threshold reached, instanceId=%u,"
"disk_usage=%u, shared_disk_usage_for_data=%u, shared_disk_usage_for_log=%u.\n",
__FUNCTION__, curDn->instanceId, curDn->dataDiskUsage,
curDn->vgdataDiskUsage, curDn->vglogDiskUsage);
ReportReadOnlyPreAlarm(ALM_AT_Fault, curDn->instanceName, curDn->instanceId);
} else {
ReportReadOnlyPreAlarm(ALM_AT_Resume, curDn->instanceName, curDn->instanceId);
@ -360,11 +368,10 @@ static void PreAlarmForNodeThreshold()
/*
 * Decide whether storage detection should continue, by combining several
 * boolean conditions computed below.
 *
 * NOTE(review): this text comes from a commit view that shows removed and added
 * lines together without markers. The function contains two consecutive return
 * statements, so as written only the first one can execute. The isNotShareDisk
 * flag and the first return look like the pre-change lines, and the second
 * return like the post-change line -- confirm against the actual file.
 */
static bool IsStorageDetectContinue()
{
/* the g_enableSetReadOnly parameter evaluates to true */
bool isEnable = IsBoolCmParamTrue(g_enableSetReadOnly);
/* the DN arbitrate mode is anything other than SHARE_DISK */
bool isNotShareDisk = g_dnArbitrateMode != SHARE_DISK;
/* the local role recorded in g_HA_status is CM_SERVER_PRIMARY */
bool isPrimary = g_HA_status->local_role == CM_SERVER_PRIMARY;
bool isNeedSyncDdb = IsNeedSyncDdb();
/* undocumentedVersion is 0; presumably non-zero means an upgrade is in progress -- TODO confirm */
bool isNotUpgrade = undocumentedVersion == 0;
return (isEnable && isPrimary && isNeedSyncDdb && isNotShareDisk && isNotUpgrade);
/* Unreachable as written: same conjunction without the share-disk condition. */
return (isEnable && isPrimary && isNeedSyncDdb && isNotUpgrade);
}
static void GetReadOnlyCmd(char *command, size_t commandLen, const DataNodeReadOnlyInfo *instance, bool readOnly)
@ -400,9 +407,12 @@ static bool IsPeerPrimaryReadOnly(DataNodeReadOnlyInfo *instance)
bool ReadOnlyActDoNoting(DataNodeReadOnlyInfo *instance)
{
if (instance->dataDiskUsage >= g_readOnlyThreshold) {
write_runlog(LOG, "[%s] instance %u is transaction read only, disk_usage:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage, g_readOnlyThreshold);
if (instance->dataDiskUsage >= g_readOnlyThreshold || instance->vgdataDiskUsage >= g_readOnlyThreshold ||
instance->vglogDiskUsage >= g_readOnlyThreshold) {
write_runlog(LOG, "[%s] instance %u is transaction read only, disk_usage:%u,"
"shared_disk_usage_for_data:%u, shared_disk_usage_for_log:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage,
instance->vgdataDiskUsage, instance->vglogDiskUsage, g_readOnlyThreshold);
}
instance->finalState = true;
return false;
@ -411,8 +421,9 @@ bool ReadOnlyActDoNoting(DataNodeReadOnlyInfo *instance)
bool ReadOnlyActSetDdbTo0(DataNodeReadOnlyInfo *instance)
{
write_runlog(LOG, "[%s] instance %u is not read only and ddb is 1, need set ddb to 0,"
"disk_usage:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage, g_readOnlyThreshold);
"disk_usage:%u, shared_disk_usage_for_data:%u, shared_disk_usage_for_log:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage,
instance->vgdataDiskUsage, instance->vglogDiskUsage, g_readOnlyThreshold);
instance->ddbValue = 0;
instance->finalState = false;
return true;
@ -421,8 +432,9 @@ bool ReadOnlyActSetDdbTo0(DataNodeReadOnlyInfo *instance)
bool ReadOnlyActSetDdbTo1(DataNodeReadOnlyInfo *instance)
{
write_runlog(LOG, "[%s] instance %u is not read only and ddb is 0, need set ddb to 1,"
" disk_usage:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage, g_readOnlyThreshold);
" disk_usage:%u, shared_disk_usage_for_data:%u, shared_disk_usage_for_log:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage,
instance->vgdataDiskUsage, instance->vglogDiskUsage, g_readOnlyThreshold);
instance->ddbValue = 1;
instance->finalState = false;
return true;
@ -431,8 +443,9 @@ bool ReadOnlyActSetDdbTo1(DataNodeReadOnlyInfo *instance)
bool ReadOnlyActSetReadOnlyOn(DataNodeReadOnlyInfo *instance)
{
write_runlog(LOG, "[%s] instance %u is not read only and ddb is 1, set default_transaction_read_only on,"
" disk_usage:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage, g_readOnlyThreshold);
" disk_usage:%u, shared_disk_usage_for_data:%u, shared_disk_usage_for_log:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage,
instance->vgdataDiskUsage, instance->vglogDiskUsage, g_readOnlyThreshold);
instance->finalState = false;
char command[CM_MAX_COMMAND_LEN] = {0};
@ -452,8 +465,9 @@ bool ReadOnlyActSetReadOnlyOn(DataNodeReadOnlyInfo *instance)
bool ReadOnlyActSetReadOnlyOff(DataNodeReadOnlyInfo *instance)
{
write_runlog(LOG, "[%s] instance %u is read only and ddb is 1, set default_transaction_read_only off,"
" disk_usage:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage, g_readOnlyThreshold);
" disk_usage:%u, shared_disk_usage_for_data:%u, shared_disk_usage_for_log:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage,
instance->vgdataDiskUsage, instance->vglogDiskUsage, g_readOnlyThreshold);
instance->finalState = false;
char command[CM_MAX_COMMAND_LEN] = {0};
@ -472,16 +486,20 @@ bool ReadOnlyActSetReadOnlyOff(DataNodeReadOnlyInfo *instance)
/**
 * @brief Log that an instance was set read only manually.
 *
 * Writes a single WARNING containing the instance id, its data disk usage, its
 * shared-disk data and log usage, and the read-only threshold, then clears
 * finalState. The event is logged once: the message below already carries every
 * field of the shorter message that used to precede it.
 *
 * @param instance instance whose usage values are logged; its finalState is set to false.
 * @return always false.
 */
bool ReadOnlyActRecordManuallySetReadOnly(DataNodeReadOnlyInfo *instance)
{
    write_runlog(WARNING, "[%s] instance %u set read only manually, disk_usage:%u,"
        "shared_disk_usage_for_data:%u, shared_disk_usage_for_log:%u, read_only_threshold:%u\n",
        __FUNCTION__, instance->instanceId, instance->dataDiskUsage,
        instance->vgdataDiskUsage, instance->vglogDiskUsage, g_readOnlyThreshold);
    instance->finalState = false;
    return false;
}
bool ReadOnlyActSetDdbTo1Conditional(DataNodeReadOnlyInfo *instance)
{
write_runlog(WARNING, "[%s] instance %u set read only manually, disk_usage:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage, g_readOnlyThreshold);
write_runlog(WARNING, "[%s] instance %u set read only manually, disk_usage:%u,"
"shared_disk_usage_for_data:%u, shared_disk_usage_for_log:%u, read_only_threshold:%u\n",
__FUNCTION__, instance->instanceId, instance->dataDiskUsage,
instance->vgdataDiskUsage, instance->vglogDiskUsage, g_readOnlyThreshold);
instance->finalState = false;
if (instance->instanceType == INSTANCE_TYPE_COORDINATE) {
@ -499,8 +517,10 @@ bool ReadOnlyActSetDdbTo1Conditional(DataNodeReadOnlyInfo *instance)
/**
 * @brief Log that an instance's disk usage is abnormal.
 *
 * Writes a single WARNING containing the instance id, its data disk usage, its
 * shared-disk data and log usage, and the read-only threshold, then clears
 * finalState. The event is logged once: the message below already carries every
 * field of the shorter message that used to precede it.
 *
 * @param instance instance whose usage values are logged; its finalState is set to false.
 * @return always false.
 */
bool ReadOnlyActRecordDiskUsageAbnormal(DataNodeReadOnlyInfo *instance)
{
    write_runlog(WARNING, "[%s] instance %u disk usage abnormal, disk_usage:%u,"
        "shared_disk_usage_for_data:%u, shared_disk_usage_for_log:%u, read_only_threshold:%u\n",
        __FUNCTION__, instance->instanceId, instance->dataDiskUsage,
        instance->vgdataDiskUsage, instance->vglogDiskUsage, g_readOnlyThreshold);
    instance->finalState = false;
    return false;
}

View File

@ -263,10 +263,13 @@ void process_agent_to_cm_heartbeat_msg(
void process_agent_to_cm_disk_usage_msg(const AgentToCmDiskUsageStatusReport *diskUsage)
{
const int maxUsage = 100;
if (diskUsage->dataPathUsage > maxUsage || diskUsage->logPathUsage > maxUsage) {
if (diskUsage->dataPathUsage > maxUsage || diskUsage->logPathUsage > maxUsage ||
diskUsage->vgdataPathUsage > maxUsage || diskUsage->vglogPathUsage > maxUsage) {
write_runlog(ERROR,
"the percentage of disk usage is illegal, it must be [0-100], dataDiskUsage=%u, logDiskUsage=%u.\n",
diskUsage->dataPathUsage, diskUsage->logPathUsage);
"the percentage of disk usage is illegal, it must be [0-100], dataDiskUsage=%u,"
"logDiskUsage=%u, vgdataDiskUsage=%u, vglogDiskUsage:%u.\n",
diskUsage->dataPathUsage, diskUsage->logPathUsage,
diskUsage->vgdataPathUsage, diskUsage->vglogPathUsage);
return;
}
@ -288,6 +291,8 @@ void process_agent_to_cm_disk_usage_msg(const AgentToCmDiskUsageStatusReport *di
DataNodeReadOnlyInfo *curDn = &curNodeInfo->dataNode[j];
if (diskUsage->instanceId == curDn->instanceId) {
curDn->dataDiskUsage = diskUsage->dataPathUsage;
curDn->vgdataDiskUsage = diskUsage->vgdataPathUsage;
curDn->vglogDiskUsage = diskUsage->vglogPathUsage;
curDn->readOnly = diskUsage->readOnly;
curDn->instanceType = INSTANCE_TYPE_DATANODE;
curNodeInfo->logDiskUsage = diskUsage->logPathUsage;

View File

@ -1239,6 +1239,8 @@ typedef struct {
uint32 instanceId;
uint32 dataPathUsage;
uint32 logPathUsage;
uint32 vgdataPathUsage;
uint32 vglogPathUsage;
int instanceType;
bool readOnly;
char reserved[16];

View File

@ -100,6 +100,8 @@ typedef struct DataNodeReadOnlyInfoT {
uint32 node;
uint32 instanceId;
uint32 dataDiskUsage;
uint32 vgdataDiskUsage;
uint32 vglogDiskUsage;
int instanceType;
char ddbValue;
bool readOnly;