This shows you the differences between two versions of the page.
| Both sides previous revision Previous revision Next revision | Previous revision | ||
| gpfs:gpfs_healthcheck [2021/10/05 23:19] manu | gpfs:gpfs_healthcheck [2022/08/29 23:05] (current) manu | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| ====== GPFS health ====== | ====== GPFS health ====== | ||
| + | |||
| + | The mmhealth thresholds (also available via the GUI and REST API) monitor usage: | ||
| + | <cli prompt='$'> | ||
| + | $ mmhealth thresholds list | ||
| + | ### Threshold Rules ### | ||
| + | rule_name metric  error warn direction filterBy groupBy sensitivity | ||
| + | ---------------------------------------------------------------------------------------------------------------------------- | ||
| + | InodeCapUtil_Rule Fileset_inode 90.0 80.0  high gpfs_cluster_name,gpfs_fs_name,gpfs_fset_name 300 | ||
| + | DataCapUtil_Rule DataPool_capUtil 97.0 90.0  high gpfs_cluster_name,gpfs_fs_name,gpfs_diskpool_name 300 | ||
| + | .. | ||
| + | </cli> | ||
| List all events | List all events | ||
| Line 16: | Line 27: | ||
| Clear ALL messages on the Web (GUI) interface | Clear ALL messages on the Web (GUI) interface | ||
| <cli prompt='#'> | <cli prompt='#'> | ||
| - | [root@prscale-a-01 ~]# /usr/lpp/mmfs/gui/cli/lshealth --clear | + | [root@prscale-a-01 ~]# /usr/lpp/mmfs/gui/cli/lshealth --reset | 
| + | </cli> | ||
| + | |||
| + | Errors that have already been resolved may continue to be displayed in mmhealth and the GUI. | ||
| + | How to remove them (and the annoying TIPS messages): | ||
| + | <cli prompt='#'> | ||
| + | mmdsh -N <NODE or all> mmsysmonc clearDB | ||
| + | mmdsh -N <NODE or all> mmsysmoncontrol restart | ||
| + | mmhealth event hide <EventName>  | ||
| </cli> | </cli> | ||
| Line 146: | Line 165: | ||
| </cli> | </cli> | ||
| + | Check the protocol components | ||
| + | <cli prompt='#'> | ||
| + | [root@prscale-a-01 ~]# mmces state show -a -Y | ||
| + | mmces:stateShow:HEADER:version:reserved:reserved:NODE:AUTH:BLOCK:NETWORK:HDFS_NAMENODE:AUTH_OBJ:NFS:OBJ:SMB:CES: | ||
| + | mmces:stateShow:0:1:::prscale-a-01:DISABLED:DISABLED:HEALTHY:DISABLED:DISABLED:HEALTHY:DISABLED:HEALTHY:HEALTHY: | ||
| + | mmces:stateShow:0:1:::prscale-b-01:DISABLED:DISABLED:HEALTHY:DISABLED:DISABLED:HEALTHY:DISABLED:HEALTHY:HEALTHY: | ||
| + | </cli> | ||
| + | |||
| + | Check a particular event | ||
| + | <cli prompt='#'> | ||
| + | [root@prscale-a-02 ~]# mmhealth cluster show | ||
| + | |||
| + | Component  Total  Failed  Degraded  Healthy  Other | ||
| + | ------------------------------------------------------------------------------------- | ||
| + | NODE 3 0 0 0 3 | ||
| + | GPFS 3 0 0 0 3 | ||
| + | NETWORK  3  0 0 3 0 | ||
| + | FILESYSTEM  3 0 0 3 0 | ||
| + | DISK 31  0 0 31  0 | ||
| + | CES 2  0 0 2 0 | ||
| + | CESIP 1  0 0 1 0 | ||
| + | FILESYSMGR  1 0 0 1 0 | ||
| + | GUI 3  0 1 2 0 | ||
| + | PERFMON  3  0 0 3 0 | ||
| + | THRESHOLD  3  0 0 3 0 | ||
| + | </cli> | ||
| + | |||
| + | <code> | ||
| + | mmhealth cluster show [ NODE | GPFS | NETWORK [ UserDefinedSubComponent ] | ||
| + | | FILESYSTEM  [UserDefinedSubComponent ]| DISK [UserDefinedSubComponent ] | ||
| + | | CES |AUTH | AUTH_OBJ | BLOCK | CESNETWORK | NFS | OBJECT | SMB | ||
| + | | HADOOP |CLOUDGATEWAY | GUI | PERFMON | THRESHOLD | ||
| + | | AFM [UserDefinedSubComponent]  ] | ||
| + | [-Y] [--verbose] | ||
| + | </code> | ||
| + | |||
| + | <cli prompt='#'> | ||
| + | [root@prscale-a-02 ~]# mmhealth cluster show GUI | ||
| + | |||
| + | Component  Node  Status  Reasons | ||
| + | ------------------------------------------------------------------------------------------ | ||
| + | GUI prscale-q-b-01  HEALTHY  - | ||
| + | GUI prscale-a-02  HEALTHY  - | ||
| + | GUI prscale-a-01  DEGRADED  gui_refresh_task_failed | ||
| + | </cli> | ||
| + | |||
| + | <code> | ||
| + | [root@prscale-a-01 ~]# /usr/lpp/mmfs/gui/cli/runtask help --debug | ||
| + | [AFM_FILESET_STATE, AFM_NODE_MAPPING, ALTER_HOST_NAME, CALLBACK, CALLHOME, CALLHOME_STATUS, CAPACITY_LICENSE, CES_ADDRESS, CES_STATE, CES_SERVICE_STATE, CES_USER_AUTH_SERVICE, CLUSTER_CONFIG, CONNECTION_STATUS, DAEMON_CONFIGURATION, DF, DISK_USAGE, DISKS, FILESETS, FILESYSTEM_MOUNT, FILESYSTEMS, FILE_AUDIT_LOG_CONFIG, GUI_CONFIG_CHECK, GPFS_JOBS, DIGEST_NOTIFICATION_TASK, HEALTH_STATES, HEALTH_TRIGGERED, HOST_STATES, HOST_STATES_CLIENTS, INODES, KEYSTORE, LOG_REMOVER, MASTER_GUI_ELECTION, MOUNT_CONFIG, NFS_EXPORTS, NFS_EXPORTS_DEFAULTS, NFS_SERVICE, NODE_LICENSE, NODECLASS, OBJECT_STORAGE_POLICY, OS_DETECT, PM_MONITOR, PM_SENSORS, PM_TOPOLOGY, POLICIES, QUOTA, QUOTA_DEFAULTS, QUOTA_ID_RESOLVE, QUOTA_MAIL, RDMA_INTERFACES, REMOTE_CONFIG, REMOTE_CLUSTER, REMOTE_FILESETS, REMOTE_GPFS_CONFIG, REMOTE_HEALTH_STATES, SMB_GLOBALS, SMB_SHARES, SNAPSHOTS, SNAPSHOTS_FS_USAGE, SNAPSHOT_MANAGER, SQL_STATISTICS, STATE_MAIL, STORAGE_POOL, SYSTEMUTIL_DF, TCT_ACCOUNT, TCT_CLOUD_SERVICE, TCT_NODECLASS, THRESHOLDS, WATCHFOLDER, WATCHFOLDER_STATUS, TASK_CHAIN] | ||
| + | </code> | ||
| + | |||
| + | <cli prompt='#'> | ||
| + | [root@prscale-a-01 ~]# /usr/lpp/mmfs/gui/cli/runtask CLUSTER_CONFIG --debug | ||
| + | debug: locale=en_US | ||
| + | debug: Running 'mmsdrquery 'sdrq_cluster_info' all ' on node localhost | ||
| + | debug: Running 'mmsdrquery 'sdrq_nsd_info' all ' on node localhost | ||
| + | debug: Running 'mmlscluster -Y ' on node localhost | ||
| + | debug: Running 'mmsdrquery 'sdrq_node_info' all ' on node localhost | ||
| + | debug: Running 'mmlsnodeclass 'GUI_MGMT_SERVERS' -Y ' on node localhost | ||
| + | debug: Running 'mmlsnodeclass 'GUI_SERVERS' -Y ' on node localhost | ||
| + | EFSSG1000I The command completed successfully. | ||
| + | </cli> | ||
| + | <cli prompt='#'> | ||
| + | [root@prscale-a-01 ~]# mmhealth event show gui_refresh_task_failed | ||
| + | Event Name: gui_refresh_task_failed | ||
| + | Event ID: 998254 | ||
| + | Description:  One or more GUI refresh tasks failed. This could mean that data in the GUI is outdated. | ||
| + | Cause:  There can be several reasons. | ||
| + | User Action:  1.) Check if there is additional information available by executing '/usr/lpp/mmfs/gui/cli/lstasklog [taskname]'. 2.) Run the specified task manually on the CLI by executing '/usr/lpp/mmfs/gui/cli/runtask [taskname] --debug'. 3.) Check the GUI logs under /var/log/cnlog/mgtsrv. 4.) Contact IBM Support if this error persists or occurs more often. | ||
| + | Severity:  WARNING | ||
| + | State:  DEGRADED | ||
| + | [root@prscale-a-01 ~]# mmhealth event resolve 998254 | ||
| + | The specified event gui_refresh_task_failed is not manually resolvable. | ||
| + | </cli> | ||
| <code> | <code> | ||