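This script checks the percent usage of each filesystem against warning and critical thresholds. Exceptions to the default thresholds are listed in a configuration file, in which filesystem names may also contain wildcards.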
root@aixserver - /opt/freeware/lib/nagios/plugins > cat MY_checkfs.cfg
#fs_name:pct_warning:pct_critical
# You can add exceptions in this file, else the default is used
#Example:
#/db*:98:100 , 98 and 100 will be used for all filesystems /db*
#default:90:95
/tmp:80:95
/usr:96:98
/opt:95:98
/cdrom:100:100
/mnt:100:100
root@aixserver - /opt/freeware/lib/nagios/plugins > cat MY_checkfs.sh
#!/usr/bin/ksh93
#set -x
##################################################
#@(#) send an alert if filesystems usage is higher than threshold
##################################################
# The associated exception file is <script name>.cfg in
# /opt/freeware/lib/nagios/plugins (see configfile below).
# Please do not modify this script, edit the .cfg file instead.
# .cfg format, one entry per line, the first matching entry wins:
#   fs_name:pctwarning_threshold:pctcritical_threshold
# fs_name may be a shell pattern (for example /db*).
# The thresholds used when no entry matches are read from the line
#   #default:pctwarning_threshold:pctcritical_threshold
# version: 1.1 10-02-2014 Manu
##################################################

sn=$(basename "$0" | cut -d. -f1)
configfile=/opt/freeware/lib/nagios/plugins/$sn.cfg
logpath=/tmp
logname=$logpath/$sn.log
nb_file=25
nb_dir=15

STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

#------------------------------------------------
# is_uint ()
# Returns 0 when $1 is a non-empty string of digits, 1 otherwise.
#------------------------------------------------
is_uint() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

#------------------------------------------------
# load_default_thresholds ()
# Sets pctwarn_def / pctcrit_def from the "#default:w:c" line of
# $configfile. Falls back to 90 / 95 when the line (or the file) is
# missing or not numeric, so later integer tests never see an empty
# operand.
#------------------------------------------------
load_default_thresholds() {
  pctwarn_def=$(grep '^#default' "$configfile" 2>/dev/null | head -n 1 | cut -d: -f2)
  pctcrit_def=$(grep '^#default' "$configfile" 2>/dev/null | head -n 1 | cut -d: -f3)
  is_uint "$pctwarn_def" || pctwarn_def=90
  is_uint "$pctcrit_def" || pctcrit_def=95
}

#------------------------------------------------
# lookup_thresholds ()
# $1: mount point.
# Sets pctwarn / pctcrit to the thresholds of the first config entry
# whose pattern matches $1, or to the defaults when none matches.
# The match is done in-process with "case", so no script has to be
# generated in /tmp and sourced.
#------------------------------------------------
lookup_thresholds() {
  _fs=$1
  pctwarn=$pctwarn_def
  pctcrit=$pctcrit_def
  [ -r "$configfile" ] || return 0

  while IFS=: read -r _pat _warn _crit _rest; do
    # strip leading blanks, then skip empty lines and comments
    _pat=${_pat#"${_pat%%[! ]*}"}
    case "$_pat" in
      '' | '#'*) continue ;;
    esac
    # ignore entries whose thresholds are not plain integers
    is_uint "$_warn" || continue
    is_uint "$_crit" || continue
    # $_pat is deliberately unquoted: it must act as a shell pattern
    case "$_fs" in
      $_pat)
        pctwarn=$_warn
        pctcrit=$_crit
        break
        ;;
    esac
  done < "$configfile"
}

#------------------------------------------------
# check_fs ()
# Compares the %Used column of every local /dev/ filesystem reported
# by "df -k" with its thresholds.
# Sets OUTPUT (message) and EXIT_CODE (Nagios state).
#------------------------------------------------
check_fs() {
  EXIT_CODE=$STATE_OK
  MSG_WARN=""
  MSG_CRIT=""
  OUTPUT="All filesystems OK"

  # one "pct mountpoint" pair per line; header, remote (":") and /proc
  # entries are dropped
  usage_list=$(df -k | grep -Ev "Filesystem|:|/proc" | grep "/dev/" |
    awk '{print $4, $7}' | sed 's/%//')

  # a here-document keeps the loop in the current shell, so the
  # variables set inside it survive
  while read -r pct_fs fs; do
    is_uint "$pct_fs" || continue # skips "-" and empty input
    [ -n "$fs" ] || continue
    lookup_thresholds "$fs"
    if [ "$pct_fs" -gt "$pctwarn" ]; then
      if [ "$pct_fs" -gt "$pctcrit" ]; then
        MSG_CRIT="${MSG_CRIT:+$MSG_CRIT }$fs used $pct_fs"
        EXIT_CODE=$STATE_CRITICAL
      else
        MSG_WARN="${MSG_WARN:+$MSG_WARN }$fs used $pct_fs"
        if [ "$STATE_WARNING" -gt "$EXIT_CODE" ]; then
          EXIT_CODE=$STATE_WARNING
        fi
      fi
      #info_fs >> $logname 2>&1 &
    fi
  done <<EOF
$usage_list
EOF

  if [ -n "$MSG_CRIT" ]; then
    OUTPUT="DISK CRITICAL - $MSG_CRIT"
  fi
  if [ -n "$MSG_WARN" ]; then
    if [ -n "$MSG_CRIT" ]; then
      OUTPUT="$OUTPUT -- DISK WARNING - $MSG_WARN"
    else
      OUTPUT="DISK WARNING - $MSG_WARN"
    fi
  fi
}

#------------------------------------------------
# info_fs ()
# Lists the biggest files and directories of filesystem $fs.
# Only referenced from the commented-out debug call in check_fs.
#------------------------------------------------
info_fs() {
  echo "####### Here are the $nb_file biggest files from filesystem $fs"
  find "$fs" -xdev -type f -ls | sort -k 7nr | head -n "$nb_file"
  printf '\n%s\n' "####### Here are the $nb_dir biggest directories from filesystem $fs"
  du -ms "$fs"/* | sort -nr | head -n "$nb_dir"
}

load_default_thresholds #>> $logname 2>&1
check_fs                #>> $logname 2>&1
printf '%s\n' "$OUTPUT"
exit "$EXIT_CODE"
root@aixserver - /opt/freeware/lib/nagios/plugins > cat check_fs.cfg
#filesystem_name:alert_w(%):alert_c(%)
/tmp:80:95
/usr:96:98
/cdrom:100:100
/mnt:100:100
/export/aix6100-09:99:100
/export/aix7100-03:99:100
/export/aix7200-02:99:100
/export/mksysb:95:99
/export/vios226:99:100
/export/vios310:99:100
/export/software:99:100
root@aixserver - /opt/freeware/lib/nagios/plugins > cat check_fs.sh
#!/bin/sh
#set -x
##################################################
#@(#) check filesystems usage is higher than threshold
##################################################
# associate file is check_fs.cfg (exceptions)
# please do not modify this script use check_fs.cfg
# check_fs.cfg format:
#   fs_name:pct_warn:pct_crit
# check also GPFS quotas if used
# version: 1.0 10-2020 Manu
##################################################

STATUS=0
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
MSG=""
configfile=/usr/local/nagios/libexec/check_fs.cfg
os=$(uname -s)
defaultwarn=92
defaultcrit=98

#------------------------------------------------
# is_uint ()
# Returns 0 when $1 is a non-empty string of digits, 1 otherwise.
#------------------------------------------------
is_uint() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

#------------------------------------------------
# raise_status ()
# $1: Nagios state. STATUS only ever moves upwards.
#------------------------------------------------
raise_status() {
  if [ "$STATUS" -lt "$1" ]; then
    STATUS=$1
  fi
}

#------------------------------------------------
# gpfs_log ()
# $1: message. Appended to $logname when that variable is set,
# written to stderr otherwise (this script does not define logname).
#------------------------------------------------
gpfs_log() {
  if [ -n "${logname:-}" ]; then
    echo "$1" >> "$logname"
  else
    echo "$1" >&2
  fi
}

#------------------------------------------------
# check_gpfs ()
# Reports users whose remaining GPFS quota is below quotaalert.
# Not called by default (see the commented-out call below).
#------------------------------------------------
check_gpfs() {
  # test gpfs quota if applicable
  if lsfs -a | sed '1d' | awk '{print $4}' | grep "mmfs" > /dev/null 2>&1; then
    quotaalert=2000000
    for i in sybase kplus; do
      quota_line=$(mmlsquota -u "$i" | grep clkpfs | head -n 1)
      used=$(echo "$quota_line" | awk '{print $3}')
      max=$(echo "$quota_line" | awk '{print $4}')
      # skip users for which no numeric quota figures were found
      is_uint "$used" || continue
      is_uint "$max" || continue
      diff=$((max - used))
      if [ "$diff" -lt "$quotaalert" ]; then
        gpfs_log "***** GPFS Quota-WARNING *****"
        gpfs_log " User $i has ounly $diff KB free quota "
        gpfs_log " Reduce used space or use mmedquota -u $i to modifie quota"
      fi
    done
  fi
}

#------------------------------------------------
# lookup_thresholds ()
# $1: mount point.
# Sets warn / crit from the config entry whose first field equals $1
# exactly, or from the defaults when there is no usable entry.
#------------------------------------------------
lookup_thresholds() {
  warn=$defaultwarn
  crit=$defaultcrit
  [ -r "$configfile" ] || return 0

  # exact string comparison: the mount point is not used as a regex
  cfg_line=$(awk -F: -v mnt="$1" '$1 == mnt { print $2, $3; exit }' "$configfile")
  cfg_warn=${cfg_line%% *}
  cfg_crit=${cfg_line#* }
  if is_uint "$cfg_warn" && is_uint "$cfg_crit"; then
    warn=$cfg_warn
    crit=$cfg_crit
  fi
}

#------------------------------------------------
# evaluate_usage ()
# $1: newline-separated "pct mountpoint" pairs.
# Updates MSG and STATUS. The list is fed through a here-document so
# the loop runs in the current shell; a "df | while read" pipeline
# would lose both variables in shells that fork the last stage.
#------------------------------------------------
evaluate_usage() {
  while read -r pct fs; do
    is_uint "$pct" || continue
    [ -n "$fs" ] || continue
    lookup_thresholds "$fs"
    if [ "$pct" -gt "$crit" ]; then
      MSG="${MSG:+$MSG }$fs:${pct}%"
      raise_status "$STATE_CRITICAL"
    elif [ "$pct" -gt "$warn" ]; then
      MSG="${MSG:+$MSG }$fs:${pct}%"
      raise_status "$STATE_WARNING"
    fi
  done <<EOF
$1
EOF
}

#------------------------------------------------
# check_aix_fs ()
# AIX "df -k": %Used is column 4, the mount point column 7.
#------------------------------------------------
check_aix_fs() {
  evaluate_usage "$(df -k | grep -Ev "Filesystem|:|/proc" | grep "/dev/" |
    awk '{print $4, $7}' | sed 's/%//')"
}

#------------------------------------------------
# check_lnx_fs ()
# Linux: the last two columns are the usage and the mount point.
# -P keeps every filesystem on a single line even when the device
# name is long.
#------------------------------------------------
check_lnx_fs() {
  evaluate_usage "$(df -kP | grep -Ev "Filesystem|:|/proc" | grep "/dev/" |
    awk '{print $(NF-1), $NF}' | sed 's/%//')"
}

###########################################
case "$os" in
  Linux)
    check_lnx_fs
    ;;
  AIX)
    check_aix_fs
    ;;
  *)
    MSG="########## Unknown OS"
    STATUS=$STATE_UNKNOWN
    ;;
esac

#check_gpfs

case "$STATUS" in
  "$STATE_OK")
    MSG="OK"
    ;;
  "$STATE_WARNING")
    MSG="WARNING: $MSG"
    ;;
  "$STATE_CRITICAL")
    MSG="CRITICAL: $MSG"
    ;;
esac

printf '%s\n' "$MSG"
exit "$STATUS"
#!/bin/ksh93
#
# AIX check NTP for Nagios v1.0 06/2023
# EIF
#
# Uses "ntpq -p":
#   CRITICAL  the query is refused or no peer line is returned
#   OK        a peer is selected (line starting with "*"), or the
#             largest absolute peer offset is below MAXSEC
#   WARNING   no selected peer and offset >= MAXSEC
#   UNKNOWN   the offset column cannot be parsed
#
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

MAXSEC=30 # max delay allowed in seconds
# NOTE(review): ntpq -p is commonly documented as printing offsets in
# milliseconds while MAXSEC and the messages below say seconds -
# confirm the unit on this platform.

# Returns 0 when $1 is a non-empty string of digits, 1 otherwise.
is_uint() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

RAW=$(ntpq -p 2>&1)

# Peer lines only: empty lines dropped, leading blanks removed, then
# the two header lines skipped.
PEERS=$(printf '%s\n' "$RAW" | sed -e '/^ *$/d' -e 's/^ *//' | sed '1,2d')

if printf '%s\n' "$RAW" | grep refused > /dev/null 2>&1; then
  EXIT_CODE=$STATE_CRITICAL
  OUTPUT="Process: xntpd not running"
elif [ -z "$PEERS" ]; then
  EXIT_CODE=$STATE_CRITICAL
  OUTPUT="Process: xntpd not running"
else
  # Largest absolute offset, integer part. The offset is the
  # second-to-last column. "sort -n" is required: a lexical sort
  # would rank 5 above 100.
  OFFSET=$(printf '%s\n' "$PEERS" |
    awk 'NF >= 2 {print $(NF-1)}' |
    cut -d'.' -f1 |
    sed 's/^[+-]//' |
    sort -n |
    tail -n 1)

  if ! is_uint "$OFFSET"; then
    EXIT_CODE=$STATE_UNKNOWN
    OUTPUT="Process: xntpd offset could not be parsed from ntpq output"
  elif printf '%s\n' "$PEERS" | grep -q '^\*'; then
    EXIT_CODE=$STATE_OK
    OUTPUT="Process: xntpd offset ${OFFSET}s synchronized OK"
  elif [ "$OFFSET" -lt "$MAXSEC" ]; then
    EXIT_CODE=$STATE_OK
    OUTPUT="Process: xntpd offset ${OFFSET}s synchronized OK"
  else
    EXIT_CODE=$STATE_WARNING
    OUTPUT="Process: xntpd offset ${OFFSET}s Not synchronized"
  fi
fi

printf '%s\n' "$OUTPUT"
exit "$EXIT_CODE"
As root:
#!/bin/ksh93
#
# AIX check NTP for Nagios v1.0 06/2023
#
# CRITICAL when "lssrc -ls xntpd" fails or when the reported leap
# indicator is anything other than 00, OK otherwise.
#
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# Prints the value that follows "Leap indicator:" in the long status,
# without blanks and without the parenthesised explanation.
leap_indicator() {
  /usr/bin/lssrc -ls xntpd |
    tr -s ' ' |
    tr ' ' ';' |
    grep -i 'Leap;indicator:' |
    cut -d':' -f2- |
    cut -d'(' -f1 |
    sed 's/;//g'
}

if /usr/bin/lssrc -ls xntpd > /dev/null 2>&1; then
  case "$(leap_indicator)" in
    00)
      EXIT_CODE=$STATE_OK
      OUTPUT="Process: xntpd synchronized OK"
      ;;
    *)
      EXIT_CODE=$STATE_CRITICAL
      OUTPUT="Process: xntpd Not synchronized"
      ;;
  esac
else
  EXIT_CODE=$STATE_CRITICAL
  OUTPUT="Process: xntpd not running"
fi

echo $OUTPUT
exit $EXIT_CODE
#!/usr/bin/ksh
# set -x
##################################################
#@(#) check mail server and mailq
# best practice is to stop sendmail service on AIX
# as it's not required to send email
# but if you leave it running, it happens that
# the service hangs, then no mails will be sent
##################################################
# v1.1 12-2020
##################################################
# Return codes:
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

DS='smtp.local.lu' # Add here the SMTP server to check
MAILDIR=/var/spool/mail
GNUDATE=/opt/freeware/bin/date # GNU date, needed for -d and +%s
# no warning before 3 days
mailwarning=3   # max days mail pending
mailcritical=10 # max days mail pending

# Returns 0 when $1 is a non-empty string of digits, 1 otherwise.
is_uint() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

# Prints the epoch time of the oldest "Date:" header found in the
# files under $MAILDIR (first five headers of each file), or nothing
# when no header can be converted. Everything is piped, so no
# temporary file is left behind or reused from an earlier run.
oldest_mail_epoch() {
  find "$MAILDIR" -type f |
    while read -r mailbox; do
      grep '^Date:' "$mailbox" | cut -d' ' -f2-5 | head -5
    done |
    while read -r stamp; do
      "$GNUDATE" -d "$stamp" "+%s" 2> /dev/null
    done |
    sort -n |
    head -n 1
}

# The smart host (DS line) must be configured in sendmail.cf
if ! grep "^DS${DS}" /etc/sendmail.cf > /dev/null 2>&1; then
  echo "CRITICAL: config file error /etc/sendmail.cf"
  exit $STATE_CRITICAL
fi

# Number of entries in the spool directory, "lost*" entries excluded
pending=$(ls "$MAILDIR" 2> /dev/null | grep -vc lost)
if [ "$pending" -eq 0 ]; then
  echo "OK"
  exit $STATE_OK
fi

oldest=$(oldest_mail_epoch)
now=$("$GNUDATE" "+%s")
if ! is_uint "$oldest" || ! is_uint "$now"; then
  echo "UNKNOWN: cannot determine the age of pending mail in $MAILDIR"
  exit $STATE_UNKNOWN
fi

days=$(((now - oldest) / 86400))

if [ "$days" -gt "$mailcritical" ]; then
  echo "CRITICAL: mail pending $days days"
  exit $STATE_CRITICAL
elif [ "$days" -gt "$mailwarning" ]; then
  echo "WARNING: mail pending $days days"
  exit $STATE_WARNING
fi

# pending mail is younger than the warning threshold
echo "OK"
exit $STATE_OK
#!/usr/bin/ksh
# check VCPU usage in percent
# according to a definition set by the app. owner
# the virtual cpu usage > 99% CPUs left on the system will result
# in an errorcode 2
# for nagios to be critical and alerted
#
# The percentage is: (6th field of the last sar line) * 100 divided by
# the number of processors listed by lsdev.

SAR="/usr/sbin/sar"
PROGNAME=$(basename "$0")

STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

LIST_WARNING_THRESHOLD=${LIST_WARNING_THRESHOLD:-"95"}
LIST_CRITICAL_THRESHOLD=${LIST_CRITICAL_THRESHOLD:-"99"}
INTERVAL_SEC=${INTERVAL_SEC:="1"}
NUM_REPORT=${NUM_REPORT:="3"}

# Prints the help text and exits with status 0.
print_usage() {
  echo ""
  echo "$PROGNAME - CPU Utilization check script for Nagios"
  echo ""
  echo "Usage: check_cpu_stats.sh -w -c (-i -n)"
  echo ""
  echo " -w Warning threshold in % for warn_user,warn_system,warn_iowait CPU (default : ${LIST_WARNING_THRESHOLD})"
  echo " Exit with WARNING status if cpu exceeds warn_n"
  echo " -c Critical threshold in % for crit_user,crit_system,crit_iowait CPU (default : ${LIST_CRITICAL_THRESHOLD})"
  echo " Exit with CRITICAL status if cpu exceeds crit_n"
  echo " -i Interval in seconds passed to sar (default : ${INTERVAL_SEC})"
  echo " -n Number of reports passed to sar (default : ${NUM_REPORT})"
  echo " -h Show this page"
  echo ""
  echo "Usage: $PROGNAME -w 95 -c 99"
  echo ""
  exit 0
}

# Prints an UNKNOWN message and exits with the UNKNOWN state.
unknown() {
  echo "UNKNOWN - $1"
  exit $STATE_UNKNOWN
}

# Returns 0 when $1 is an unsigned integer or decimal number.
is_number() {
  case "$1" in
    '' | . | *[!0-9.]* | *.*.*) return 1 ;;
  esac
  return 0
}

# Returns 0 when $1 > $2. awk is used because the usage is a decimal
# number, which the shell's integer -gt cannot compare.
exceeds() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit (a + 0 > b + 0) ? 0 : 1 }'
}

# Parse parameters (unknown arguments are ignored)
while [ $# -gt 0 ]; do
  case "$1" in
    -h | --help)
      print_usage
      ;;
    -w | --warning)
      [ $# -ge 2 ] || unknown "option $1 requires a value"
      shift
      LIST_WARNING_THRESHOLD=$1
      ;;
    -c | --critical)
      [ $# -ge 2 ] || unknown "option $1 requires a value"
      shift
      LIST_CRITICAL_THRESHOLD=$1
      ;;
    -i)
      [ $# -ge 2 ] || unknown "option $1 requires a value"
      shift
      INTERVAL_SEC=$1
      ;;
    -n)
      [ $# -ge 2 ] || unknown "option $1 requires a value"
      shift
      NUM_REPORT=$1
      ;;
  esac
  shift
done

is_number "$LIST_WARNING_THRESHOLD" || unknown "invalid warning threshold '$LIST_WARNING_THRESHOLD'"
is_number "$LIST_CRITICAL_THRESHOLD" || unknown "invalid critical threshold '$LIST_CRITICAL_THRESHOLD'"

NUM_VCPUS=$(lsdev -Cc processor | wc -l | awk '{print $1}')
PHYSC=$("$SAR" "$INTERVAL_SEC" "$NUM_REPORT" | tail -1 | awk '{print $6}')

# guard against a division by zero and against empty sar output
case "$NUM_VCPUS" in
  '' | 0 | *[!0-9]*) unknown "no processor reported by lsdev" ;;
esac
is_number "$PHYSC" || unknown "cannot read CPU consumption from $SAR"

PCTUSED=$(awk -v p="$PHYSC" -v n="$NUM_VCPUS" 'BEGIN { printf "%.2f", p * 100 / n }')

if exceeds "$PCTUSED" "$LIST_WARNING_THRESHOLD"; then
  if exceeds "$PCTUSED" "$LIST_CRITICAL_THRESHOLD"; then
    echo "CRITICAL - CPU usage at $PCTUSED%"
    exit $STATE_CRITICAL
  else
    echo "WARNING - CPU usage at $PCTUSED%"
    exit $STATE_WARNING
  fi
else
  echo "OK - CPU usage at $PCTUSED%"
  exit $STATE_OK
fi
# cat check_paths.sh
#!/bin/sh
#@(#) v1.0 Count number of paths per disk
# v1.1 add sudo linux
# v1.2 change for VSCSI
# v1.3 add verbose (-v), improvements linux
# On linux, add into /etc/sudoers the following lines for linux:
# nagios ALL=(ALL) NOPASSWD: /usr/sbin/multipath
# nrpe ALL=(ALL) NOPASSWD: /usr/sbin/multipath
#
# A disk with fewer than 50% of the expected paths is CRITICAL, any
# other count different from the expected one is WARNING.

# number of path per type of disk
pathviosfc=4
pathviosscsi=2
pathviossas=1
pathlparfc=8
pathlparscsi=2
pathlparsas=1

STATUS=0
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
MSG=""
verbose=""

# specified value for nb paths: taken from the environment (nbpath, or
# the historical name npath) when set, LPAR FC value otherwise
nbpath=${nbpath:-${npath:-$pathlparfc}}
default_nbpath=$nbpath

os=$(uname -s)

# Returns 0 when $1 is a non-empty string of digits, 1 otherwise.
is_uint() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

#---------------------
# record_path_status NAME ACTIVE EXPECTED
# Appends "NAME;ACTIVE/EXPECTED" to verbose, and to MSG when the count
# is not the expected one; raises STATUS accordingly.
# A non-numeric count or an expected value of 0 is handled as 0%.
#---------------------
record_path_status() {
  verbose="$verbose $1;$2/$3"

  if is_uint "$2" && is_uint "$3" && [ "$3" -gt 0 ]; then
    pathok_pct=$((100 * $2 / $3))
  else
    pathok_pct=0
  fi

  if [ "$pathok_pct" -lt 50 ]; then
    MSG="$MSG $1;$2/$3"
    if [ "$STATUS" -lt "$STATE_CRITICAL" ]; then
      STATUS=$STATE_CRITICAL
    fi
  elif [ "$pathok_pct" -ne 100 ]; then
    MSG="$MSG $1;$2/$3"
    if [ "$STATUS" -lt "$STATE_CRITICAL" ]; then
      STATUS=$STATE_WARNING
    fi
  fi
}

#---------------------
# count_linux_path
# Counts the "active" path lines of every mpath* map printed by
# "multipath -ll" and compares the count with nbpath.
#---------------------
count_linux_path() {
  if [ ! -x /usr/sbin/multipath ] || lsscsi -s 2> /dev/null | grep -q VMware; then
    MSG="OK: no multipathing"
    verbose="$MSG"
    STATUS=$STATE_OK
    return
  fi

  # The status tested is the one of timeout/sudo/multipath itself,
  # not the status of a filter placed behind it.
  if ! mp_out=$(timeout 30 sudo /usr/sbin/multipath -ll 2> /dev/null); then
    MSG="$MSG Maybe error on sudo config"
    verbose="$MSG"
    STATUS=$STATE_UNKNOWN
    return
  fi

  # One "map active_count" pair per map. policy= and size= lines are
  # not path lines and are skipped.
  map_counts=$(printf '%s\n' "$mp_out" | awk '
    /^mpath/ {
      if (name != "") print name, n
      name = $1
      n = 0
      next
    }
    /policy=/ || /size=/ { next }
    name != "" && /active/ { n++ }
    END { if (name != "") print name, n }
  ')

  # here-document: the loop stays in the current shell
  while read -r map active; do
    [ -n "$map" ] || continue
    record_path_status "$map" "$active" "$nbpath"
  done << EOF
$map_counts
EOF
}

#---------------------
# count_aix_path
# Compares the number of Enabled paths of every disk with the number
# expected for its type (MPIO/FC, SCSI, SAS) and for the kind of
# partition (VIOS when /usr/ios/cli/ioscli exists, LPAR otherwise).
#---------------------
count_aix_path() {
  # check not available disks
  nbdisknok=$(lsdev -Cc disk | grep -v Available | wc -l | awk '{print $1}')
  if [ "$nbdisknok" -ne 0 ]; then
    MSG="$MSG WARNING: $nbdisknok disks defined"
    verbose="$MSG"
    STATUS=$STATE_WARNING
  else
    STATUS=$STATE_OK
  fi

  if [ -e /usr/ios/cli/ioscli ]; then
    fc=$pathviosfc scsi=$pathviosscsi sas=$pathviossas
  else
    fc=$pathlparfc scsi=$pathlparscsi sas=$pathlparsas
  fi

  disk_list=$(lsdev -Cc disk)

  while read -r hdisk state descr; do
    [ -n "$hdisk" ] || continue
    descr_lc=$(printf '%s\n' "$descr" | tr 'A-Z' 'a-z')
    case "$descr_lc" in
      *mpio*) nbpath=$fc ;;
      *scsi*) nbpath=$scsi ;;
      *sas*) nbpath=$sas ;;
      # unknown type: use the default instead of whatever the
      # previous disk happened to set
      *) nbpath=$default_nbpath ;;
    esac
    pathok=$(lspath -l "$hdisk" | grep -c Enabled)
    record_path_status "$hdisk" "$pathok" "$nbpath"
  done << EOF
$disk_list
EOF
}

######################
case "$os" in
  Linux)
    count_linux_path
    ;;
  AIX)
    count_aix_path
    ;;
  *)
    echo "########## Unknown OS"
    STATUS=$STATE_UNKNOWN
    ;;
esac

if [ "$STATUS" -eq "$STATE_OK" ]; then
  echo "OK"
else
  echo "$MSG"
fi

# For debug
if [ "$1" = "-v" ]; then
  echo "$verbose" | tr ' ' '\n'
fi

exit "$STATUS"
Check Shared Ethernet Adapter status on VIOS
# cat check_sea.sh
#!/bin/ksh
# v1.1 eif check LACP
#
# Add sudoers:
#nagios ALL=(ALL) NOPASSWD: /usr/bin/entstat
#nrpe ALL=(ALL) NOPASSWD: /usr/bin/entstat
#
# For every Shared Ethernet Adapter that is not in Defined state:
#   - all "Physical Port Link Status" lines must report Up
#   - all "Physical Port Speed" lines must be identical
#   - all "Synchronization" lines must report IN_SYNC
# Any failed check gives WARNING; no SEA at all gives CRITICAL.

# Nagios return values
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4
EXITSTATUS=$STATE_UNKNOWN # Default Exit Code as UNKNOWN.

# Appends $1 to MSG and raises the exit status to WARNING. Issues
# accumulate, so a healthy adapter checked later cannot hide the
# problem of an earlier one.
add_issue() {
  MSG="${MSG:+$MSG; }$1"
  EXITSTATUS=$STATE_WARNING
}

# Collect the variables
hostname=$(hostname)
sea_list=$(lsdev -Cc adapter | grep Shared | grep -v Defined | awk '{print $1}')

if [ -z "$sea_list" ]; then
  MSG="No SEA available on $hostname"
  EXITSTATUS=$STATE_CRITICAL
else
  MSG=""
  EXITSTATUS=$STATE_OK

  # Check every available adapter
  for ent in $sea_list; do
    # entstat is run once per adapter and its output reused
    stats=$(sudo /usr/bin/entstat -d "$ent" 2>&1)

    # check physical ports / links
    count_all=$(printf '%s\n' "$stats" | grep -c "Physical Port Link Status")
    count_up=$(printf '%s\n' "$stats" | grep "Physical Port Link Status" | grep -c Up)
    if [ "$count_up" -ne "$count_all" ]; then
      add_issue "$ent: One or more Ethernet port down"
    fi

    # check physical ports Speed, must be the same on all ports
    nb_speed=$(printf '%s\n' "$stats" | grep "Physical Port Speed" | sort -u | wc -l | awk '{print $1}')
    if [ "$nb_speed" != "1" ]; then
      add_issue "$ent: Ethernet Ports bad speed"
    fi

    # check LACP
    count_all=$(printf '%s\n' "$stats" | grep -c "Synchronization")
    count_up=$(printf '%s\n' "$stats" | grep "Synchronization" | grep -c 'IN_SYNC')
    if [ "$count_up" -ne "$count_all" ]; then
      add_issue "$ent: LACP not synced"
    fi
  done

  if [ -z "$MSG" ]; then
    MSG="No ethernet port issue"
  fi
fi

echo "$MSG"
exit "$EXITSTATUS"