User Tools

Site Tools


aix:scripts_category:nagios_aix_check

This is an old revision of the document!


Scripts for AIX monitoring with Nagios

Monitor filesystems

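This script checks the percent usage of each filesystem against a warning and a critical threshold. An exception list, which accepts wildcards in filesystem names, overrides the default thresholds for the filesystems it names.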

root@aixserver - /opt/freeware/lib/nagios/plugins > cat MY_checkfs.cfg
#fs_name:pct_warning:pct_critical
# You can add exception in this file, else the default is used
#Example:
#/db*:98:100   , 98 and 100 will be used for all filesystems /db*
#default:90:95
/tmp:80:95
/usr:96:98
/opt:95:98
/cdrom:100:100
/mnt:100:100
root@aixserver - /opt/freeware/lib/nagios/plugins > cat MY_checkfs.sh
#!/usr/bin/ksh93
#set -x
##################################################
#@(#) send an alert if filesystems usage is higher than threshold
##################################################
# The associated exception file is <script name>.cfg, located in
# /opt/freeware/lib/nagios/plugins.
# Please do not modify this script: edit the .cfg file instead.
# .cfg format:
# fs_name:pctwarning_threshold:pctcritical_threshold
# version: 1.1 10-02-2014 Manu
##################################################

dir=$(dirname "$0")
#. $dir/.env
# Script name without its extension; every file name below derives from it
sn=$(basename "$0" | cut -d. -f1)

configfile=/opt/freeware/lib/nagios/plugins/$sn.cfg
logpath=/tmp
logname=$logpath/$sn.log
# The process id keeps the scratch files of two concurrent runs apart
tmpcfg=$logpath/$sn.$$.txt
tmpscript=$logpath/$sn.$$.scr
# Number of files / directories listed by info_fs
nb_file=25
nb_dir=15
DATE=$(date +"%Y-%m-%d %H:%M")

# Nagios plugin return codes
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# Never leave scratch files behind, whatever the exit path
trap 'rm -f "$tmpcfg" "$tmpscript"' EXIT

#echo "$sn: $DATE" > $logname

#------------------------------------------------
# generate_case_file ()
#
# Translate the exception list of $configfile into the clauses of a
# shell "case" statement and store them in $tmpcfg, one per line:
#   <pattern>) pctwarn=<warn> pctcrit=<crit>;;
# Every line but the first starts with a literal "\n" and the last line
# is the catch-all "*)" clause, filled from the "#default:warn:crit"
# line of the config file.
# Globals: configfile (read), tmpcfg (file written),
#          pctwarn_def, pctcrit_def (set)
#------------------------------------------------
generate_case_file()
{
# Only the first "#default" line counts
pctwarn_def=$(grep "^#default" "$configfile" | head -1 | cut -d":" -f2)
pctcrit_def=$(grep "^#default" "$configfile" | head -1 | cut -d":" -f3)

# awk reads the filtered list straight from the pipe, so $tmpcfg is only
# ever opened for writing and cannot be truncated while it is being read.
# The default clause is emitted even when the exception list is empty.
grep -v "^ *#" "$configfile" | sed '/^$/d' |
  awk -F':' -v warn="$pctwarn_def" -v crit="$pctcrit_def" '
    { printf "%s%s) pctwarn=%s pctcrit=%s;;\n", (NR > 1 ? "\\n" : ""), $1, $2, $3 }
    END { printf "%s*) pctwarn=%s pctcrit=%s ;;\n", (NR > 0 ? "\\n" : ""), warn, crit }
  ' > "$tmpcfg"
}

#------------------------------------------------
# check_fs ()
#
# Compare the usage of every local filesystem reported by "df -k" with
# the thresholds stored in $tmpcfg (lines of the form
# "<pattern>) pctwarn=<n> pctcrit=<n>;;", optionally prefixed with a
# literal "\n"). The first clause whose pattern matches the mount point
# provides the thresholds.
# Globals: tmpcfg, tmpscript, STATE_* (read)
#          EXIT_CODE, OUTPUT, MSG_WARN, MSG_CRIT (set)
#          fs, pct_fs, pctwarn, pctcrit (set, last filesystem examined)
# Both scratch files are removed before returning.
#------------------------------------------------
check_fs()
{
EXIT_CODE=$STATE_OK
MSG_WARN=""
MSG_CRIT=""
OUTPUT="All filesystems OK"

# Keep /dev/ backed filesystems only; drop the header, remote mounts
# (they contain ":") and /proc. Each word is "<pct>%<mount point>".
for i in $(df -k | awk '!/Filesystem|:|\/proc/ && /\/dev\// {print $4 $7}')
do
  fs=${i#*%}
  pct_fs=${i%%\%*}

  # Look the thresholds up in the clause file instead of generating and
  # sourcing a script: nothing executable is ever written to /tmp.
  pctwarn=""
  pctcrit=""
  while IFS= read -r clause
  do
    clause=${clause#'\n'}
    pattern=${clause%%') pctwarn='*}
    values=${clause#*') pctwarn='}
    case $fs in
      $pattern)
        pctwarn=${values%%' '*}
        values=${values#*'pctcrit='}
        pctcrit=${values%%[ ;]*}
        break
        ;;
    esac
  done < "$tmpcfg"

  # A numeric test on an empty or non-numeric value only produces a
  # shell error; report the problem on stderr and move on.
  case "$pct_fs:$pctwarn:$pctcrit" in
    *[!0-9:]* | :* | *::* | *:)
      echo "check_fs: no usable usage or threshold for $fs" >&2
      continue
      ;;
  esac

  if [ "$pct_fs" -gt "$pctwarn" ]
  then
    if [ "$pct_fs" -gt "$pctcrit" ]
    then
      MSG_CRIT="$MSG_CRIT $fs used $pct_fs"
      EXIT_CODE=$STATE_CRITICAL
    else
      MSG_WARN="$MSG_WARN $fs used $pct_fs"
      # never downgrade a CRITICAL already recorded
      if [ "$STATE_WARNING" -gt "$EXIT_CODE" ]
      then
        EXIT_CODE=$STATE_WARNING
      fi
    fi
    #info_fs >> $logname 2>&1 &
  fi
done

if [ "$MSG_CRIT" != "" ]
then
  OUTPUT="DISK CRITICAL - $MSG_CRIT"
fi
if [ "$MSG_WARN" != "" ]
then
  if [ "$MSG_CRIT" != "" ]
  then
    OUTPUT="$OUTPUT -- DISK WARNING - $MSG_WARN"
  else
    OUTPUT="DISK WARNING - $MSG_WARN"
  fi
fi
rm -f "$tmpcfg" "$tmpscript" > /dev/null 2>&1
}

#------------------------------------------------
# info_fs ()
#
# Print the $nb_file biggest files and the $nb_dir biggest top-level
# directories of filesystem $fs on stdout.
# Globals: fs, nb_file, nb_dir (read)
#------------------------------------------------
info_fs()
{
echo "####### Here are the $nb_file biggest files from filesystem $fs"
# Field 7 of "find -ls" is the size. "-k 7nr" is the POSIX spelling of
# the obsolescent "+6nr", which newer sort implementations take for a
# file name.
find "$fs" -xdev -type f -ls | sort -k 7nr | head -n "$nb_file"
echo "\n####### Here are the $nb_dir biggest directories from filesystem $fs"
du -ms "$fs"/* | sort -nr | head -n "$nb_dir"
}

# Main: build the threshold clauses, evaluate every filesystem, then
# print the summary line and exit with the status check_fs computed.
generate_case_file #>> $logname 2>&1
check_fs #>> $logname 2>&1
# $OUTPUT is left unquoted: word splitting folds the doubled blanks that
# appear because MSG_WARN / MSG_CRIT start with a space.
echo $OUTPUT
exit $EXIT_CODE

check_mailq_aix

#!/usr/bin/ksh
# set -x
##################################################
#@(#) check mail server and mailq
# Best practice is to stop the sendmail service on AIX,
# as it is not required to send email.
# If it is left running, the service may hang, and
# then no mail will be sent any more.
##################################################
# v1.1 12-2020
##################################################

# Return codes:
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

DS='smtp.local.lu'             # Add here the SMTP server to check

MAILDIR=/var/spool/mail
GNUDATE=/opt/freeware/bin/date # date command that accepts -d "<text>"

# no warning before 3 days
mailwarning=3   # max days mail pending
mailcritical=10 # max days mail pending

# /etc/sendmail.cf must hold a "DS<server>" line for the server above
if grep "^DS${DS}" /etc/sendmail.cf > /dev/null 2>&1
then
  :
else
  echo "CRITICAL: config file error /etc/sendmail.cf"
  exit $STATE_CRITICAL
fi

# Nothing in the spool apart from lost+found: no mail is pending
if [ "$(ls "$MAILDIR" 2>/dev/null | grep -v lost | wc -l | awk '{print $1}')" = "0" ]
then
  echo "OK"
  exit $STATE_OK
fi

# Oldest "Date:" header (epoch seconds) among the first five of every
# mailbox. Everything flows through one pipeline, so no scratch file is
# needed and nothing stale can be picked up from a previous run.
# Dates the date command cannot parse are deliberately ignored.
oldest=$(
  find "$MAILDIR" -type f |
  while read -r mbox
  do
    grep '^Date:' "$mbox" | cut -d' ' -f2-5 | head -5
  done |
  while read -r stamp
  do
    $GNUDATE -d"$stamp" "+%s" 2>/dev/null
  done | sort -n | head -1
)

# Mailbox files exist but none holds a usable Date: header (for example
# empty mailboxes): nothing is pending
case "$oldest" in
  '' | *[!0-9]*)
    echo "OK"
    exit $STATE_OK
    ;;
esac

now=$($GNUDATE "+%s")
days=$(echo "scale=0;($now-$oldest)/3600/24" | bc)

case "$days" in
  '' | *[!0-9-]*)
    echo "UNKNOWN: cannot compute the age of the oldest mail"
    exit $STATE_UNKNOWN
    ;;
esac

if [ "$days" -gt "$mailcritical" ]
then
  echo "CRITICAL: mail pending $days days"
  exit $STATE_CRITICAL
fi
if [ "$days" -gt "$mailwarning" ]
then
  echo "WARNING: mail pending $days days"
  exit $STATE_WARNING
fi

# Mail is pending but younger than the warning threshold: always give
# Nagios a status line and an explicit exit code
echo "OK: mail pending $days days"
exit $STATE_OK

check_aix_cpu.sh

#!/usr/bin/ksh

# Nagios check: virtual CPU (VCPU) usage in percent.
# Following the definition set by the application owner, a virtual CPU
# usage above 99% makes the check end with error code 2, which Nagios
# treats as critical and alerts on.

# sar binary used to sample the CPU consumption
SAR="/usr/sbin/sar"

# Nagios plugin return codes
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# Thresholds (percent) and sar sampling parameters. A non-empty value
# already present in the environment is kept.
: "${LIST_WARNING_THRESHOLD:=95}"
: "${LIST_CRITICAL_THRESHOLD:=99}"
: "${INTERVAL_SEC:=1}"
: "${NUM_REPORT:=3}"

# Print the command-line help on stdout and exit with status 0.
# The program name falls back to the name the script was called with
# when PROGNAME is not set; RELEASE is shown only when it is set.
print_usage() {
    prog=${PROGNAME:-${0##*/}}
    echo ""
    echo "$prog${RELEASE:+ $RELEASE} - CPU Utilization check script for Nagios"
    echo ""
    echo "Usage: $prog [-w warn] [-c crit] [-i interval] [-n reports] [-h]"
    echo ""
    echo "  -w  Warning threshold in % of virtual CPU usage (default : ${LIST_WARNING_THRESHOLD})"
    echo "      Exit with WARNING status if the CPU usage exceeds it"
    echo "  -c  Critical threshold in % of virtual CPU usage (default : ${LIST_CRITICAL_THRESHOLD})"
    echo "      Exit with CRITICAL status if the CPU usage exceeds it"
    echo "  -i  Interval in seconds between two sar reports (default : ${INTERVAL_SEC})"
    echo "  -n  Number of sar reports (default : ${NUM_REPORT})"
    echo "  -h  Show this page"
    echo ""
    echo "Example: $prog -w 95 -c 99"
    echo ""
    exit 0
}

# Parse the command-line parameters given as arguments.
# Sets LIST_WARNING_THRESHOLD, LIST_CRITICAL_THRESHOLD, INTERVAL_SEC and
# NUM_REPORT. An option whose value is missing or is not a whole number
# ends the check with status UNKNOWN. Unrecognised words are skipped.
parse_args() {
    while [ $# -gt 0 ]; do
        case "$1" in
            -h | --help)
                print_usage
                exit $STATE_OK
                ;;
            -w | --warning | -c | --critical | -i | -n)
                case "$2" in
                    '' | *[!0-9]*)
                        echo "UNKNOWN - option $1 requires a numeric argument"
                        exit $STATE_UNKNOWN
                        ;;
                esac
                case "$1" in
                    -w | --warning)  LIST_WARNING_THRESHOLD=$2 ;;
                    -c | --critical) LIST_CRITICAL_THRESHOLD=$2 ;;
                    -i)              INTERVAL_SEC=$2 ;;
                    -n)              NUM_REPORT=$2 ;;
                esac
                # drop the option here, its value with the shift below
                shift
                ;;
        esac
        shift
    done
}

# Parse parameters
parse_args "$@"

# Number of processor devices listed by lsdev
NUM_VCPUS=$(lsdev -Cc processor|wc -l|awk '{print $1}')
# 6th field of the last line printed by sar
# NOTE(review): presumably the "physc" column of the average line -
# confirm against the sar output of the target LPAR
PHYSC=$($SAR ${INTERVAL_SEC} ${NUM_REPORT} | tail -1 | awk '{print $6}')

# Without a usable sample or processor count no percentage can be
# computed: report UNKNOWN rather than comparing garbage
case "$PHYSC" in
  '' | *[!0-9.]*)
    echo "UNKNOWN - cannot read the CPU consumption from $SAR"
    exit $STATE_UNKNOWN
    ;;
esac
case "$NUM_VCPUS" in
  '' | 0 | *[!0-9]*)
    echo "UNKNOWN - no processor reported by lsdev"
    exit $STATE_UNKNOWN
    ;;
esac

# awk prints a leading zero and does not rely on echo expanding "\n"
PCTUSED=$(awk -v physc="$PHYSC" -v vcpus="$NUM_VCPUS" \
  'BEGIN { printf "%.2f", physc * 100 / vcpus }')

# exceeds VALUE THRESHOLD: succeed when VALUE is strictly greater than
# THRESHOLD. The comparison is done by awk because PCTUSED carries
# decimals, which the integer test operator -gt does not handle.
exceeds() {
  awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
}

if exceeds "$PCTUSED" "$LIST_CRITICAL_THRESHOLD"
then
  echo "CRITICAL - CPU usage at $PCTUSED%"
  exit $STATE_CRITICAL
elif exceeds "$PCTUSED" "$LIST_WARNING_THRESHOLD"
then
  echo "WARNING - CPU usage at $PCTUSED%"
  exit $STATE_WARNING
else
  echo "OK - CPU usage at $PCTUSED%"
  exit $STATE_OK
fi

Check path disks

# cat check_paths.sh
#!/bin/sh
#@(#) v1.0 Count number of paths per disk
# Optional argument 1: expected number of paths per disk. When it is
# omitted a default depending on the type of server is used.

nbpathlpar=8    # default when the host is not a VIOS
nbpathvios=4    # default on a VIOS
nbpath=$1
STATUS=0

STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

MSG=""

# check if server is VIOS
if [ -e "/usr/ios/cli/ioscli" ]
then
  nbpath1=$nbpathvios
else
  nbpath1=$nbpathlpar
fi

# no value specified for nb paths: use the default
if [ -z "$nbpath" ]
then
  nbpath=$nbpath1
fi

# The value is later used as a divisor: accept positive integers only
case "$nbpath" in
  0 | *[!0-9]*)
    echo "UNKNOWN: invalid number of paths '$nbpath'"
    exit $STATE_UNKNOWN
    ;;
esac

os=$(uname -s)

# grepp PATTERN [FILE]
# Paragraph grep: print every blank-line separated paragraph of FILE
# (stdin when FILE is omitted) that matches PATTERN, a case-insensitive
# Perl regular expression.
# The pattern reaches perl through the environment, so characters such
# as "/" or quotes cannot alter the one-liner itself.
grepp() {
  if [ $# -eq 1 ]
  then
    GREPP_RE=$1 perl -00ne 'print if /$ENV{GREPP_RE}/i'
  else
    GREPP_RE=$1 perl -00ne 'print if /$ENV{GREPP_RE}/i' < "$2"
  fi
}

#---------------------
# count_linux_path
# Count the active paths of every "mpath*" device listed by
# "multipath -ll" and compare the count with $nbpath.
# A device that does not have exactly $nbpath active paths is appended
# to MSG as "<device>;<active>/<expected>"; STATUS becomes CRITICAL when
# at most 50% of the expected paths are active, WARNING otherwise (the
# same rule as count_aix_path).
# Globals: nbpath, STATE_* (read); STATUS, MSG (updated)
#          MULTIPATH_BIN (optional, multipath command to run)
#          verbose (optional, "true" prints the count of every device)
#---------------------
count_linux_path()
{
case "$nbpath" in
  '' | 0 | *[!0-9]*)
    MSG="$MSG UNKNOWN: invalid number of paths '$nbpath'"
    STATUS=$STATE_UNKNOWN
    return
    ;;
esac

# One pass over the listing: a line starting with "mpath" opens a new
# device, "policy=" and "size=" lines are skipped, and every remaining
# line containing "active" is one active path. Output: "<device>;<count>"
count_prog='
  /^mpath/               { if (disk != "") print disk ";" n; disk = $1; n = 0; next }
  /policy=|size=/        { next }
  disk != "" && /active/ { n++ }
  END                    { if (disk != "") print disk ";" n }'

for entry in $(${MULTIPATH_BIN:-/usr/sbin/multipath} -ll | awk "$count_prog")
do
  disk=${entry%%;*}
  pathok=${entry#*;}
  if [ "${verbose:-false}" = "true" ]
  then
    echo "$disk;$pathok;"
  fi
  if [ "$pathok" -eq "$nbpath" ]
  then
    continue
  fi
  MSG="$MSG $disk;$pathok/$nbpath"
  if [ $((100 * pathok / nbpath)) -le 50 ]
  then
    STATUS=$STATE_CRITICAL
  elif [ "$STATUS" -lt "$STATE_CRITICAL" ]
  then
    STATUS=$STATE_WARNING
  fi
done
}

#---------------------
# count_aix_path
# Check the number of Enabled paths of every MPIO disk against $nbpath.
# STATUS starts at WARNING when lsdev lists a disk that is not
# Available, at OK otherwise. A disk whose Enabled paths are not exactly
# 100% of $nbpath is appended to MSG as "<disk>;<enabled>/<expected>":
# at most 50% raises STATUS to CRITICAL, anything else to WARNING.
# Globals: nbpath, STATE_* (read); STATUS, MSG (updated)
#---------------------
count_aix_path()
{
if [ "$(lsdev -Cc disk | grep -v Available | wc -l | awk '{print $1}')" -ne 0 ]
then
  MSG="$MSG WARNING: disks defined"
  STATUS=$STATE_WARNING
else
  STATUS=$STATE_OK
fi

# nbpath is a divisor below: refuse anything but a positive integer
case "$nbpath" in
  '' | 0 | *[!0-9]*)
    MSG="$MSG UNKNOWN: invalid number of paths '$nbpath'"
    STATUS=$STATE_UNKNOWN
    return
    ;;
esac

for i in $(lsdev -Cc disk | grep -i mpio | awk '{print $1}')
do
  pathok=$(lspath -l "$i" | grep Enabled | wc -l | awk '{print $1}')
  # Integer percentage, rounded down. Shell arithmetic always yields a
  # number, whereas "bc | cut" returned an empty string below 1%.
  pathok_pct=$((100 * pathok / nbpath))
  if [ "$pathok_pct" -eq 100 ]
  then
    continue
  fi
  MSG="$MSG $i;$pathok/$nbpath"
  if [ "$pathok_pct" -le 50 ]
  then
    STATUS=$STATE_CRITICAL
  elif [ "$STATUS" -lt "$STATE_CRITICAL" ]
  then
    # never downgrade a CRITICAL raised by a previous disk
    STATUS=$STATE_WARNING
  fi
done
}


# Run the path count that fits the operating system found in $os
case "$os" in
  Linux)
    count_linux_path
    ;;
  AIX)
    count_aix_path
    ;;
  *)
    echo "########## Unknown OS"
    STATUS=$STATE_UNKNOWN
    ;;
esac

# Status line for Nagios: "OK", or the collected messages otherwise
if [ $STATUS -eq $STATE_OK ]
then
  echo "OK"
else
  echo "$MSG"
fi

exit $STATUS
aix/scripts_category/nagios_aix_check.1611046198.txt.gz · Last modified: 2021/01/19 09:49 by manu