#!/usr/bin/perl
# check_ps -- Nagios Plugin for check specified process on the remote
#             host is alive or not, via standard SNMP.
# $Id$

use strict;
use Getopt::Long;
use lib "/usr/local/nagios/libexec"; # for use 'utils.pm'
use utils qw($TIMEOUT %ERRORS &print_revision &support &usage);
use SNMP;

###########################################################################
# SNMPWrapper class
#
# Thin wrapper around SNMP::Session that keeps the result of a gettable()
# call in a per-host cache file (serialized with Storable), so that several
# checks against the same host can share one SNMP query.
package SNMPWrapper;
use Fcntl ':flock';
use Storable qw(nstore store_fd nstore_fd fd_retrieve freeze thaw dclone);

# new($conf, \%session_options)
#   $conf: plugin configuration hash reference; this class reads the
#          'host', 'cachedir' and 'expire' keys.
#   \%session_options: passed as-is to SNMP::Session->new.
# Returns the blessed wrapper. The session is not validated here; a failed
# session surfaces later, when fetch() tries to use it.
sub new {
    my ($type, $conf, $opt) = @_;
    my $self = { 'conf' => $conf, 'option' => $opt };
    $self->{'snmp'} = SNMP::Session->new(%{ $self->{'option'} });
    return bless $self, $type;
}

# __die($msg)
#   Prints "<PROGNAME> UNKNOWN: <msg>" and exits with the UNKNOWN status
#   (3 when utils.pm does not provide it). Never returns.
sub __die {
    my ($msg) = @_;
    print "${main::PROGNAME} UNKNOWN: ${msg}\n";
    exit($utils::ERRORS{'UNKNOWN'} || 3);
}

# gettable(@opt)
#   Returns the cached table when getcache() yields a true value,
#   otherwise queries the host through fetch(@opt).
sub gettable {
    my ($self, @opt) = @_;
    return $self->fetch(@opt) unless $self->getcache();
    return $self->{'cached'};
}

# session()
#   Returns the underlying SNMP::Session object (whatever the constructor
#   returned).
sub session {
    my $self = shift;
    return $self->{'snmp'};
}

# __cache_file()
#   Returns the path of the cache file for the configured host.
sub __cache_file {
    my $self = shift;
    my $conf = $self->{'conf'};
    return $conf->{'cachedir'} . "/" . $conf->{'host'};
}

# __file_transaction(\%conf)
# Arguments:
#   $conf->{'file'}:  name of the file to open.
#   $conf->{'mode'}:  open mode for three-argument open (default: '<').
#   $conf->{'flag'}:  flag to pass to flock.
#   $conf->{'count'}: maximum number of locking attempts.
#   $conf->{'name'}:  the name of this session. It appears in error messages.
# Callback functions:
#   $conf->{'do'}:       code reference, called with the locked file handle
#                        once locking succeeded.
#   $conf->{'retry'}:    code reference, called after a failed locking
#                        attempt. When it returns TRUE the transaction ends
#                        immediately with that value; when it returns FALSE
#                        locking is tried again after one second.
#   $conf->{'on_error'}: optional code reference, called when Storable
#                        reports that the file is broken.
#
# returns:
#   the result of 'do' or 'retry'. Every failure path ends in __die(),
#   which exits the process with UNKNOWN.
sub __file_transaction {
    my ($conf) = @_;
    my $name      = $conf->{'name'} ||= '__file_transaction';
    my $mode      = $conf->{'mode'} || '<';
    my $file      = $conf->{'file'};
    my $remaining = $conf->{'count'};

    while (1) {
        open(my $fh, $mode, $file)
            or __die("ERROR: $name: Open failed (Can't open $file: $!)\n");

        if (flock($fh, $conf->{'flag'})) {
            # Run the callback under eval so that the handle is always
            # closed and Storable's croak() can be classified below.
            my $ret = eval { $conf->{'do'}->($fh) };
            my $err = $@;
            close($fh);
            return $ret unless $err;

            if ($err =~ /Magic number checking on storable file failed/) {
                $conf->{'on_error'}->() if $conf->{'on_error'};
                __die("ERROR: $name: "
                    . "fd_retrieve failed. It may broken file ($err)\n");
            }
            __die("ERROR: $name: Unknown error occured: $err\n");
        }

        # Locking failed. Save the reason before close() can clobber $!.
        my $lock_err = "Can't flock $file: $!";
        close($fh);

        # '<= 0' rather than '== 0' so that a missing or zero 'count'
        # cannot turn this into an endless loop.
        if (--$remaining <= 0) {
            __die("ERROR: $name: lock failed "
                . $conf->{'count'} . " times ($lock_err)\n");
        }

        my $ret = $conf->{'retry'}->();
        return $ret if $ret;
        sleep 1;
    }
}

# fetch(@opt)
#   Queries the host with SNMP::Session::gettable(@opt), remembers the
#   result in $self->{'cached'} and, when caching is enabled, writes it to
#   the cache file under an exclusive lock. While another process holds
#   the lock, the cache is re-read instead (see the 'retry' callback).
#   Returns whatever gettable() returned.
sub fetch {
    my ($self, @opt) = @_;
    my $conf = $self->{'conf'};

    # '-x 0' means "do not use the cache". The cache directory is not
    # created in that case, so no cache file may be touched here.
    unless ($conf->{'expire'} > 0) {
        return $self->{'cached'} = $self->{'snmp'}->gettable(@opt);
    }

    my $file = $self->__cache_file();
    return __file_transaction({
        'file'  => $file,
        'mode'  => '+>>',
        'flag'  => LOCK_EX|LOCK_NB,
        'count' => 5,
        'name'  => 'snmpget and write to cache',
        'do'    => sub {
            my ($fh) = @_;
            my $result = $self->{'cached'}
                       = $self->{'snmp'}->gettable(@opt);
            # store_fd() croaks on a non-reference, so only store
            # references, and truncate only once data is at hand.
            if (ref $result) {
                seek($fh, 0, 0);
                truncate($fh, 0);
                store_fd($result, $fh);
            }
            return $result;
        },
        'retry'    => sub { return $self->getcache() },
        'on_error' => sub { unlink $file },
    });
}

# getcache()
#   Returns 0 when the cache file is missing, empty, or older than
#   $conf->{'expire'} seconds. Otherwise reads it under a shared lock,
#   stores the data in $self->{'cached'} and returns it.
#   NOTE(review): fd_retrieve() deserializes whatever is in the cache
#   file; the cache directory should be writable by trusted users only.
sub getcache {
    my $self = shift;
    my $conf = $self->{'conf'};
    my $file = $self->__cache_file();

    my @st = stat($file);
    return 0 unless @st;
    my ($size, $mtime) = @st[7, 9];
    return 0 unless ($size > 0 and (time() - $mtime < $conf->{'expire'}));

    return __file_transaction({
        'file'  => $file,
        'mode'  => '<',
        'flag'  => LOCK_SH|LOCK_NB,
        'count' => 5,
        'name'  => 'read from cache',
        'do'    => sub {
            my ($fh) = @_;
            seek($fh, 0, 0);
            return($self->{'cached'} = fd_retrieve($fh));
        },
        'retry'    => sub { return undef },
        'on_error' => sub { unlink $file },
    });
}

###########################################################################
# main
package main;
our $PROGNAME;

#
# consts
#
($PROGNAME) = ($0 =~ qr{([^/]+)$});
my $REVISION = '1.02';
my $snmp_version = '2c';
my $hrSWRunTable = '.1.3.6.1.2.1.25.4.2';   # HOST-RESOURCES-MIB::hrSWRunTable
my $hrSWRunEntry = '.1.3.6.1.2.1.25.4.2.1'; # HOST-RESOURCES-MIB::hrSWRunEntry
my $def_cachedir = "/tmp/${main::PROGNAME}"; # default cache directory
my $def_expire   = 50;

#
# vars
#
my $conf = {};

#
# functions
#

# Prints the one-paragraph command line synopsis.
sub print_usage {
    print "Usage: ${main::PROGNAME} -H <host> [-C community] "
        . "-w <warning> -c <critical>\n";
    print "       [-p <process>] [-t <timeout>] [-x <expire>]\n";
    print "       [-d <cachedir>]\n";
}

# Prints revision, usage and the description of every option.
sub print_help {
    print_revision($main::PROGNAME, $REVISION);
    print_usage();
    print <<"_EOD_";

-H, --hostname=HOST
   Name or IP address of host to check
-C, --community=community
   SNMPv2c community (default: public)
-w, --warning=INTEGER
   Number of processes which a WARNING status will result
-c, --critical=INTEGER
   Number of processes which a CRITICAL status will result
-p, --process=STRING
   Name of process for watch (default: crond)
-t, --timeout=INTEGER
   Seconds before the plugin times out (default: ${TIMEOUT})
-x, --expire=INTEGER
   Keep SNMP result as cache while specified seconds
   (default: ${def_expire})
   '-x 0' for don't use cache.
   (It cause increasing network load. Be careful!)
-d, --dir=STRING
   Cache directory (default: $def_cachedir)

_EOD_
}

# exit_on($code, $format, @args)
#   Prints "<PROGNAME> <code>: " followed by the printf-formatted message
#   and exits with $ERRORS{$code}. Unknown codes are mapped to 'UNKNOWN'.
sub exit_on {
    my ($code, @msg) = @_;
    $code = 'UNKNOWN' if ! exists $ERRORS{$code};
    printf("%s %s: ", $main::PROGNAME, $code);
    my ($format, @args) = @msg;
    printf($format, @args) if defined $format;
    print "\n";
    exit $ERRORS{$code};
}

# parse_arg(\%conf)
#   Parses @ARGV and fills %$conf with the keys host, warning, critical,
#   community, process, timeout, expire and cachedir. Exits through
#   usage()/exit_on() on invalid input. Creates the cache directory when
#   caching is enabled and the directory does not exist yet.
#   Returns $conf.
sub parse_arg {
    my ($opt_V, $opt_h, $opt_w, $opt_c, $opt_H,
        $opt_C, $opt_p, $opt_t, $opt_x, $opt_d);
    my $conf = shift;

    Getopt::Long::Configure('bundling');
    GetOptions(
        "V|version"     => \$opt_V,
        "h|help"        => \$opt_h,
        "w|warning=s"   => \$opt_w,
        "c|critical=s"  => \$opt_c,
        "H|hostname=s"  => \$opt_H,
        "C|community=s" => \$opt_C,
        "p|process=s"   => \$opt_p,
        "t|timeout=s"   => \$opt_t,
        "x|expire=s"    => \$opt_x,
        "d|dir=s"       => \$opt_d,
    );

    if ($opt_V) {
        print_revision($main::PROGNAME, $REVISION);
        exit $ERRORS{'OK'};
    }
    if ($opt_h) {
        print_help();
        exit $ERRORS{'OK'};
    }

    # mandatory options
    # User input is handed to usage() as an argument, never interpolated
    # into the message, because usage() treats its first argument as a
    # printf format.
    ($opt_H) || usage("Host name/address not specified\n");
    $conf->{'host'} = $1 if ($opt_H =~ /^([-.A-Za-z0-9]+)$/);
    ($conf->{'host'}) || usage("Invalid host: %s\n", $opt_H);

    ($opt_w) || usage("Warning threshold not specified\n");
    $conf->{'warning'} = $1 if ($opt_w =~ /^(\d+)$/);
    ($conf->{'warning'})
        || usage("Invalid warning threshold: %s\n", $opt_w);

    ($opt_c) || usage("Critical threshold not specified\n");
    $conf->{'critical'} = $1 if ($opt_c =~ /^(\d+)$/);
    ($conf->{'critical'})
        || usage("Invalid critical threshold: %s\n", $opt_c);

    # optional options
    $conf->{'community'} = $opt_C || 'public';
    $conf->{'process'}   = $opt_p || 'crond';

    # NOTE(review): the value is passed to SNMP::Session as 'Timeout'.
    # With this factor one second on the command line becomes 100000
    # units; confirm against the SNMP module's unit for 'Timeout' that
    # this scaling is intended.
    $conf->{'timeout'} = $TIMEOUT * 100000;
    if ($opt_t) {
        usage("Invalid timeout specified\n") unless ($opt_t =~ /^(\d+)$/);
        $conf->{'timeout'} = $1 * 100000;
    }

    $conf->{'expire'} = $def_expire;
    if (defined $opt_x) {
        usage("Invalid expire specified\n") unless ($opt_x =~ /^(\d+)$/);
        $conf->{'expire'} = $1 + 0;
    }

    $conf->{'cachedir'} = $opt_d || $def_cachedir;
    if (($conf->{'expire'} > 0) and (! -d $conf->{'cachedir'})) {
        unless (mkdir($conf->{'cachedir'})) {
            exit_on('UNKNOWN', "Cannot create cachedir (mkdir: %s)", $!);
        }
    }

    return $conf;
}

#
# Initialize
#
$ENV{'PATH'}     = '';
$ENV{'BASH_ENV'} = '';
$ENV{'ENV'}      = '';

parse_arg($conf);

#
# Polling
#
my $snmp = SNMPWrapper->new($conf, {
    'DestHost'  => $conf->{'host'},
    'Community' => $conf->{'community'},
    'Version'   => $snmp_version,
    'Timeout'   => $conf->{'timeout'},
});

my $result = $snmp->gettable($hrSWRunTable, {
    'columns'   => ['hrSWRunPath'],
    'noindexes' => 1,
});

# An undefined or empty table is reported as CRITICAL together with the
# session's error string (empty when no session or no error is available).
unless (ref($result) eq 'HASH' and %$result) {
    my $session   = $snmp->session;
    my $error_str = $session ? $session->{'ErrorStr'} : undef;
    $error_str = '' unless defined $error_str;
    exit_on('CRITICAL', 'ERROR: %s, (%s) in %s.',
            $error_str, @{$conf}{'process', 'host'});
}

# Count the rows whose hrSWRunPath is exactly the requested process name.
my $procs = grep {
    defined $_->{'hrSWRunPath'}
        and $_->{'hrSWRunPath'} eq $conf->{'process'}
} values %$result;

if ($procs == 0) {
    exit_on('CRITICAL', "There is no process (%s) in %s.",
            @{$conf}{'process', 'host'});
}
elsif ($procs >= $conf->{'critical'}) {
    exit_on('CRITICAL', "There is %d process (>=%s), (%s) in %s.",
            $procs, @{$conf}{'critical', 'process', 'host'});
}
elsif ($procs >= $conf->{'warning'}) {
    exit_on('WARNING', "There is %d process (>=%s), (%s) in %s.",
            $procs, @{$conf}{'warning', 'process', 'host'});
}
exit_on('OK', "There is %d process (%s) in %s.",
        $procs, @{$conf}{'process', 'host'});

__END__

=head1 NAME

check_ps -- Nagios Plugin for check specified process on the remote host
is alive or not, via SNMP.

=head1 SYNOPSIS

  check_ps -H <host> [-C community] -w <warning> -c <critical>
           [-p <process>] [-t <timeout>] [-x <expire>] [-d <cachedir>]

=head1 DESCRIPTION

check_ps collects remote process information via SNMPv2
HOST-RESOURCES-MIB (OID: .1.3.6.1.2.1.25.4.2.1.4). If the number of the
specified running process is too high, or the process is not running, it
issues WARNING or CRITICAL states.

check_ps aims to replace check_procs. check_procs is more powerful, but
it needs the 'check_nspr' program to be installed in the remote systems.
It is painful to install such additional program in all remote systems,
especially when you have a lot of systems to watch.

check_ps only depends on SNMP standard MIBs. So you only need to make
sure that the remote system provides SNMP.

Additionally, check_ps stores the result of the SNMP query in a local
cache. It decreases the network load even when watching several
processes.

The options are as follows:

  -H, --hostname=HOST
      Name or IP address of host to check

  -C, --community=community
      SNMPv2c community

  -w, --warning=INTEGER
      Number of processes which a WARNING status will result

  -c, --critical=INTEGER
      Number of processes which a CRITICAL status will result

  -p, --process=STRING
      Name of process for watch. It attempts exactly match.

  -t, --timeout=INTEGER
      Seconds before the plugin times out.

  -x, --expire=INTEGER
      Keep SNMP result as cache while specified seconds.
      '-x 0' for don't use cache.
      (It cause increasing network load, especially you
      watching multiple remote processes. Be careful!)

  -d, --dir=STRING
      Cache directory.

=head1 HOW TO DEPLOY IN YOUR ENVIRONMENT

1. Make sure that net-snmpd is installed in remote system.

2. Determine the name of process what you want to watch.

e.g. If you want to watch sshd process, type as follow:

  % snmpwalk -v2c -c YOUR_SNMP_COMMUNITY_NAME HOST \
      .1.3.6.1.2.1.25.4.2.1.4 | grep sshd
  HOST-RESOURCES-MIB::hrSWRunPath.2056 = STRING: "/usr/sbin/sshd"

In this case, "/usr/sbin/sshd" is the strings for pass to -p option.

3. Dry run.

  # su - nagios
  # ./check_ps -C YOUR_SNMP_COMMUNITY_NAME -H HOST -w 2 -c 10 -p /usr/sbin/sshd
  check_ps OK: There is 1 process (/usr/sbin/sshd) in HOST.

4. Modify the nagios configuration file.

e.g.

  define command {
      command_name  check_sshd_proc
      command_line  $USER1$/check_ps -H $HOSTADDRESS$ \
                      -C YOUR_SNMP_COMMUNITY -w $ARG1$ -c $ARG2$ \
                      -p "/usr/sbin/sshd"
  }

Please remove backslashes on the end of line, and write it as one line.

Then, define services that use above command.

5. Reload nagios. Enjoy!

=head1 REQUIREMENT

=over 4

=item *

Perl 5

=item *

SNMP module version 5.0301 or later.

=item *

Nagios 3.0 or later.

=back

=head1 COPYRIGHT AND LICENSE

AS IS.