source: gpfs_3.1_ker2.6.20/lpp/mmfs/samples/nfscluster/nfsmonitor @ 152

Last change on this file since 152 was 16, checked in by rock, 17 years ago
  • Property svn:executable set to *
File size: 24.0 KB
Line 
#!/bin/ksh
# @(#)23  1.12.1.10  src/avs/fs/mmfs/samples/nfscluster/nfsmonitor, mmfs, avs_rgpfs24, rgpfs24s011a 3/20/07 16:46:39
#

# HA-NFS monitoring
# Usage: nfsmonitor [-s|start|-e|stop|-r|restart|-q|status]

# Monitor levels
LEVEL0=0 # NO_MONITORING
LEVEL1=1 # ALERT_ONLY
LEVEL2=2 # RESTART (not applicable for all services)
LEVEL3=3 # FAILOVER

# Per-service settings; a value already present in the environment wins,
# otherwise the default given here is assigned.
: ${MONITOR_INTERVAL=15}
: ${MONITOR_NETWORK=3}
: ${MONITOR_PORTMAP=3}
: ${MONITOR_NFSD=3}
: ${MONITOR_MOUNTD=3}
: ${MONITOR_STATD=3}
: ${MONITOR_SSHD=3}

# All times below are in seconds
NFS_RPC_ACT_SAMPLE_INTERVAL=3
RESTART_TIMEOUT=7

# Internal flags
NUMBER_OF_RESTARTS=3

# Functions return values (used only within this file)
SERVICE_RUNNING=1
SERVICE_NOT_RUNNING=2
SERVICE_UNKNOWN=3

# The shared helper library is mandatory; without it report the problem and stop.
[ -f /var/mmfs/etc/nfsfuncs ] || {
    echo "$0: Can't find NFS functions in /var/mmfs/etc"
    exit 0
}
. /var/mmfs/etc/nfsfuncs

# Report that a monitored service is inactive: log the message through msg
# and, if the customer-provided script /var/mmfs/etc/alert exists, pass the
# same message to it.
# Usage: alert <service> <action message> [comment words...]
#   service         name of the service found inactive
#   action message  text describing what the monitor does about it
#   comment words   optional; appended to the message in parentheses
# The alert script is called with the message and $GPFS_IP.
alert() {
    service=$1
    actionmsg=$2
    shift 2
    # Variables in these functions are global, so clear the comment first;
    # otherwise the comment of an earlier alert would be reused.
    comment=""
    [ -n "$*" ] && comment="($*)"
    message="Monitoring detected $service is inactive, $actionmsg $comment"
    msg "$message"
    [ ! -e /var/mmfs/etc/alert ] && return
    /var/mmfs/etc/alert "$message" "$GPFS_IP"
}

# Raise an alert for <service> stating that, as configured, nothing is done.
# Usage: alertmsg <service> [comment words...]
alertmsg() {
    service=$1
    shift
    # Build the argument list for alert once, then hand it over unchanged.
    set -- $service "no action taken as configured" $*
    alert "$@"
}

# Raise an alert for <service> stating that a node failure (failover) is
# being initiated as configured.
# Usage: failovermsg <service> [comment words...]
failovermsg() {
    service=$1
    shift
    # Build the argument list for alert once, then hand it over unchanged.
    set -- $service "node failure initiated as configured" $*
    alert "$@"
}

# Kill (SIGKILL) every process currently running the NFS init script.
# Used when a start/restart attempt appears to hang.
# The init script is /etc/init.d/nfsserver when that file exists, otherwise
# /etc/init.d/nfs. This shell and its parent are excluded from the search.
nfsToolKill() {
    NFSTOOL=/etc/init.d/nfs
    [ -f /etc/init.d/nfsserver ] && NFSTOOL=/etc/init.d/nfsserver
    nfsToolBase=${NFSTOOL##*/}
    # Look the script up by full path first, then by its base name.
    nfsToolPid=$(pidof -o $$ -o $PPID -o %PPID -x $NFSTOOL || \
        pidof -o $$ -o $PPID -o %PPID -x $nfsToolBase)
    debugmsg2 "The pids of $NFSTOOL are $nfsToolPid"
    # Nothing found: nothing to kill.
    [[ -z $nfsToolPid ]] && return
    kill -9 $nfsToolPid
}

# Tell whether a service is configured to be restarted.
# Usage: checkRestart <SERVICE>   (suffix of the MONITOR_<SERVICE> variable)
# Returns: NUMBER_OF_RESTARTS when MONITOR_<SERVICE> is LEVEL2 or higher,
#          0 otherwise.
checkRestart() {
    service=$1
    # Indirect lookup of MONITOR_<SERVICE>.
    eval "service_level=\$MONITOR_$service"
    debugmsg2 "$service level is $service_level"
    if [ $service_level -ge $LEVEL2 ]; then
        return $NUMBER_OF_RESTARTS
    else
        return 0
    fi
}

# Translate the result of checkStatus for a service into one of this file's
# status codes.
# Usage: getStatus <service>
# Returns: SERVICE_RUNNING when checkStatus succeeds (exit status 0),
#          SERVICE_NOT_RUNNING otherwise.
getStatus() {
    service=$1
    checkStatus $service
    status=$?
    if [ $status -ne 0 ]; then
        debugmsg "$service is not running (status $status)"
        return $SERVICE_NOT_RUNNING
    fi
    debugmsg2 "$service is running"
    return $SERVICE_RUNNING
}

# Fail this node over: stop NFS, stop GPFS, then leave the script.
# Does not return to the caller.
invokeFailover() {
    debugmsg "Invoking failover..."
    # NOTE(review): only this (non-reboot) path creates the ha-nfs-reboot
    # marker file; confirm who consumes it.
    debuglog2 touch /tmp/ha-nfs-reboot 2>/dev/null
    # NFS is stopped gracefully first so that clients do not get ESTALE.
    stop.nfs
    # Stopping the GPFS daemon on this node is what triggers the failover.
    debugmsg "Stopping GPFS..."
    /etc/init.d/gpfs stop
    exit $?
}

# Fail this node over and reboot it: stop NFS, stop GPFS, reboot, then leave
# the script. Does not return to the caller.
invokeFailoverReboot() {
    debugmsg "Invoking failover with reboot..."
    # NFS is stopped gracefully first so that clients do not get ESTALE.
    stop.nfs
    # Stopping the GPFS daemon on this node is what triggers the failover.
    debugmsg "Stopping GPFS before reboot..."
    /etc/init.d/gpfs stop
    reboot
    exit $?
}

#################
# Monitoring nfsd
#################

# Sample the proc2 and proc3 lines of /proc/net/rpc/nfsd twice,
# NFS_RPC_ACT_SAMPLE_INTERVAL seconds apart, and compare the per-operation
# counters to detect nfsd rpc activity.
# Returns: SERVICE_RUNNING  if any counter increased between the samples
#          SERVICE_UNKNOWN  if the proc file is missing or no counter
#                           increased (no conclusion can be drawn)
detectNfsdActivity() {
    # Field names of the proc2/proc3 lines; the first two fields are the
    # line name and the operation count, the rest are per-operation counters.
    set -A v2procs NAME COUNT NULL GETATTR SETATTR ROOT LOOKUP READLINK \
                   READ WRCACHE WRITE CREATE REMOVE RENAME \
                   LINK SYMLINK MKDIR RMDIR READDIR FSSTAT
    set -A v3procs NAME COUNT NULL GETATTR SETATTR LOOKUP ACCESS READLINK \
                   READ WRITE CREATE MKDIR SYMLINK MKNOD REMOVE RMDIR RENAME \
                   LINK READDIR READDIRPLUS FSSTAT FSINFO PATHCONF COMMIT

    procfile=/proc/net/rpc/nfsd
    if [ ! -f $procfile ]; then
        msg "Monitoring could not find /proc/net/rpc/nfsd"
        return $SERVICE_UNKNOWN
    fi

    # First sample, pause, second sample.
    set -A v2procs1 $(grep -w proc2 $procfile)
    set -A v3procs1 $(grep -w proc3 $procfile)
    debugmsg2 "Sleeping for $NFS_RPC_ACT_SAMPLE_INTERVAL"
    sleep $NFS_RPC_ACT_SAMPLE_INTERVAL
    set -A v2procs2 $(grep -w proc2 $procfile)
    set -A v3procs2 $(grep -w proc3 $procfile)

    # Compare the v2 counters. A field that is absent from a sample (line
    # missing or shorter than expected) counts as 0 so the arithmetic below
    # never sees an empty operand.
    p=2 # skipping name and count fields
    n=${#v2procs[@]}
    while [ $p -lt $n ]; do
        activity=$((${v2procs2[$p]:-0}-${v2procs1[$p]:-0}))
        if [ $activity -gt 0 ]; then
            debugmsg2 "nfsd activity detected: $activity ${v2procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
            return $SERVICE_RUNNING
        fi
        p=$((p+1))
    done

    # Same comparison for the v3 counters.
    p=2 # skipping name and count fields
    n=${#v3procs[@]}
    while [ $p -lt $n ]; do
        activity=$((${v3procs2[$p]:-0}-${v3procs1[$p]:-0}))
        if [ $activity -gt 0 ]; then
            debugmsg2 "nfsd activity detected: $activity ${v3procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
            return $SERVICE_RUNNING
        fi
        p=$((p+1))
    done

    # no activity detected
    debugmsg2 "Could not detect nfsd activity using /proc"
    return $SERVICE_UNKNOWN
}

# Try to start nfsd as many times as checkRestart allows for NFSD (0 times
# when MONITOR_NFSD is below LEVEL2). After each attempt the function waits
# RESTART_TIMEOUT seconds and checks the nfsd status; on success it returns.
# After a failed attempt the start tool is killed (it may be hanging) and
# the next attempt is made.
# When no attempt succeeds: failover if MONITOR_NFSD is LEVEL3, otherwise
# only an alert.
# Notes carried over from the original author:
#  - portmap has to be running in order to restart nfsd.
#  - The /etc/init.d/nfs (RH) | nfsserver (SUSE) utility is used because it
#    stops the service before restarting it, which makes nfsd re-register
#    with portmap (e.g. when nfsd is restarted after portmap was restarted).
#    The utility also reloads /etc/exports.
startNfsd() {
    checkRestart NFSD # yields 0 when MONITOR_NFSD < LEVEL2
    numberOfRestarts=$?
    attemptNo=0
    debugmsg2 "About to start nfsd (up to $numberOfRestarts times)"
    while [ $numberOfRestarts -gt 0 ]; do
        attemptNo=$((attemptNo+1))
        debugmsg2 "In startnfsd, attempt number = $attemptNo"
        nfsService start
        # give the restart/start chance to complete
        sleep $RESTART_TIMEOUT
        getStatus nfsd
        status=$?
        debugmsg2 "The nfsd status after $attemptNo attempts to start is $status"
        [[ $status == $SERVICE_RUNNING ]] && return
        # start attempt has failed/hangs -> kill the process and retry
        debugmsg "start nfsd failed/hangs, about to kill the start process."
        nfsToolKill
        # Count down the loop variable itself; otherwise the loop never ends
        # while nfsd stays down.
        numberOfRestarts=$((numberOfRestarts-1))
    done

    # All start attempts have failed (or none was allowed).
    if [[ $MONITOR_NFSD -eq $LEVEL3 ]]; then
        failovermsg nfsd
        invokeFailover
    else
        alertmsg nfsd
    fi
}

# NULL RPC test: ask rpcinfo to call NFS version 3 on this host over UDP.
# Returns: SERVICE_RUNNING when the rpcinfo call succeeds,
#          SERVICE_NOT_RUNNING otherwise.
nfsdNullRpcTest() {
    hostname=$(hostname)
    if debuglog2 rpcinfo -u $hostname nfs 3; then
        debugmsg2 "Sent nullrpc to nfsd v3 using rpcinfo, the service is up and running"
        return $SERVICE_RUNNING
    fi
    msg "Sent nullrpc to nfsd v3 using rpcinfo, the service is not available (it maybe because portmap was started after nfsd, or that rpcinfo is not installed)"
    return $SERVICE_NOT_RUNNING
}

# Run the nfsd checks in order and act according to the monitoring level:
#   1. process status: if nfsd is not running, try to start it and stop here
#      (the deeper checks are left for the next monitoring round);
#   2. rpc activity in /proc: activity means nfsd is fine;
#   3. null rpc: only when step 2 was inconclusive; a failure triggers a
#      start attempt.
monitorNfsd() {
    ifGPFSDownExit $GPFS_IP

    # Step 1: is the nfsd process there at all?
    getStatus nfsd
    runStatus=$?

    case $runStatus in
        $SERVICE_NOT_RUNNING)
            startNfsd
            return ;;
        $SERVICE_RUNNING)
            ;;
        *)
            return 0 ;;
    esac

    # Step 2: look for rpc activity.
    debugmsg2 "Perform more tests, to make sure that nfsd is functioning"
    detectNfsdActivity
    [[ $? == $SERVICE_UNKNOWN ]] || return

    # Step 3: no conclusion from /proc, so probe with a null rpc.
    nfsdNullRpcTest
    case $? in
        $SERVICE_NOT_RUNNING)
            startNfsd ;;
        $SERVICE_RUNNING)
            return ;;
    esac
}

####################
# Monitoring mountd
####################
# Try to restart mountd as many times as checkRestart allows for MOUNTD
# (0 times when MONITOR_MOUNTD is below LEVEL2). Each attempt waits
# RESTART_TIMEOUT seconds and then checks /usr/sbin/rpc.mountd.
# When no attempt succeeds: failover if MONITOR_MOUNTD is LEVEL3, otherwise
# only an alert.
restartMountd() {
    checkRestart MOUNTD
    numberOfRestarts=$?
    attemptNo=0
    debugmsg2 "About to restart mountd (up to $numberOfRestarts times as configured, note: 0 means that the configuration level is lower than LEVEL2)"
    while [ $numberOfRestarts -gt 0 ]; do
        attemptNo=$((attemptNo+1))
        startMountd
        # PID of the most recent background job; presumably the one
        # startMountd launched (to be confirmed against nfsfuncs).
        restartPID=$!
        debugmsg2 "restartPID=$restartPID"
        # give the restart a chance to complete
        sleep $RESTART_TIMEOUT
        debugmsg2 "Try to restart mountd (attempt $attemptNo), checking the status:"
        getStatus /usr/sbin/rpc.mountd
        mountdStatus=$?
        if [[ $mountdStatus == $SERVICE_RUNNING ]]; then
            debugmsg2 "Succeeded to restart the mountd service at $attemptNo attempt"
            return
        fi
        # The attempt failed or hangs: kill whatever was doing the restart.
        debugmsg "Restarting mountd failed/hangs, about to kill the restart process."
        if [ ! -e /sbin/startproc ]; then
            nfsToolKill # FIX
        else
            debuglog kill -9 $restartPID
        fi
        numberOfRestarts=$((numberOfRestarts-1))
    done

    # Every allowed attempt has failed.
    msg "Failed to restart the mountd (tried $attemptNo times as configured)"
    if [[ $MONITOR_MOUNTD != $LEVEL3 ]]; then
        alertmsg mountd
    else
        failovermsg mountd
        invokeFailover
    fi
}


# Main entry for monitoring mountd: check /usr/sbin/rpc.mountd and, if it is
# not running, hand over to restartMountd, which acts according to the
# monitoring level.
monitorMountd() {
    ifGPFSDownExit $GPFS_IP
    getStatus /usr/sbin/rpc.mountd
    mountdStatus=$?
    [[ $mountdStatus == $SERVICE_RUNNING ]] && return $mountdStatus
    if [[ $mountdStatus == $SERVICE_NOT_RUNNING ]]; then
        restartMountd
    fi
}

########################
# Monitoring the network
########################

# Monitor the network. Tests performed:
#   TEST1: every interface used for NFS serving is listed by mmgetifconf and
#          passes checkLinkStatus;
#   TEST2: every NFS IP address of this node, and of the nodes reported by
#          getFailedNodes, is configured; missing addresses are brought up;
#   TEST3: the default gateway answers a ping.
# Failures of TEST1 and TEST3 are reported through nwFailoverCondition.
monitorNetwork() {
    ifGPFSDownExit $GPFS_IP

    # TEST1: make sure that all interfaces that are used for nfs serving are connected
    nfsIfs=$(getNfsIFs $GPFS_IP)
    if [[ -z $nfsIfs ]]; then
        msg "No configured NFS IP addresses detected on any of the node's interfaces"
        nwFailoverCondition "no configured nfs interfaces"
    else
        for eth in $nfsIfs; do
            tmp=$(mmgetifconf | grep -w $eth | awk '{print $1}')
            if [[ -z $tmp ]]; then
                nwFailoverCondition "interface is down"
            fi

            checkLinkStatus $eth || nwFailoverCondition "link is not connected"
        done
    fi

    # TEST2: check that all NFS IP addresses are enabled.
    # The address is matched as a fixed string and as a whole word, so that
    # e.g. 10.0.0.1 is not considered present because 10.0.0.10 is.
    nfsIPs=$(getNfsIPs $GPFS_IP)
    for ip in $nfsIPs; do
        mmgetifconf | grep -qwF "$ip" && continue
        debugmsg "monitor detected $ip is down, restarting"
        ifUp $ip
    done

    # Now check that all NFS IP addresses for failover nodes are enabled
    nfsIPs=
    for ip in $(getFailedNodes $GPFS_IP); do
        nfsIPs="$nfsIPs $(getNfsIPs $ip)"
    done
    for ip in $nfsIPs; do
        mmgetifconf | grep -qwF "$ip" && continue
        [ "$(IPaddr $ip monitor)" == "OK" ] && continue
        debugmsg "monitor detected $ip is down, restarting"
        debuglog IPaddr $ip start
    done

    # TEST3: ping the gateway
    pingDefaultGateway
}

# Print, one per line, the base interface of every NFS IP address configured
# for the given GPFS IP address (e.g. eth0 for an address held by eth0:1).
# Usage: getNfsIFs <gpfs ip>
# Addresses that are not assigned to any interface are reported and skipped.
getNfsIFs () {
    thisGpfsIP=$1
    eth=""
    nfsIPList=$(getNfsIPs $thisGpfsIP)
    # No NFS addresses at all means this node has no entry in nfs.nodes.
    [[ -z $nfsIPList ]] && {
        debugmsg "ALERT: No entry was found in nfs.nodes for this node (gpfs ip address:$thisGpfsIP)"
        nwFailoverCondition "no configured nfs interfaces"
    }
    debugmsg2 "The list of ips is $nfsIPList"
    for nfsIP in $nfsIPList; do
        # Drop the alias suffix to get the "original" interface.
        origEth=$(getEthInterface $nfsIP)
        origEth=${origEth%%:*}
        if [[ -n $origEth ]]; then
            echo $origEth
            debugmsg2 "the actual interface for $nfsIP is $origEth"
        else
            debugmsg "ALERT: The nfs ip address $nfsIP is not assigned an interface"
        fi
    done
}

# Print the interface that holds a given IP address.
# Usage: getEthInterface <ip>
# mmgetifconf is expected to print one "interface ip mask" line per
# interface. The interface of the last line whose second field equals <ip>
# is printed; an empty line is printed when no interface has the address.
# The output is filtered through a pipe, so no temporary file in /tmp and no
# extra file descriptor are left behind.
# NOTE: The original function was copied from Marc and was changed.
#       May require future integration.
getEthInterface() {
    eth=$(mmgetifconf | awk -v wanted="$1" '$2 == wanted { iface = $1 } END { print iface }')
    # eth may be empty if there is no interface associated with this ip address
    echo $eth
}

# Locate the default gateway (routes flagged UG in "route -n") and ping it.
# Nothing is done when no gateway is found or when the gateway address is
# one of this machine's own addresses as listed by mmgetifconf.
# On ping failure the problem is logged and nwFailoverCondition decides
# between alert and failover.
pingDefaultGateway() {
    gwIP=$(route -n | awk '/UG/ {print $2}')
    [[ -z $gwIP ]] && return

    # Make sure the local machine is not set as the default gateway.
    # The address column is compared as a whole, fixed string. This is done
    # in a pipe so that the early return leaves no temporary file or open
    # descriptor behind.
    if mmgetifconf | awk '{print $2}' | grep -qxF "$gwIP"; then
        return 0
    fi

    # try to ping the gateway
    ping -c 1 -w 5 $gwIP > /dev/null
    outPing=$?
    if [ $outPing -ne 0 ]; then
        msg "Failed to ping the gateway at $gwIP (err $outPing)"
        nwFailoverCondition "can't ping the gateway"
    else
        debugmsg2 "Succeeded to ping the gateway at $gwIP (ping returns $outPing)"
    fi
}

# React to a network problem according to MONITOR_NETWORK:
# LEVEL3 -> announce and invoke failover; any other level -> alert only.
# Usage: nwFailoverCondition <message>
nwFailoverCondition() {
    message=$1
    debugmsg2 nwFailoverCondition $message
    if [[ $MONITOR_NETWORK -ne $LEVEL3 ]]; then
        alertmsg network $message
        return
    fi
    failovermsg network $message
    invokeFailover
}


########################
# Monitoring portmap
########################
# Check that /sbin/portmap is up. When it is not: alert only if
# MONITOR_PORTMAP is LEVEL1, otherwise announce and invoke failover with
# reboot.
# TODO: we can test if this node is mounting anything, and if not
# we can restart portmap, and re-register the nfs processes with it.
# Currently, if this machine is mounting anything, lockd does not
# re-register with portmap.
monitorPortmap() {
    getStatus /sbin/portmap
    portmapStatus=$?
    [[ $portmapStatus == $SERVICE_RUNNING ]] && return $portmapStatus
    if [[ $portmapStatus == $SERVICE_NOT_RUNNING ]]; then
        if [[ $MONITOR_PORTMAP -eq $LEVEL1 ]]; then
            alertmsg portmap
        else
            failovermsg portmap
            invokeFailoverReboot
        fi
    fi
}

######################################
# Monitoring locking (lockd and statd)
######################################

# Main entry for monitoring locking (lockd and statd).
# Both services are governed by MONITOR_STATD:
#   lockd down -> failover at LEVEL3, otherwise alert;
#   statd down -> restartStatd at LEVEL2 or above, otherwise alert.
# statd is only looked at when /sbin/rpc.statd exists.
monitorLocking() {
    ifGPFSDownExit $GPFS_IP

    # lockd
    getStatus lockd
    if [[ $? == $SERVICE_NOT_RUNNING ]]; then
        if [[ $MONITOR_STATD -eq $LEVEL3 ]]; then
            failovermsg lockd
            invokeFailover
        else
            alertmsg lockd
        fi
    fi

    # statd
    [ -f /sbin/rpc.statd ] || return 0
    getStatus /sbin/rpc.statd
    if [[ $? == $SERVICE_NOT_RUNNING ]]; then
        if [[ $MONITOR_STATD -ge $LEVEL2 ]]; then
            restartStatd
        else
            alertmsg statd
        fi
    fi
}


# Make a single attempt to restart statd: kill any running rpc.statd, start
# /etc/init.d/nfslock, wait RESTART_TIMEOUT seconds and check the result.
# If statd is still down: failover at MONITOR_STATD LEVEL3, otherwise alert.
# Does nothing when /sbin/rpc.statd does not exist.
restartStatd() {
    if [ ! -f /sbin/rpc.statd ]; then
        return
    fi

    # Kill the statd process if exists (important for registering with portmap)
    debuglog kill -9 $(/sbin/pidof -x /sbin/rpc.statd)
    debuglog /etc/init.d/nfslock start
    sleep $RESTART_TIMEOUT
    getStatus /sbin/rpc.statd
    if [[ $? == $SERVICE_NOT_RUNNING ]]; then
        if [[ $MONITOR_STATD -eq $LEVEL3 ]]; then
            failovermsg statd
            invokeFailover
        else
            alertmsg statd
        fi
    fi
}

######################################
# Monitoring rsh/ssh daemon
######################################

# Try to restart the remote-shell service named by rshService as many times
# as checkRestart allows for SSHD (0 times when MONITOR_SSHD is below
# LEVEL2). Each attempt runs the init script in the background, waits
# attemptNo * RESTART_TIMEOUT seconds and checks the service status; a
# failed attempt's restart process is killed before retrying.
# The output of the init script goes to /tmp/<service>_restart.out.
# When no attempt succeeds: failover if MONITOR_SSHD is LEVEL3, otherwise
# only an alert.
startSshd() {
    checkRestart SSHD  # yields 0 when MONITOR_SSHD < LEVEL2
    numberOfRestarts=$?
    service=$(rshService)
    attemptNo=0
    debugmsg2 "About to restart sshd (up to $numberOfRestarts times as configured,
                 note: 0 means that the configuration level is lower than LEVEL2)"
    while [[ $numberOfRestarts -gt 0 ]]; do
        attemptNo=$((attemptNo+1))
        # Braces are required: $service_restart would name a different
        # (unset) variable.
        /etc/init.d/$service restart > /tmp/${service}_restart.out 2>&1 &
        restartPID=$!
        sleep $((attemptNo*RESTART_TIMEOUT))
        debugmsg2 "Try to restart $service (attempt $attemptNo), checking the status:"
        getStatus $service
        restartStatus=$?
        debugmsg2 "The $service status after the $attemptNo attempt of restart is $restartStatus"
        if [[ $restartStatus == $SERVICE_RUNNING ]]; then
            return
        fi
        debugmsg "Restarting $service failed/hangs, about to kill the restart process.
         The output of restart attempt is in /tmp/${service}_restart.out"
        kill -9 $restartPID 2>&1
        # Count down the loop variable itself; otherwise the loop never ends
        # while the service stays down.
        numberOfRestarts=$((numberOfRestarts-1))
    done
    # Failed to restart service, check for failover configuration parameters
    msg "Failed to restart the $service process (tried $attemptNo times as configured)"
    if [[ $MONITOR_SSHD == $LEVEL3 ]]; then
        failovermsg $service
        invokeFailover
    else
        alertmsg $service
    fi
}

# Main entry for monitoring the remote-shell daemon: check the service named
# by rshService and, if it is not running, hand over to startSshd.
monitorSshd() {
    ifGPFSDownExit $GPFS_IP
    service=$(rshService)
    getStatus $service
    sshdStatus=$?
    [[ $sshdStatus == $SERVICE_RUNNING ]] && return $sshdStatus
    if [[ $sshdStatus == $SERVICE_NOT_RUNNING ]]; then
        startSshd
    fi
}

######################################
# Monitoring gpfs daemon
######################################
# Check GPFS through ifGPFSDownExit; if that call reports a non-zero status,
# announce the failover and leave the script.
monitorGPFS() {
    if ifGPFSDownExit $GPFS_IP; then
        return 0
    fi
    failovermsg GPFS
    exit
}

######################################
# Main
######################################
# Monitoring loop. Every MONITOR_INTERVAL seconds each enabled monitor is run
# in a fixed order: GPFS, network, portmap, nfsd, mountd, locking, sshd.
# The loop never ends by itself; it stops when a monitor exits the process
# or when the process is killed.
nfsMonitor() {
    GPFS_IP=$(myGPFSIP)

    while :; do
        sleep $MONITOR_INTERVAL

        # --- GPFS ---
        # NOTE(review): all other monitors are enabled by "level > LEVEL0";
        # this one requires "MONITOR_GPFS > LEVEL3" and MONITOR_GPFS has no
        # default among the MONITOR_* settings at the top of this file, so
        # this branch looks disabled. Confirm that this is intended.
        if [[ $MONITOR_GPFS -gt $LEVEL3 ]]; then
            debugmsg2 "==========GPFS monitoring==============="
            monitorGPFS
            debugmsg2 "done monitoring GPFS"
        fi

        # --- Network ---
        if [[ $MONITOR_NETWORK -gt $LEVEL0 ]]; then
            debugmsg2 "==========NW monitoring==============="
            monitorNetwork
            debugmsg2 "done monitoring the network"
        fi

        # --- portmap ---
        # Every rpc service has to be registered with portmap before new
        # clients can reach it. portmap is tested only once per round; if it
        # fails later, services may be running yet unavailable to new
        # clients until portmap is restarted and the rpc processes have
        # re-registered with it.
        if [[ $MONITOR_PORTMAP -gt $LEVEL0 ]]; then
            debugmsg2 "===========portmap monitoring=============="
            monitorPortmap
            debugmsg2 "done monitoring portmap"
        fi

        # --- nfsd ---
        # Monitored only when configured. Tests:
        #   (test 1) the nfsd process is running
        #   (test 2) rpc-nfs activity is observed
        #   (test 3) a null rpc sent to the nfsd service is answered
        # Order: run (test 1); if the process is not running go to Action.
        # Otherwise run (test 2); activity means Done. Without activity run
        # (test 3); failure means Action, success means Done.
        # Action: if nfsd is configured to be restarted, restart it. If all
        #   restart attempts fail, the node is declared "dead" for nfs
        #   serving; if nfsd is configured as "failover" the node is failed
        #   over to another node and a user level alert is invoked.
        # Done: nfsd is up and running, continue.
        if [[ $MONITOR_NFSD  -gt $LEVEL0 ]]; then
            debugmsg2 "==========nfsd monitoring==============="
            monitorNfsd
            debugmsg2 "done monitoring nfsd"
        fi

        # --- mountd ---
        # Monitored only when configured. Single test: the mountd process is
        # running. If it is, Done; if not, Action.
        # Action: if mountd is configured to be restarted, restart it. If
        #   all restart attempts fail, the node is declared "dead" for nfs
        #   serving; if mountd is configured as "failover" the node is
        #   failed over to another node and an alert is sent.
        if [[ $MONITOR_MOUNTD -gt $LEVEL0 ]]; then
            debugmsg2 "============mountd monitoring============="
            monitorMountd
            debugmsg2 "done monitoring mountd"
        fi

        # --- locking (lockd and statd) ---
        # Monitored only when configured. Tests:
        #   (test 1) the lockd process is running
        #   (test 2) the statd process is running
        # lockd down: the node is declared "dead" for nfs serving; if
        #   locking is configured as "failover" the node is failed over to
        #   another node and an alert is sent.
        # statd down: restarted if configured to be restarted. If the
        #   restart fails, the node is declared "dead" for nfs serving and,
        #   if configured, failed over to another node with an alert.
        if [[ $MONITOR_STATD -gt $LEVEL0 ]]; then
            debugmsg2 "==========statd  monitoring==============="
            monitorLocking
            debugmsg2 "done monitoring statd"
        fi

        # --- sshd ---
        # Monitored only when configured. Single test: the sshd process is
        # running. If it is, Done; if not, Action.
        # Action: if sshd is configured to be restarted, restart it. If all
        #   restart attempts fail, the node is declared "dead" for ssh
        #   serving; if sshd is configured as "failover" the node is failed
        #   over to another node and an alert is sent.
        if [[ $MONITOR_SSHD -gt $LEVEL0 ]]; then
            debugmsg2 "============sshd monitoring============="
            monitorSshd
            debugmsg2 "done monitoring sshd"
        fi

    done
}

# Stop monitoring: kill (SIGKILL) every other process found running
# ./nfsmonitor. This shell and its parent are excluded from the search.
stopNfsMonitor() {
    nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
    if [[ -n $nfsMonToolPid ]]; then
        kill -9 $nfsMonToolPid
        msg "Monitoring has stopped."
    else
        debugmsg2 "Warning: Couldn't find the monitoring process to stop"
    fi
}

# Start monitoring: launch the monitoring loop as a background job and log
# that it has started.
startNfsMonitor() {
    { nfsMonitor; } &
    msg "Monitoring has started."
}

# Restart monitoring: stop any running monitor, then start a new one.
restartNfsMonitor() {
    stopNfsMonitor; startNfsMonitor
}

# Report through debugmsg whether another process running ./nfsmonitor
# exists. This shell and its parent are excluded from the search.
statusNfsMonitor() {
    nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
    if [[ -n $nfsMonToolPid ]]; then
        debugmsg "nfsmonitor is running"
    else
        debugmsg "nfsmonitor is not running"
    fi
}

#################################
# Main program
#################################
# Dispatch on the first command-line argument. Anything that is not a known
# action prints the usage text and exits with status 1.
case "$1" in
    -s|start)
        startNfsMonitor
        ;;
    -e|stop)
        stopNfsMonitor
        ;;
    -r|restart)
        restartNfsMonitor
        ;;
    -q|status)
        statusNfsMonitor
        ;;
    *)
        # The usage text lists exactly the words accepted above.
        echo "Usage: $0 [-s|start|-e|stop|-r|restart|-q|status]"
        exit 1
        ;;
esac
Note: See TracBrowser for help on using the repository browser.