#!/bin/ksh
# @(#)23	1.12.1.10  src/avs/fs/mmfs/samples/nfscluster/nfsmonitor, mmfs, avs_rgpfs24, rgpfs24s011a 3/20/07 16:46:39
#

# HA-NFS monitoring
# Usage: nfsmonitor [start|stop|restart|status]

# Monitor levels: the meaning of each value a MONITOR_<service> setting can take
LEVEL0=0 # NO_MONITORING
LEVEL1=1 # ALERT_ONLY
LEVEL2=2 # RESTART (not applicable for all services)
LEVEL3=3 # FAILOVER

# Tunables. A value that is already set (for example exported by the caller)
# is kept; otherwise the default shown here is assigned.
: "${MONITOR_INTERVAL=15}"
: "${MONITOR_NETWORK=3}"
: "${MONITOR_PORTMAP=3}"
: "${MONITOR_NFSD=3}"
: "${MONITOR_MOUNTD=3}"
: "${MONITOR_STATD=3}"
: "${MONITOR_SSHD=3}"

# Timing constants, in seconds
NFS_RPC_ACT_SAMPLE_INTERVAL=3  # gap between the two /proc samples of nfsd counters
RESTART_TIMEOUT=7              # wait after a (re)start before re-checking status

# Number of start/restart attempts reported by checkRestart
NUMBER_OF_RESTARTS=3

# Status codes returned by functions of this file (internal use only)
SERVICE_RUNNING=1
SERVICE_NOT_RUNNING=2
SERVICE_UNKNOWN=3

# The helper library is mandatory: every function below calls into it.
# When it is absent, say so and stop (the exit status is 0).
[ -f /var/mmfs/etc/nfsfuncs ] || {
    echo "$0: Can't find NFS functions in /var/mmfs/etc"
    exit 0
}
. /var/mmfs/etc/nfsfuncs

# Log an "inactive service" alert through msg and, when the executable hook
# /var/mmfs/etc/alert exists, pass the same text to it.
# Usage: alert <service> <action message> [comment words...]
#   service        - name of the service found inactive
#   action message - what the monitor does about it
#   comment words  - optional detail, appended in parentheses
# Globals: GPFS_IP (read) - second argument handed to the alert hook
#          service, actionmsg, comment, message (written; nothing is local here)
alert() {
    service=$1
    actionmsg=$2
    shift 2
    # Variables are global in this file, so clear the comment explicitly;
    # otherwise the comment of an earlier alert would be reported again.
    comment=""
    [ -n "$*" ] && comment="($*)"
    message="Monitoring detected $service is inactive, $actionmsg"
    [ -n "$comment" ] && message="$message $comment"
    msg "$message"
    # The customer-provided alert script is optional
    [ ! -e /var/mmfs/etc/alert ] && return
    /var/mmfs/etc/alert "$message" "$GPFS_IP"
}

# Raise an alert for a service that is configured as alert-only.
# Usage: alertmsg <service> [comment words...]
# Arguments are forwarded quoted so that they reach alert unchanged
# (no re-splitting on whitespace and no filename expansion).
alertmsg() {
    service=$1
    shift
    alert "$service" "no action taken as configured" "$@"
}

# Raise an alert announcing that failover is being initiated for a service.
# Usage: failovermsg <service> [comment words...]
# Arguments are forwarded quoted so that they reach alert unchanged
# (no re-splitting on whitespace and no filename expansion).
failovermsg() {
    service=$1
    shift
    alert "$service" "node failure initiated as configured" "$@"
}

# Kill (SIGKILL) every process running the NFS init script, excluding this
# shell and its parent. Used when a start/restart attempt appears to hang.
# The init script is /etc/init.d/nfsserver when that file exists, otherwise
# /etc/init.d/nfs; processes are looked up by full path first, then by
# base name.
nfsToolKill() {
    NFSTOOL=/etc/init.d/nfs
    [ -f /etc/init.d/nfsserver ] && NFSTOOL=/etc/init.d/nfsserver
    nfsToolBase=${NFSTOOL##*/}

    nfsToolPid=$(pidof -o $$ -o $PPID -o %PPID -x $NFSTOOL ||
        pidof -o $$ -o $PPID -o %PPID -x $nfsToolBase)
    debugmsg2 "The pids of $NFSTOOL are $nfsToolPid"

    # Nothing to kill
    [[ -z $nfsToolPid ]] && return
    kill -9 $nfsToolPid
}

# Tell the caller how many restart attempts a service is allowed.
# Usage: checkRestart <SERVICE>   (upper-case suffix of a MONITOR_<SERVICE> setting)
# Returns: NUMBER_OF_RESTARTS when the service's monitor level is LEVEL2 or
#          higher, 0 otherwise (the service must not be restarted).
checkRestart() {
    service=$1
    # Indirect lookup: read the variable named MONITOR_<service>
    eval "service_level=\$MONITOR_$service"
    debugmsg2 "$service level is $service_level"
    if [ $service_level -ge $LEVEL2 ]; then
        return $NUMBER_OF_RESTARTS
    fi
    return 0
}

# Translate the result of checkStatus into this file's status codes.
# Usage: getStatus <service>
# Returns: SERVICE_RUNNING when checkStatus exits 0,
#          SERVICE_NOT_RUNNING for any other exit status.
getStatus() {
    service=$1
    checkStatus $service
    status=$?
    if [ $status -ne 0 ]; then
        debugmsg "$service is not running (status $status)"
        return $SERVICE_NOT_RUNNING
    fi
    debugmsg2 "$service is running"
    return $SERVICE_RUNNING
}

# Take this node out of NFS service so that failover happens, then terminate
# the monitor: stop NFS, stop GPFS, exit. This function never returns.
invokeFailover() {
    debugmsg "Invoking failover..."
    # Create the marker file /tmp/ha-nfs-reboot (errors are discarded).
    # NOTE(review): the consumer of this marker is not visible in this file.
    debuglog2 touch /tmp/ha-nfs-reboot 2> /dev/null
    # Stop nfs gracefully to prevent client from getting ESTALE
    stop.nfs
    # Kill the gpfs daemon on the node to invoke failover
    debugmsg "Stopping GPFS..."
    /etc/init.d/gpfs stop
    # Ends the calling shell (the monitor process) as well
    exit
}

# Same as invokeFailover, but the node is rebooted after GPFS has been
# stopped (used when portmap is down). This function never returns.
# Unlike invokeFailover, no /tmp/ha-nfs-reboot marker is created here.
invokeFailoverReboot() {
    debugmsg "Invoking failover with reboot..."
    # Stop nfs gracefully to prevent client from getting ESTALE
    stop.nfs
    # Kill the gpfs daemon on the node to invoke failover
    debugmsg "Stopping GPFS before reboot..."
    /etc/init.d/gpfs stop
    reboot
    # Ends the calling shell (the monitor process) as well
    exit
}

#################
# Monitoring nfsd
#################

# Sample the NFS v2/v3 per-procedure call counters in /proc/net/rpc/nfsd
# twice, NFS_RPC_ACT_SAMPLE_INTERVAL seconds apart, and compare the samples
# to detect nfsd rpc activity.
# Returns: SERVICE_RUNNING if at least one counter increased
#          SERVICE_UNKNOWN if the proc file is missing or no counter increased
#                          (no conclusion can be drawn)
# A counter that is absent from a sample (for example, the proc2 line is not
# present at all) is treated as 0 so that the subtraction is always valid
# arithmetic.
detectNfsdActivity() {
    # Field names of the proc2/proc3 lines; used for loop bounds and messages
    set -A v2procs NAME COUNT NULL GETATTR SETATTR ROOT LOOKUP READLINK \
                   READ WRCACHE WRITE CREATE REMOVE RENAME \
                   LINK SYMLINK MKDIR RMDIR READDIR FSSTAT
    set -A v3procs NAME COUNT NULL GETATTR SETATTR LOOKUP ACCESS READLINK \
                   READ WRITE CREATE MKDIR SYMLINK MKNOD REMOVE RMDIR RENAME \
                   LINK READDIR READDIRPLUS FSSTAT FSINFO PATHCONF COMMIT

    procfile=/proc/net/rpc/nfsd
    if [ ! -f $procfile ]; then
        msg "Monitoring could not find /proc/net/rpc/nfsd"
        return $SERVICE_UNKNOWN
    fi
    # Sample v2/v3 activity (using proc) in the next NFS_RPC_ACT_SAMPLE_INTERVAL sec
    set -A v2procs1 $(grep -w proc2 $procfile)
    set -A v3procs1 $(grep -w proc3 $procfile)
    debugmsg2 "Sleeping for $NFS_RPC_ACT_SAMPLE_INTERVAL"
    sleep $NFS_RPC_ACT_SAMPLE_INTERVAL
    set -A v2procs2 $(grep -w proc2 $procfile)
    set -A v3procs2 $(grep -w proc3 $procfile)

    # Use the samples to detect activity
    p=2 # skipping name and count fields
    n=${#v2procs[@]}
    while [ $p -lt $n ]; do
        activity=$(( ${v2procs2[$p]:-0} - ${v2procs1[$p]:-0} ))
        if [ $activity -gt 0 ]; then
            debugmsg2 "nfsd activity detected: $activity ${v2procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
            return $SERVICE_RUNNING
        fi
        p=$((p+1))
    done
    p=2 # skipping name and count fields
    n=${#v3procs[@]}
    while [ $p -lt $n ]; do
        activity=$(( ${v3procs2[$p]:-0} - ${v3procs1[$p]:-0} ))
        if [ $activity -gt 0 ]; then
            debugmsg2 "nfsd activity detected: $activity ${v3procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
            return $SERVICE_RUNNING
        fi
        p=$((p+1))
    done
    # no activity detected
    debugmsg2 "Could not detect nfsd activity using /proc"
    return $SERVICE_UNKNOWN
}

# If nfsd is configured to be restarted, attempt to start it as many times
# as checkRestart allows. After each attempt the function waits
# RESTART_TIMEOUT seconds and checks the nfsd status; on success it returns.
# A failed or hanging attempt is cleaned up with nfsToolKill before retrying.
# Note that portmap has to be running in order to restart nfsd.
# When every attempt has failed (or no restart is configured), failover is
# invoked if MONITOR_NFSD is LEVEL3, otherwise only an alert is raised.
# For nfsd, we use the /etc/init.d/nfs(RH)|nfsserver(SUSE) utility since it stops
# the service before restarting it (which is important to make sure that nfsd
# re-registers with portmap, for example, in the case of restarting nfsd after
# restarting portmap). This utility also reloads /etc/exports.
startNfsd() {
    checkRestart NFSD # This call takes care of MONITOR_NFSD < LEVEL2 (numberOfRestarts=0)
    numberOfRestarts=$?
    attemptNo=0
    debugmsg2 "About to start nfsd (up to $numberOfRestarts times)"
    while [ $numberOfRestarts -gt 0 ]; do
        attemptNo=$((attemptNo+1))
        debugmsg2 "In startnfsd, attempt number = $attemptNo"
        nfsService start
        # give the restart/start chance to complete
        sleep $RESTART_TIMEOUT
        getStatus nfsd
        status=$?
        debugmsg2 "The nfsd status after $attemptNo attempts to start is $status"
        [[ $status == $SERVICE_RUNNING ]] && return
        # start attempt has failed/hangs -> kill the process and retry
        debugmsg "start nfsd failed/hangs, about to kill the start process."
        nfsToolKill
        # Count this attempt against the budget; without this the loop
        # would never end while nfsd stays down.
        numberOfRestarts=$((numberOfRestarts-1))
    done

    # If we get here, it means that all start attempts have failed
    # Failed to restart nfsd, check for failover configuration parameters
    if [[ $MONITOR_NFSD -eq $LEVEL3 ]]; then
        failovermsg nfsd
        invokeFailover
    else
        alertmsg nfsd
    fi
}

# Probe nfsd by sending a null rpc to NFS v3 on the local host name, using
# rpcinfo over UDP (run through debuglog2).
# Returns: SERVICE_RUNNING when the probe command succeeds,
#          SERVICE_NOT_RUNNING otherwise.
nfsdNullRpcTest() {
    hostname=$(hostname)
    if ! debuglog2 rpcinfo -u $hostname nfs 3; then
        msg "Sent nullrpc to nfsd v3 using rpcinfo, the service is not available (it maybe because portmap was started after nfsd, or that rpcinfo is not installed)"
        return $SERVICE_NOT_RUNNING
    fi
    debugmsg2 "Sent nullrpc to nfsd v3 using rpcinfo, the service is up and running"
    return $SERVICE_RUNNING
}

# Run the nfsd checks and act according to the monitoring level.
#   1. process check (getStatus nfsd): when not running, try to start nfsd.
#      If that start succeeds, the deeper checks are left for the next round.
#   2. when running, look for rpc activity (detectNfsdActivity); activity
#      means nfsd is healthy.
#   3. when activity is inconclusive, send a null rpc (nfsdNullRpcTest); a
#      failed probe triggers startNfsd.
monitorNfsd() {
    ifGPFSDownExit $GPFS_IP

    # Check that the nfsd process is running
    getStatus nfsd
    runStatus=$?

    if [[ $runStatus == $SERVICE_NOT_RUNNING ]]; then
        startNfsd
        return
    fi
    [[ $runStatus == $SERVICE_RUNNING ]] || return 0

    debugmsg2 "Perform more tests, to make sure that nfsd is functioning"
    detectNfsdActivity
    case $? in
        $SERVICE_RUNNING) return ;;
        $SERVICE_UNKNOWN) ;;          # inconclusive: fall through to the rpc probe
        *) return 0 ;;
    esac

    nfsdNullRpcTest
    case $? in
        $SERVICE_NOT_RUNNING) startNfsd ;;
        $SERVICE_RUNNING) return ;;
    esac
}

####################
# Monitoring mountd
####################
# Try to bring mountd back, up to the number of attempts checkRestart allows
# (0 attempts when MONITOR_MOUNTD is below LEVEL2). After each attempt wait
# RESTART_TIMEOUT seconds and check the status; return as soon as mountd runs.
# A failed attempt is cleaned up (kill of the recorded background pid when
# /sbin/startproc exists, nfsToolKill otherwise) before the next one.
# When all attempts fail: failover if MONITOR_MOUNTD equals LEVEL3, otherwise
# an alert only.
restartMountd() {
    checkRestart MOUNTD
    numberOfRestarts=$?
    attemptNo=0
    debugmsg2 "About to restart mountd (up to $numberOfRestarts times as configured, note: 0 means that the configuration level is lower than LEVEL2)"
    while [[ $numberOfRestarts -gt 0 ]]; do
        attemptNo=$((attemptNo+1))
        startMountd
        restartPID=$!
        debugmsg2 "restartPID=$restartPID"
        # let the restart finish before looking at the result
        sleep $RESTART_TIMEOUT
        debugmsg2 "Try to restart mountd (attempt $attemptNo), checking the status:"
        getStatus /usr/sbin/rpc.mountd
        case $? in
            $SERVICE_RUNNING)
                debugmsg2 "Succeeded to restart the mountd service at $attemptNo attempt"
                return
                ;;
        esac
        # the attempt failed or hangs: get rid of it, then retry
        debugmsg "Restarting mountd failed/hangs, about to kill the restart process."
        if [ ! -e /sbin/startproc ]; then
            nfsToolKill # FIX
        else
            debuglog kill -9 $restartPID
        fi
        numberOfRestarts=$((numberOfRestarts-1))
    done

    # Reaching this point means no attempt succeeded
    msg "Failed to restart the mountd (tried $attemptNo times as configured)"
    if [[ $MONITOR_MOUNTD != $LEVEL3 ]]; then
        alertmsg mountd
        return
    fi
    failovermsg mountd
    invokeFailover
}


# Entry point for mountd monitoring: check that /usr/sbin/rpc.mountd is
# running and call restartMountd when it is not. Any other status is ignored.
monitorMountd() {
    ifGPFSDownExit $GPFS_IP
    getStatus /usr/sbin/rpc.mountd
    mountdStatus=$?
    if [[ $mountdStatus == $SERVICE_NOT_RUNNING ]]; then
        restartMountd
    elif [[ $mountdStatus == $SERVICE_RUNNING ]]; then
        return $mountdStatus
    fi
}

########################
# Monitoring the network
########################

# Monitor the network.
# TEST1: every interface that carries an NFS IP address of this node must be
#        listed by mmgetifconf and have its link connected (checkLinkStatus);
#        a violation is reported through nwFailoverCondition.
# TEST2: every NFS IP address of this node, and of the nodes this node has
#        taken over (getFailedNodes), must be configured; a missing address
#        is brought up again.
# TEST3: ping the default gateway.
# Addresses are looked up as fixed strings on word boundaries, so that for
# example 192.168.1.1 is not considered present merely because 192.168.1.10 is.
monitorNetwork() {
    ifGPFSDownExit $GPFS_IP
    # TEST1: make sure that all interfaces that are used for nfs serving are connected
    nfsIfs=$(getNfsIFs $GPFS_IP)
    if [[ -z $nfsIfs ]]; then
        msg "No configured NFS IP addresses detected on any of the node's interfaces"
        nwFailoverCondition "no configured nfs interfaces"
    else
        for eth in $nfsIfs; do
            tmp=$(mmgetifconf | grep -w -- "$eth" | awk '{print $1}')
            if [[ -z $tmp ]]; then
                nwFailoverCondition "interface is down"
            fi

            checkLinkStatus $eth || nwFailoverCondition "link is not connected"
        done
    fi

    # TEST2: check that all NFS IP addresses are enabled
    nfsIPs=$(getNfsIPs $GPFS_IP)
    for ip in $nfsIPs; do
        mmgetifconf | grep -qwF -- "$ip" && continue
        debugmsg "monitor detected $ip is down, restarting"
        ifUp $ip
    done

    # Now check that all NFS IP addresses for failover nodes are enabled
    nfsIPs=
    for ip in $(getFailedNodes $GPFS_IP); do
        nfsIPs="$nfsIPs $(getNfsIPs $ip)"
    done
    for ip in $nfsIPs; do
        mmgetifconf | grep -qwF -- "$ip" && continue
        [ "$(IPaddr $ip monitor)" = "OK" ] && continue
        debugmsg "monitor detected $ip is down, restarting"
        debuglog IPaddr $ip start
    done

    # TEST3: ping the gateway
    pingDefaultGateway
}

# Print, one per line, the base interface of every NFS IP address configured
# for the node with the given GPFS IP address (for eth0:1 the base is eth0).
# Usage: getNfsIFs <gpfs ip>
# A node without any NFS IP address is reported through nwFailoverCondition;
# an address that is not assigned to an interface is logged and skipped.
getNfsIFs () {
    thisGpfsIP=$1
    eth=""
    # NFS ip addresses registered for this gpfs ip address
    nfsIPList=$(getNfsIPs $thisGpfsIP)
    # No entry for the node in nfs.nodes
    [[ -z $nfsIPList ]] && {
        debugmsg "ALERT: No entry was found in nfs.nodes for this node (gpfs ip address:$thisGpfsIP)"
        nwFailoverCondition "no configured nfs interfaces"
    }
    debugmsg2 "The list of ips is $nfsIPList"
    for nfsIP in $nfsIPList; do
        # keep only what precedes the first ':' (alias suffix removed)
        origEth=$(getEthInterface $nfsIP | awk -F: '{print $1}')
        [[ -z $origEth ]] && {
            debugmsg "ALERT: The nfs ip address $nfsIP is not assigned an interface"
            continue
        }
        echo $origEth
        debugmsg2 "the actual interface for $nfsIP is $origEth"
    done
}

# Print the interface that carries a given IP address.
# Usage: getEthInterface <ip address>
# mmgetifconf is expected to print one "interface ip mask" line per
# interface; the interface of the last line whose ip field equals the
# argument is printed. An empty line is printed when no interface matches.
# The listing is piped straight into awk: no temporary file is created and
# no file descriptor is left open.
# NOTE: The original function was copied from Marc and was changed.
#       May require future integration.
getEthInterface() {
    # ($2 "") forces a string comparison of the address field
    eth=$(mmgetifconf |
        awk -v want="$1" '($2 "") == (want "") { iface = $1 } END { print iface }')
    # eth may be empty if there is no interface associated with this ip address
    echo $eth
}

# Locate and ping the default gateway.
# On failure, alert and invoke failover if configured (nwFailoverCondition).
# Nothing is done when no gateway route exists or when the gateway address
# belongs to this machine.
# Gateway selection: among the "route -n" lines flagged UG, the gateway of
# the default route (destination 0.0.0.0) is used; without a default route
# the first UG gateway is used. Exactly one address is ever pinged.
# The local-address check reads mmgetifconf through a pipe, so no temporary
# file or open descriptor is left behind on any return path.
pingDefaultGateway() {
    gwIP=$(route -n | awk '
        /UG/ {
            if (first == "") first = $2
            if ($1 == "0.0.0.0" || $1 == "default") { gw = $2; exit }
        }
        END { print (gw != "" ? gw : first) }')
    [[ -z $gwIP ]] && return

    # Make sure the local machine is not set as the default gateway
    if mmgetifconf |
        awk -v gw="$gwIP" '($2 "") == (gw "") { found = 1 } END { exit !found }'; then
        return 0
    fi

    # try to ping the gateway
    ping -c 1 -w 5 $gwIP > /dev/null
    outPing=$?
    if [ $outPing -ne 0 ]; then
        msg "Failed to ping the gateway at $gwIP (err $outPing)"
        nwFailoverCondition "can't ping the gateway"
    else
        debugmsg2 "Succeeded to ping the gateway at $gwIP (ping returns $outPing)"
    fi
}

# React to a network problem according to MONITOR_NETWORK.
# Usage: nwFailoverCondition <description>
# LEVEL3: announce and invoke failover; any other level: alert only.
nwFailoverCondition() {
    message=$1
    debugmsg2 nwFailoverCondition $message
    [[ $MONITOR_NETWORK -eq $LEVEL3 ]] || {
        alertmsg network $message
        return
    }
    failovermsg network $message
    invokeFailover
}


########################
# Monitoring portmap
########################
# Check that /sbin/portmap is up. When it is not: alert only if
# MONITOR_PORTMAP is LEVEL1, otherwise announce failover and fail over with
# a reboot (portmap itself is never restarted here).
# TODO: we can test if this node is mounting anything, and if not
# we can restart portmap, and re-register the nfs processes with it.
# Currently, if this machine is mounting anything, lockd does not re-register with portmap.
monitorPortmap() {
    getStatus /sbin/portmap
    portmapStatus=$?
    [[ $portmapStatus == $SERVICE_RUNNING ]] && return $portmapStatus
    [[ $portmapStatus == $SERVICE_NOT_RUNNING ]] || return 0

    if [[ $MONITOR_PORTMAP -eq $LEVEL1 ]]; then
        alertmsg portmap
    else
        failovermsg portmap
        invokeFailoverReboot
    fi
}

######################################
# Monitoring locking (lockd and statd)
######################################

# Entry point for lock-service monitoring (lockd, then statd). Both services
# are governed by MONITOR_STATD.
#   lockd down: failover at LEVEL3, otherwise alert (lockd is never restarted).
#   statd down (checked only when /sbin/rpc.statd exists): restartStatd at
#   LEVEL2 or above, otherwise alert.
monitorLocking() {
    ifGPFSDownExit $GPFS_IP

    # lockd
    getStatus lockd
    if [[ $? == $SERVICE_NOT_RUNNING ]]; then
        if [[ $MONITOR_STATD -eq $LEVEL3 ]]; then
            failovermsg lockd
            invokeFailover
        else
            alertmsg lockd
        fi
    fi

    # statd
    [ -f /sbin/rpc.statd ] || return 0
    getStatus /sbin/rpc.statd
    [[ $? == $SERVICE_NOT_RUNNING ]] || return 0
    if [[ $MONITOR_STATD -ge $LEVEL2 ]]; then
        restartStatd
    else
        alertmsg statd
    fi
}


# Make one attempt to restart statd: kill any running /sbin/rpc.statd, start
# it again through /etc/init.d/nfslock, wait RESTART_TIMEOUT seconds and
# check the result. If statd is still down: failover when MONITOR_STATD is
# LEVEL3, otherwise alert. Does nothing when /sbin/rpc.statd does not exist.
restartStatd() {
    if [ ! -f /sbin/rpc.statd ]; then
        return
    fi

    # A stale statd must go first (important for registering with portmap)
    debuglog kill -9 $(/sbin/pidof -x /sbin/rpc.statd)
    debuglog /etc/init.d/nfslock start
    sleep $RESTART_TIMEOUT

    getStatus /sbin/rpc.statd
    [[ $? == $SERVICE_NOT_RUNNING ]] || return 0
    if [[ $MONITOR_STATD -eq $LEVEL3 ]]; then
        failovermsg statd
        invokeFailover
    else
        alertmsg statd
    fi
}

######################################
# Monitoring rsh/ssh daemon
######################################

# Try to restart the remote-shell daemon named by rshService, up to the
# number of attempts checkRestart allows (0 when MONITOR_SSHD is below LEVEL2).
# Each attempt runs "/etc/init.d/<service> restart" in the background with
# its output captured in /tmp/<service>_restart.out, waits
# attemptNo*RESTART_TIMEOUT seconds and checks the status; the function
# returns as soon as the service runs. A failed attempt is killed and counted.
# When all attempts fail: failover if MONITOR_SSHD equals LEVEL3, otherwise
# an alert only.
startSshd() {
    checkRestart SSHD  # This call takes care of MONITOR_SSHD < LEVEL2 (numberOfRestarts=0)
    numberOfRestarts=$?
    service=$(rshService)
    # Braces are required: "$service_restart" would name a different variable
    restartLog=/tmp/${service}_restart.out
    attemptNo=0
    debugmsg2 "About to restart sshd (up to $numberOfRestarts times as configured, 
                 note: 0 means that the configuration level is lower than LEVEL2)"
    while [[ $numberOfRestarts -gt 0 ]]; do
        attemptNo=$((attemptNo+1))
        /etc/init.d/$service restart > "$restartLog" 2>&1 &
        restartPID=$!
        # later attempts are given proportionally more time
        sleep $((attemptNo*RESTART_TIMEOUT))
        debugmsg2 "Try to restart $service (attempt $attemptNo), checking the status:"
        getStatus $service
        restartStatus=$?
        debugmsg2 "The $service status after the $attemptNo attempt of restart is $restartStatus"
        if [[ $restartStatus == $SERVICE_RUNNING ]]; then
            return
        fi
        debugmsg "Restarting $service failed/hangs, about to kill the restart process.
		     The output of restart attempt is in $restartLog"
        kill -9 $restartPID 2>&1
        # Count this attempt against the budget; without this the loop
        # would never end while the service stays down.
        numberOfRestarts=$((numberOfRestarts-1))
    done
    # Failed to restart service, check for failover configuration parameters
    msg "Failed to restart the $service process (tried $attemptNo times as configured)"
    if [[ $MONITOR_SSHD == $LEVEL3 ]]; then
        failovermsg $service
        invokeFailover
    else
        alertmsg $service
    fi
}

# Entry point for remote-shell daemon monitoring: look up the service name
# with rshService, check that it is running and call startSshd when it is
# not. Any other status is ignored.
monitorSshd() {
    ifGPFSDownExit $GPFS_IP
    service=$(rshService)
    getStatus $service
    sshdStatus=$?
    if [[ $sshdStatus == $SERVICE_NOT_RUNNING ]]; then
        startSshd
    elif [[ $sshdStatus == $SERVICE_RUNNING ]]; then
        return $sshdStatus
    fi
}

######################################
# Monitoring gpfs daemon
######################################
# GPFS check: when ifGPFSDownExit reports a problem (non-zero status),
# announce the failover and terminate the monitor.
monitorGPFS() {
    ifGPFSDownExit $GPFS_IP && return
    failovermsg GPFS
    exit
}

######################################
# Main
######################################
# The monitoring loop; it never returns on its own.
# Every MONITOR_INTERVAL seconds the checks below run, always in this order,
# each one only when its MONITOR_<service> level passes the threshold shown:
#
#   GPFS     (MONITOR_GPFS    > LEVEL3)  monitorGPFS
#   network  (MONITOR_NETWORK > LEVEL0)  monitorNetwork
#   portmap  (MONITOR_PORTMAP > LEVEL0)  monitorPortmap
#   nfsd     (MONITOR_NFSD    > LEVEL0)  monitorNfsd
#   mountd   (MONITOR_MOUNTD  > LEVEL0)  monitorMountd
#   locking  (MONITOR_STATD   > LEVEL0)  monitorLocking (lockd and statd)
#   sshd     (MONITOR_SSHD    > LEVEL0)  monitorSshd
#
# portmap: all rpc services have to be registered with portmap for new
#   clients to reach them. portmap is tested once per round; if it fails
#   afterwards the services may be unavailable to new clients, although they
#   run, until portmap is restarted and the rpc processes re-register.
# nfsd: (test 1) the process is running, (test 2) rpc-nfs activity is seen,
#   (test 3) a null rpc is answered. Test 1 failing leads to the action;
#   otherwise test 2 succeeding ends the check; otherwise test 3 decides.
#   Action: restart nfsd if configured; when all restarts fail the node is
#   considered dead for nfs serving and, if configured for failover, is
#   failed over with a user level alert.
# mountd, sshd: the process is running, otherwise restart if configured;
#   when all restarts fail, fail over if configured and send an alert.
# locking: lockd down means failover if configured, plus an alert; statd
#   down means a restart if configured and, should that fail, failover if
#   configured, plus an alert.
nfsMonitor() {
    GPFS_IP=$(myGPFSIP)

    tempvar=1
    while [[ $tempvar == 1 ]]; do
        sleep $MONITOR_INTERVAL

        [[ $MONITOR_GPFS -gt $LEVEL3 ]] && {
            debugmsg2 "==========GPFS monitoring==============="
            monitorGPFS
            debugmsg2 "done monitoring GPFS"
        }

        [[ $MONITOR_NETWORK -gt $LEVEL0 ]] && {
            debugmsg2 "==========NW monitoring==============="
            monitorNetwork
            debugmsg2 "done monitoring the network"
        }

        [[ $MONITOR_PORTMAP -gt $LEVEL0 ]] && {
            debugmsg2 "===========portmap monitoring=============="
            monitorPortmap
            debugmsg2 "done monitoring portmap"
        }

        [[ $MONITOR_NFSD  -gt $LEVEL0 ]] && {
            debugmsg2 "==========nfsd monitoring==============="
            monitorNfsd
            debugmsg2 "done monitoring nfsd"
        }

        [[ $MONITOR_MOUNTD -gt $LEVEL0 ]] && {
            debugmsg2 "============mountd monitoring============="
            monitorMountd
            debugmsg2 "done monitoring mountd"
        }

        [[ $MONITOR_STATD -gt $LEVEL0 ]] && {
            debugmsg2 "==========statd  monitoring==============="
            monitorLocking
            debugmsg2 "done monitoring statd"
        }

        [[ $MONITOR_SSHD -gt $LEVEL0 ]] && {
            debugmsg2 "============sshd monitoring============="
            monitorSshd
            debugmsg2 "done monitoring sshd"
        }

    done
}

# Stop monitoring: SIGKILL every ./nfsmonitor process found by pidof, other
# than this shell and its parent. Only a debug warning is logged when no
# such process exists.
stopNfsMonitor() {
    nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
    [[ -n $nfsMonToolPid ]] || {
        debugmsg2 "Warning: Couldn't find the monitoring process to stop"
        return
    }
    kill -9 $nfsMonToolPid
    msg "Monitoring has stopped."
}

# Start monitoring: run the nfsMonitor loop as a background job and log that
# monitoring has started. Returns right after launching the job.
startNfsMonitor() {
    nfsMonitor &
    msg "Monitoring has started."
}

# Restart monitoring: stop any running monitor, then start a new one.
restartNfsMonitor() {
    stopNfsMonitor
    startNfsMonitor
}

# Report through debugmsg whether a ./nfsmonitor process (other than this
# shell and its parent) is currently running.
statusNfsMonitor() {
    nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
    if [[ -n $nfsMonToolPid ]]; then
        debugmsg "nfsmonitor is running"
    else
        debugmsg "nfsmonitor is not running"
    fi
}

#################################
# Main program
#################################
# Dispatch on the first command-line argument. Each action has a short
# option and a keyword form; anything else prints the usage text and exits
# with status 1. The usage text lists exactly the words accepted here.
case "$1" in
    -s|start)
        startNfsMonitor
        ;;
    -e|stop)
        stopNfsMonitor
        ;;
    -r|restart)
        restartNfsMonitor
        ;;
    -q|status)
        statusNfsMonitor
        ;;
    *)
        echo $"Usage: $0 [-s|start|-e|stop|-r|restart|-q|status]"
        exit 1
        ;;
esac
