[16] | 1 | #!/bin/ksh |
---|
| 2 | # @(#)23 1.12.1.10 src/avs/fs/mmfs/samples/nfscluster/nfsmonitor, mmfs, avs_rgpfs24, rgpfs24s011a 3/20/07 16:46:39 |
---|
| 3 | # |
---|
| 4 | |
---|
| 5 | # HA-NFS monitoring |
---|
| 6 | # Usage: nfsmonitor [start|stop|restart|status] |
---|
| 7 | |
---|
# Monitor levels
LEVEL0=0    # NO_MONITORING
LEVEL1=1    # ALERT_ONLY
LEVEL2=2    # RESTART (not applicable for all services)
LEVEL3=3    # FAILOVER

# Polling interval (seconds) and per-service monitor levels.
# A value already set in the environment (even an empty one) is kept;
# the default is only applied when the variable is unset.
: ${MONITOR_INTERVAL=15}
: ${MONITOR_NETWORK=3}
: ${MONITOR_PORTMAP=3}
: ${MONITOR_NFSD=3}
: ${MONITOR_MOUNTD=3}
: ${MONITOR_STATD=3}
: ${MONITOR_SSHD=3}

# All times below are in seconds
NFS_RPC_ACT_SAMPLE_INTERVAL=3   # gap between the two /proc samples
RESTART_TIMEOUT=7               # wait after a (re)start before checking status

# Internal flags
NUMBER_OF_RESTARTS=3            # restart attempts for services at LEVEL2 or above

# Function return values (used only within this file)
SERVICE_RUNNING=1
SERVICE_NOT_RUNNING=2
SERVICE_UNKNOWN=3
| 33 | |
---|
# Load the shared HA-NFS helper functions; every function below relies on them.
# NOTE(review): a missing helper file ends the script with status 0 - confirm
# that callers depend on this before changing it.
[ -f /var/mmfs/etc/nfsfuncs ] || {
    echo "$0: Can't find NFS functions in /var/mmfs/etc"
    exit 0
}
. /var/mmfs/etc/nfsfuncs
| 39 | |
---|
# Log an "inactive service" alert through msg and, when the optional hook
# /var/mmfs/etc/alert exists, pass the alert to it.
# Usage: alert <service> <action message> [comment words...]
#   service        - name of the service that was found inactive
#   action message - text describing the action being taken
#   comment words  - optional detail, appended to the message in parentheses
# The hook is invoked as: /var/mmfs/etc/alert <message> <GPFS_IP>
alert() {
    service=$1
    actionmsg=$2
    shift 2
    # Variables in this file are not function-local, so clear any comment left
    # behind by an earlier alert before deciding whether this one has a comment.
    comment=
    [ -n "$*" ] && comment="($*)"
    message="Monitoring detected $service is inactive, $actionmsg $comment"
    msg "$message"
    # The hook is optional; nothing more to do when it is not installed.
    [ ! -e /var/mmfs/etc/alert ] && return
    /var/mmfs/etc/alert "$message" "$GPFS_IP"
}
| 52 | |
---|
# Raise an alert for a service that is only monitored (no corrective action).
# Usage: alertmsg <service> [comment words...]
alertmsg() {
    service=$1
    shift
    # Quote the arguments so that an empty service name keeps its position and
    # comment words are not subject to filename expansion.
    alert "$service" "no action taken as configured" "$@"
}
| 58 | |
---|
# Raise an alert announcing that a node failure (failover) is being initiated.
# Usage: failovermsg <service> [comment words...]
failovermsg() {
    service=$1
    shift
    # Quote the arguments so that an empty service name keeps its position and
    # comment words are not subject to filename expansion.
    alert "$service" "node failure initiated as configured" "$@"
}
| 64 | |
---|
# Kill (SIGKILL) every process running the NFS init script, for the case where
# a start/restart of the service hangs.
# The init script is /etc/init.d/nfsserver when that file exists, otherwise
# /etc/init.d/nfs. This shell and its parent are excluded from the search.
nfsToolKill() {
    NFSTOOL=/etc/init.d/nfs
    if [ -f /etc/init.d/nfsserver ]; then
        NFSTOOL=/etc/init.d/nfsserver
    fi
    nfsToolBase=${NFSTOOL##*/}
    # Look up by full path first, then fall back to the bare script name.
    nfsToolPid=$(pidof -o $$ -o $PPID -o %PPID -x $NFSTOOL ||
                 pidof -o $$ -o $PPID -o %PPID -x $nfsToolBase)
    debugmsg2 "The pids of $NFSTOOL are $nfsToolPid"
    # Nothing to kill.
    [[ -z $nfsToolPid ]] && return
    kill -9 $nfsToolPid
}
| 82 | |
---|
# Report how many restart attempts a service is configured for.
# Usage: checkRestart <SERVICE>   (suffix of a MONITOR_<SERVICE> variable)
# Returns: NUMBER_OF_RESTARTS when MONITOR_<SERVICE> is LEVEL2 or higher,
#          0 otherwise (the service is not configured for restart).
checkRestart() {
    service=$1
    # Indirect lookup of MONITOR_<service>.
    eval service_level=\$MONITOR_$service
    debugmsg2 "$service level is $service_level"
    if [ $service_level -ge $LEVEL2 ]; then
        return $NUMBER_OF_RESTARTS
    fi
    return 0
}
| 92 | |
---|
# Translate the result of checkStatus for a service into this file's codes.
# Usage: getStatus <service>
# Returns: SERVICE_RUNNING when checkStatus succeeds (status 0),
#          SERVICE_NOT_RUNNING for any other status.
getStatus() {
    service=$1
    checkStatus $service
    status=$?
    if [ $status -ne 0 ]; then
        debugmsg "$service is not running (status $status)"
        return $SERVICE_NOT_RUNNING
    fi
    debugmsg2 "$service is running"
    return $SERVICE_RUNNING
}
| 107 | |
---|
# Fail this node over: stop NFS, stop GPFS, then leave the script.
# Does not return to the caller.
invokeFailover() {
    debugmsg "Invoking failover..."
    # Creates /tmp/ha-nfs-reboot before stopping NFS.
    # NOTE(review): presumably a marker read elsewhere - confirm its consumer.
    debuglog2 touch /tmp/ha-nfs-reboot 2> /dev/null
    # NFS is stopped gracefully so that clients do not get ESTALE.
    stop.nfs
    # Stopping the GPFS daemon on this node is what triggers the failover.
    debugmsg "Stopping GPFS..."
    /etc/init.d/gpfs stop
    exit
}
| 118 | |
---|
# Fail this node over and reboot it: stop NFS, stop GPFS, reboot, then leave
# the script. Does not return to the caller.
invokeFailoverReboot() {
    debugmsg "Invoking failover with reboot..."
    # NFS is stopped gracefully so that clients do not get ESTALE.
    stop.nfs
    # Stopping the GPFS daemon on this node is what triggers the failover.
    debugmsg "Stopping GPFS before reboot..."
    /etc/init.d/gpfs stop
    reboot
    exit
}
| 129 | |
---|
| 130 | ################# |
---|
| 131 | # Monitoring nfsd |
---|
| 132 | ################# |
---|
| 133 | |
---|
# Sample the NFS v2/v3 per-procedure counters in /proc/net/rpc/nfsd twice,
# NFS_RPC_ACT_SAMPLE_INTERVAL seconds apart, and compare the samples to detect
# nfsd RPC activity.
# Returns: SERVICE_RUNNING  if any counter increased between the two samples
#          SERVICE_UNKNOWN  if the proc file is missing or no increase was
#                           seen (no conclusion can be drawn)
detectNfsdActivity() {
    # Field names of the proc2/proc3 lines; the first two fields (line name
    # and counter count) are not procedure counters.
    set -A v2procs NAME COUNT NULL GETATTR SETATTR ROOT LOOKUP READLINK \
        READ WRCACHE WRITE CREATE REMOVE RENAME \
        LINK SYMLINK MKDIR RMDIR READDIR FSSTAT
    set -A v3procs NAME COUNT NULL GETATTR SETATTR LOOKUP ACCESS READLINK \
        READ WRITE CREATE MKDIR SYMLINK MKNOD REMOVE RMDIR RENAME \
        LINK READDIR READDIRPLUS FSSTAT FSINFO PATHCONF COMMIT

    procfile=/proc/net/rpc/nfsd
    if [ ! -f $procfile ]; then
        msg "Monitoring could not find /proc/net/rpc/nfsd"
        return $SERVICE_UNKNOWN
    fi

    # Take two samples of the v2/v3 counters, one interval apart.
    set -A v2procs1 $(grep -w proc2 $procfile)
    set -A v3procs1 $(grep -w proc3 $procfile)
    debugmsg2 "Sleeping for $NFS_RPC_ACT_SAMPLE_INTERVAL"
    sleep $NFS_RPC_ACT_SAMPLE_INTERVAL
    set -A v2procs2 $(grep -w proc2 $procfile)
    set -A v3procs2 $(grep -w proc3 $procfile)

    # Compare the v2 samples. The loop bound is clamped to the number of
    # fields actually sampled, so a missing or short proc2 line is skipped
    # instead of feeding empty operands to the arithmetic below.
    p=2 # skipping name and count fields
    n=${#v2procs[@]}
    [ ${#v2procs1[@]} -lt $n ] && n=${#v2procs1[@]}
    [ ${#v2procs2[@]} -lt $n ] && n=${#v2procs2[@]}
    while [ $p -lt $n ]; do
        activity=$((${v2procs2[$p]}-${v2procs1[$p]}))
        if [ $activity -gt 0 ]; then
            debugmsg2 "nfsd activity detected: $activity ${v2procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
            return $SERVICE_RUNNING
        fi
        p=$((p+1))
    done

    # Same comparison for the v3 samples.
    p=2 # skipping name and count fields
    n=${#v3procs[@]}
    [ ${#v3procs1[@]} -lt $n ] && n=${#v3procs1[@]}
    [ ${#v3procs2[@]} -lt $n ] && n=${#v3procs2[@]}
    while [ $p -lt $n ]; do
        activity=$((${v3procs2[$p]}-${v3procs1[$p]}))
        if [ $activity -gt 0 ]; then
            debugmsg2 "nfsd activity detected: $activity ${v3procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
            return $SERVICE_RUNNING
        fi
        p=$((p+1))
    done

    # no activity detected
    debugmsg2 "Could not detect nfsd activity using /proc"
    return $SERVICE_UNKNOWN
}
| 185 | |
---|
# Start nfsd, retrying up to the number of times checkRestart reports for
# NFSD (0 when MONITOR_NFSD is below LEVEL2, in which case no start is tried).
# After each attempt the function waits RESTART_TIMEOUT seconds and checks the
# nfsd status; on success it returns. A failed or hung attempt has its init
# script processes killed before the next try.
# When every attempt has failed (or none was allowed), a failover is invoked
# if MONITOR_NFSD is LEVEL3; otherwise only an alert is raised.
# Note that portmap has to be running in order to restart nfsd.
# The /etc/init.d/nfs (RH) | nfsserver (SUSE) utility is used because it stops
# the service before restarting it, which makes nfsd re-register with portmap
# (for example when nfsd is restarted after portmap was restarted). The
# utility also reloads /etc/exports.
startNfsd() {
    checkRestart NFSD
    numberOfRestarts=$?
    attemptNo=0
    debugmsg2 "About to start nfsd (up to $numberOfRestarts times)"
    while [ $numberOfRestarts -gt 0 ]; do
        attemptNo=$((attemptNo+1))
        debugmsg2 "In startnfsd, attempt number = $attemptNo"
        nfsService start
        # give the restart/start chance to complete
        sleep $RESTART_TIMEOUT
        getStatus nfsd
        status=$?
        debugmsg2 "The nfsd status after $attemptNo attempts to start is $status"
        [[ $status == $SERVICE_RUNNING ]] && return
        # start attempt has failed/hangs -> kill the process and retry
        debugmsg "start nfsd failed/hangs, about to kill the start process."
        nfsToolKill
        # Count down the loop variable itself so the retries are bounded.
        numberOfRestarts=$((numberOfRestarts-1))
    done

    # All start attempts have failed: act on the failover configuration.
    if [[ $MONITOR_NFSD -eq $LEVEL3 ]]; then
        failovermsg nfsd
        invokeFailover
    else
        alertmsg nfsd
    fi
}
| 226 | |
---|
# NULL RPC test: ask rpcinfo to call NFS version 3 on this host over UDP.
# Returns: SERVICE_RUNNING when rpcinfo succeeds, SERVICE_NOT_RUNNING otherwise.
nfsdNullRpcTest() {
    hostname=$(hostname)
    if debuglog2 rpcinfo -u $hostname nfs 3; then
        debugmsg2 "Sent nullrpc to nfsd v3 using rpcinfo, the service is up and running"
        return $SERVICE_RUNNING
    fi
    msg "Sent nullrpc to nfsd v3 using rpcinfo, the service is not available (it maybe because portmap was started after nfsd, or that rpcinfo is not installed)"
    return $SERVICE_NOT_RUNNING
}
| 241 | |
---|
# Run the nfsd tests in order and act on the outcome:
#   1. process check (getStatus)      - not running -> startNfsd
#   2. RPC activity (detectNfsdActivity) - activity seen -> done
#   3. null RPC (nfsdNullRpcTest)     - fails -> startNfsd, passes -> done
monitorNfsd() {
    ifGPFSDownExit $GPFS_IP

    # Check that the nfsd process is running
    getStatus nfsd
    runStatus=$?

    if [ $runStatus -eq $SERVICE_NOT_RUNNING ]; then
        # If the start succeeds the process is assumed to be healthy; the
        # deeper tests (rpc activity, null rpc) run on the next pass.
        startNfsd
        return
    fi
    [ $runStatus -eq $SERVICE_RUNNING ] || return 0

    debugmsg2 "Perform more tests, to make sure that nfsd is functioning"
    detectNfsdActivity
    case $? in
        $SERVICE_RUNNING)
            return ;;
        $SERVICE_UNKNOWN)
            # No conclusion from /proc; fall through to the null RPC test.
            ;;
        *)
            return 0 ;;
    esac

    nfsdNullRpcTest
    case $? in
        $SERVICE_RUNNING)
            return ;;
        $SERVICE_NOT_RUNNING)
            startNfsd ;;
    esac
}
| 277 | |
---|
| 278 | #################### |
---|
| 279 | # Monitoring mountd |
---|
| 280 | #################### |
---|
# Restart mountd, retrying up to the number of times checkRestart reports for
# MOUNTD (0 when MONITOR_MOUNTD is below LEVEL2, so no restart is tried).
# Returns as soon as mountd is seen running after an attempt. When all
# attempts fail, a failover is invoked if MONITOR_MOUNTD is LEVEL3; otherwise
# only an alert is raised.
restartMountd() {
    checkRestart MOUNTD
    numberOfRestarts=$?
    attemptNo=0
    debugmsg2 "About to restart mountd (up to $numberOfRestarts times as configured, note: 0 means that the configuration level is lower than LEVEL2)"
    while [ $numberOfRestarts -gt 0 ]; do
        attemptNo=$((attemptNo+1))
        startMountd
        # PID of the most recent background job.
        # NOTE(review): assumes startMountd launched one - confirm in nfsfuncs.
        restartPID=$!
        debugmsg2 "restartPID=$restartPID"
        # give the restart a chance to complete
        sleep $RESTART_TIMEOUT
        # check the status of mountd after the restart
        debugmsg2 "Try to restart mountd (attempt $attemptNo), checking the status:"
        getStatus /usr/sbin/rpc.mountd
        if [ $? -eq $SERVICE_RUNNING ]; then
            debugmsg2 "Succeeded to restart the mountd service at $attemptNo attempt"
            return
        fi
        # restart attempt has failed/hangs -> kill the process and retry
        debugmsg "Restarting mountd failed/hangs, about to kill the restart process."
        if [ ! -e /sbin/startproc ]; then
            nfsToolKill # FIX
        else
            debuglog kill -9 $restartPID
        fi
        numberOfRestarts=$((numberOfRestarts-1))
    done

    # Reaching this point means every restart attempt has failed.
    msg "Failed to restart the mountd (tried $attemptNo times as configured)"
    if [[ $MONITOR_MOUNTD != $LEVEL3 ]]; then
        alertmsg mountd
    else
        failovermsg mountd
        invokeFailover
    fi
}
| 320 | |
---|
| 321 | |
---|
# Entry point for mountd monitoring: check that /usr/sbin/rpc.mountd is
# running and hand over to restartMountd when it is not.
monitorMountd() {
    ifGPFSDownExit $GPFS_IP
    getStatus /usr/sbin/rpc.mountd
    case $? in
        $SERVICE_NOT_RUNNING)
            restartMountd
            ;;
        $SERVICE_RUNNING)
            return
            ;;
    esac
}
| 336 | |
---|
| 337 | ######################## |
---|
| 338 | # Monitoring the network |
---|
| 339 | ######################## |
---|
| 340 | |
---|
# Monitor the network.
# Tests performed:
#   TEST1 - every interface used for NFS serving appears in the mmgetifconf
#           output and passes checkLinkStatus
#   TEST2 - every NFS IP address of this node, and of the nodes returned by
#           getFailedNodes, is configured; missing ones are brought up
#   TEST3 - the gateway answers a ping
# More tests can be added here later.
monitorNetwork() {
    ifGPFSDownExit $GPFS_IP

    # TEST1: make sure that all interfaces that are used for nfs serving are connected
    nfsIfs=$(getNfsIFs $GPFS_IP)
    if [[ -z $nfsIfs ]]; then
        msg "No configured NFS IP addresses detected on any of the node's interfaces"
        nwFailoverCondition "no configured nfs interfaces"
    else
        for eth in $nfsIfs; do
            tmp=$(mmgetifconf | grep -w $eth | awk '{print $1}')
            if [[ -z $tmp ]]; then
                nwFailoverCondition "interface is down"
            fi

            checkLinkStatus $eth
            if [ $? -eq 0 ]; then
                continue
            else
                nwFailoverCondition "link is not connected"
            fi
        done
    fi

    # TEST2: check that all NFS IP addresses are enabled.
    # The address is matched as a fixed, whole-word string: as a plain regex
    # 10.0.0.1 would also match 10.0.0.10 and hide a missing address.
    nfsIPs=$(getNfsIPs $GPFS_IP)
    for ip in $nfsIPs; do
        mmgetifconf | grep -qwF "$ip"
        [ $? -eq 0 ] && continue
        debugmsg "monitor detected $ip is down, restarting"
        ifUp $ip
    done

    # Now check that all NFS IP addresses for failover nodes are enabled
    nfsIPs=
    for ip in $(getFailedNodes $GPFS_IP); do
        nfsIPs="$nfsIPs $(getNfsIPs $ip)"
    done
    for ip in $nfsIPs; do
        mmgetifconf | grep -qwF "$ip"
        [ $? -eq 0 ] && continue
        [ "$(IPaddr $ip monitor)" == "OK" ] && continue
        debugmsg "monitor detected $ip is down, restarting"
        debuglog IPaddr $ip start
    done

    # TEST3: ping the gateway
    pingDefaultGateway
}
| 392 | |
---|
# Print, one per line, the base interface of every NFS IP address configured
# for the given GPFS IP address (e.g. eth0 for an address living on eth0:1).
# Usage: getNfsIFs <gpfs ip>
# Addresses that are not assigned to any interface are skipped.
getNfsIFs () {
    thisGpfsIP=$1
    eth=""
    # NFS IP addresses configured for this GPFS IP address.
    nfsIPList=$(getNfsIPs $thisGpfsIP)
    # No entry for this node in nfs.nodes.
    if [[ -z $nfsIPList ]]; then
        debugmsg "ALERT: No entry was found in nfs.nodes for this node (gpfs ip address:$thisGpfsIP)"
        nwFailoverCondition "no configured nfs interfaces"
    fi
    debugmsg2 "The list of ips is $nfsIPList"
    for nfsIP in $nfsIPList; do
        # Strip any alias suffix to get the "original" interface.
        origEth=$(getEthInterface $nfsIP | awk -F: '{print $1}')
        if [[ -n $origEth ]]; then
            echo $origEth
            debugmsg2 "the actual interface for $nfsIP is $origEth"
        else
            debugmsg "ALERT: The nfs ip address $nfsIP is not assigned an interface"
        fi
    done
}
| 416 | |
---|
# Print the interface that carries a given IP address.
# Usage: getEthInterface <ip address>
# mmgetifconf is read as one "<interface> <ip> <mask>" line per interface; the
# last line whose address equals the argument wins. An empty line is printed
# when no interface has the address.
# The output is read through a pipe, so no temporary file under /tmp and no
# extra file descriptor are left behind.
# NOTE: The original function was copied from Marc and was changed.
# May require future integration.
getEthInterface() {
    # ($2 "") forces a string comparison in awk.
    eth=$(mmgetifconf |
          awk -v addr="$1" '($2 "") == addr { iface = $1 } END { print iface }')
    # eth may be empty if there is no interface associated with this ip address
    echo $eth
}
| 437 | |
---|
# Locate the gateway in the routing table and ping it once.
# Nothing is done when no gateway route exists or when the gateway address
# belongs to this machine. On ping failure nwFailoverCondition is called,
# which alerts and invokes failover if configured.
pingDefaultGateway() {
    # Use the first route flagged UG only: with several gateway routes the
    # unquoted list would otherwise be handed to ping as multiple arguments.
    gwIP=$(route -n | awk '/UG/ && !found { print $2; found = 1 }')
    [[ -z $gwIP ]] && return

    # Make sure the local machine is not set as the default gateway.
    # mmgetifconf is read through a pipe, so the early return below leaves no
    # temporary file or open descriptor behind.
    if mmgetifconf |
       awk -v gw="$gwIP" '($2 "") == gw { found = 1 } END { exit !found }'; then
        return 0
    fi

    # try to ping the gateway
    ping -c 1 -w 5 $gwIP > /dev/null
    outPing=$?
    if [ $outPing -ne 0 ]; then
        msg "Failed to ping the gateway at $gwIP (err $outPing)"
        nwFailoverCondition "can't ping the gateway"
    else
        debugmsg2 "Succeeded to ping the gateway at $gwIP (ping returns $outPing)"
    fi
}
| 463 | |
---|
# React to a network problem according to MONITOR_NETWORK:
# LEVEL3 -> announce and invoke failover; any other level -> alert only.
# Usage: nwFailoverCondition <message>
nwFailoverCondition() {
    message=$1
    debugmsg2 nwFailoverCondition $message
    if [[ $MONITOR_NETWORK -ne $LEVEL3 ]]; then
        alertmsg network $message
    else
        failovermsg network $message
        invokeFailover
    fi
}
| 475 | |
---|
| 476 | |
---|
| 477 | ######################## |
---|
| 478 | # Monitoring portmap |
---|
| 479 | ######################## |
---|
# Check that /sbin/portmap is running. When it is not:
# MONITOR_PORTMAP at LEVEL1 -> alert only; any other level -> announce and
# invoke failover with reboot.
# TODO: we can test if this node is mounting anything, and if not we can
# restart portmap and re-register the nfs processes with it.
# Currently, if this machine is mounting anything, lockd does not re-register
# with portmap.
monitorPortmap() {
    getStatus /sbin/portmap
    case $? in
        $SERVICE_RUNNING)
            return
            ;;
        $SERVICE_NOT_RUNNING)
            if [[ $MONITOR_PORTMAP -ne $LEVEL1 ]]; then
                failovermsg portmap
                invokeFailoverReboot
            else
                alertmsg portmap
            fi
            ;;
    esac
}
| 500 | |
---|
| 501 | ###################################### |
---|
| 502 | # Monitoring locking (lockd and statd) |
---|
| 503 | ###################################### |
---|
| 504 | |
---|
# Entry point for lock monitoring (lockd and statd). Both services are
# governed by the MONITOR_STATD level.
#   lockd not running: LEVEL3 -> failover, otherwise alert only.
#   statd not running (checked only when /sbin/rpc.statd exists):
#       LEVEL2 or higher -> restartStatd, otherwise alert only.
monitorLocking() {
    ifGPFSDownExit $GPFS_IP

    # Monitor lockd.
    getStatus lockd
    case $? in
        $SERVICE_NOT_RUNNING)
            if [[ $MONITOR_STATD -ne $LEVEL3 ]]; then
                alertmsg lockd
            else
                failovermsg lockd
                invokeFailover
            fi
            ;;
        $SERVICE_RUNNING)
            ;;
    esac

    # Monitor statd
    if [ -f /sbin/rpc.statd ]; then
        getStatus /sbin/rpc.statd
        case $? in
            $SERVICE_NOT_RUNNING)
                if [[ $MONITOR_STATD -lt $LEVEL2 ]]; then
                    alertmsg statd
                else
                    restartStatd
                fi
                ;;
            $SERVICE_RUNNING)
                ;;
        esac
    fi
}
| 540 | |
---|
| 541 | |
---|
# Make a single attempt to restart statd, then check the result after
# RESTART_TIMEOUT seconds. Does nothing when /sbin/rpc.statd is not installed.
# If statd is still down: MONITOR_STATD at LEVEL3 -> failover, otherwise alert.
restartStatd() {
    [ ! -f /sbin/rpc.statd ] && return

    # Kill any existing statd first (important for registering with portmap).
    debuglog kill -9 $(/sbin/pidof -x /sbin/rpc.statd)
    debuglog /etc/init.d/nfslock start
    sleep $RESTART_TIMEOUT
    getStatus /sbin/rpc.statd
    case $? in
        $SERVICE_NOT_RUNNING)
            if [[ $MONITOR_STATD -ne $LEVEL3 ]]; then
                alertmsg statd
            else
                failovermsg statd
                invokeFailover
            fi
            ;;
        $SERVICE_RUNNING)
            ;;
    esac
}
| 564 | |
---|
| 565 | ###################################### |
---|
| 566 | # Monitoring rsh/ssh daemon |
---|
| 567 | ###################################### |
---|
| 568 | |
---|
# Restart the remote-shell service named by rshService, retrying up to the
# number of times checkRestart reports for SSHD (0 when MONITOR_SSHD is below
# LEVEL2, so no restart is tried).
# Each attempt runs the init script in the background, with its output saved
# in /tmp/<service>_restart.out, and waits attemptNo * RESTART_TIMEOUT seconds
# before checking the status. A failed or hung attempt is killed before the
# next try.
# When all attempts fail, a failover is invoked if MONITOR_SSHD is LEVEL3;
# otherwise only an alert is raised.
startSshd() {
    checkRestart SSHD
    numberOfRestarts=$?
    service=$(rshService)
    attemptNo=0
    debugmsg2 "About to restart sshd (up to $numberOfRestarts times as configured,
note: 0 means that the configuration level is lower than LEVEL2)"
    while [[ $numberOfRestarts -gt 0 ]]; do
        attemptNo=$((attemptNo+1))
        # Braces are required: "$service_restart" would expand a different,
        # undefined variable and send the output to /tmp/.out.
        /etc/init.d/$service restart > /tmp/${service}_restart.out 2>&1 &
        restartPID=$!
        sleep $((attemptNo*RESTART_TIMEOUT))
        debugmsg2 "Try to restart $service (attempt $attemptNo), checking the status:"
        getStatus $service
        restartStatus=$?
        debugmsg2 "The $service status after the $attemptNo attempt of restart is $restartStatus"
        if [[ $restartStatus == $SERVICE_RUNNING ]]; then
            return
        fi
        debugmsg "Restarting $service failed/hangs, about to kill the restart process.
The output of restart attempt is in /tmp/${service}_restart.out"
        kill -9 $restartPID 2>&1
        # Count down the loop variable itself so the retries are bounded.
        numberOfRestarts=$((numberOfRestarts-1))
    done
    # Failed to restart service, check for failover configuration parameters
    msg "Failed to restart the $service process (tried $attemptNo times as configured)"
    if [[ $MONITOR_SSHD == $LEVEL3 ]]; then
        failovermsg $service
        invokeFailover
    else
        alertmsg $service
    fi
}
| 602 | |
---|
# Entry point for rsh/ssh daemon monitoring: check the service named by
# rshService and hand over to startSshd when it is not running.
monitorSshd() {
    ifGPFSDownExit $GPFS_IP
    service=$(rshService)
    getStatus $service
    case $? in
        $SERVICE_NOT_RUNNING)
            startSshd
            ;;
        $SERVICE_RUNNING)
            return
            ;;
    esac
}
| 616 | |
---|
| 617 | ###################################### |
---|
| 618 | # Monitoring gpfs daemon |
---|
| 619 | ###################################### |
---|
# Check GPFS through ifGPFSDownExit; if that helper returns a non-zero
# status, raise a failover alert for GPFS and leave the script.
monitorGPFS() {
    ifGPFSDownExit $GPFS_IP || {
        failovermsg GPFS
        exit
    }
}
| 627 | |
---|
| 628 | ###################################### |
---|
| 629 | # Main |
---|
| 630 | ###################################### |
---|
# Main monitoring loop. Every MONITOR_INTERVAL seconds it runs, in order, the
# GPFS, network, portmap, nfsd, mountd, locking and sshd monitors, each one
# only when its MONITOR_<service> level enables it. The loop has no exit
# condition of its own; it ends when the process is killed or when a monitor
# step exits the script (for example on failover).
nfsMonitor() {
    GPFS_IP=$(myGPFSIP)

    while :; do
        sleep $MONITOR_INTERVAL

        # --- GPFS ---
        # NOTE(review): this gate compares against LEVEL3 while every other
        # service compares against LEVEL0, and this file sets no default for
        # MONITOR_GPFS - confirm whether this step is deliberately disabled.
        if [[ $MONITOR_GPFS -gt $LEVEL3 ]]; then
            debugmsg2 "==========GPFS monitoring==============="
            monitorGPFS
            debugmsg2 "done monitoring GPFS"
        fi

        # --- Network ---
        if [[ $MONITOR_NETWORK -gt $LEVEL0 ]]; then
            debugmsg2 "==========NW monitoring==============="
            monitorNetwork
            debugmsg2 "done monitoring the network"
        fi

        # --- portmap ---
        # All rpc services must be registered with portmap for new clients to
        # reach them. portmap is tested once per pass; if it fails afterwards
        # the services may be unavailable to new clients even though they are
        # running, until portmap is restarted and the rpc processes
        # re-register with it.
        if [[ $MONITOR_PORTMAP -gt $LEVEL0 ]]; then
            debugmsg2 "===========portmap monitoring=============="
            monitorPortmap
            debugmsg2 "done monitoring portmap"
        fi

        # --- nfsd ---
        # Monitored only when configured. Tests:
        #   (test 1) the nfsd process is running
        #   (test 2) rpc-nfs activity is observed
        #   (test 3) nfsd answers a null rpc
        # Order: run test 1; if the process is not running go to Action.
        # Otherwise run test 2; if there is activity, Done. With no activity
        # run test 3: failure -> Action, success -> Done.
        # Action: if nfsd is configured to be restarted, restart it. If all
        #   restart attempts fail, the node is declared "dead" for nfs
        #   serving; if nfsd is configured as "failover" the node is failed
        #   over to another node and a user level alert is invoked.
        # Done: nfsd is up and running, continue.
        if [[ $MONITOR_NFSD -gt $LEVEL0 ]]; then
            debugmsg2 "==========nfsd monitoring==============="
            monitorNfsd
            debugmsg2 "done monitoring nfsd"
        fi

        # --- mountd ---
        # Monitored only when configured. Single test: the mountd process is
        # running. If it is not, go to Action; otherwise Done.
        # Action: if mountd is configured to be restarted, restart it. If all
        #   restart attempts fail, the node is declared "dead" for nfs
        #   serving; if mountd is configured as "failover" the node is failed
        #   over to another node and an alert is sent.
        if [[ $MONITOR_MOUNTD -gt $LEVEL0 ]]; then
            debugmsg2 "============mountd monitoring============="
            monitorMountd
            debugmsg2 "done monitoring mountd"
        fi

        # --- locking (lockd and statd) ---
        # Monitored only when configured. Tests:
        #   (test 1) the lockd process is running
        #   (test 2) the statd process is running
        # If lockd is not running the node is declared "dead" for nfs
        # serving; if locking is configured as "failover" the node is failed
        # over to another node and an alert is sent.
        # If statd is not running it is restarted when configured for
        # restart. If the restart fails the node is declared "dead" for nfs
        # serving and, if configured, failed over with an alert.
        if [[ $MONITOR_STATD -gt $LEVEL0 ]]; then
            debugmsg2 "==========statd monitoring==============="
            monitorLocking
            debugmsg2 "done monitoring statd"
        fi

        # --- sshd ---
        # Monitored only when configured. Single test: the sshd process is
        # running. If it is not, go to Action; otherwise Done.
        # Action: if sshd is configured to be restarted, restart it. If all
        #   restart attempts fail, the node is declared "dead" for ssh
        #   serving; if sshd is configured as "failover" the node is failed
        #   over to another node and an alert is sent.
        if [[ $MONITOR_SSHD -gt $LEVEL0 ]]; then
            debugmsg2 "============sshd monitoring============="
            monitorSshd
            debugmsg2 "done monitoring sshd"
        fi

    done
}
| 752 | |
---|
# Stop monitoring: SIGKILL every other process found by pidof for
# ./nfsmonitor (this shell and its parent are excluded).
stopNfsMonitor() {
    nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
    if [[ -z $nfsMonToolPid ]]; then
        debugmsg2 "Warning: Couldn't find the monitoring process to stop"
        return
    fi
    kill -9 $nfsMonToolPid
    msg "Monitoring has stopped."
}
| 762 | |
---|
# Start monitoring: run the nfsMonitor loop in the background and log that
# monitoring has begun.
startNfsMonitor() {
    nfsMonitor &
    msg "Monitoring has started."
}
| 767 | |
---|
# Restart monitoring: stop any running monitor, then start a new one.
restartNfsMonitor() {
    stopNfsMonitor
    startNfsMonitor
}
| 772 | |
---|
# Report through debugmsg whether another ./nfsmonitor process is found by
# pidof (this shell and its parent are excluded from the search).
statusNfsMonitor() {
    nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
    if [[ -n $nfsMonToolPid ]]; then
        debugmsg "nfsmonitor is running"
    else
        debugmsg "nfsmonitor is not running"
        return
    fi
}
| 782 | |
---|
| 783 | ################################# |
---|
| 784 | # Main program |
---|
| 785 | ################################# |
---|
# Dispatch on the first command-line argument. Each action has a short flag
# and a word form; anything else prints the usage text and exits with 1.
case "$1" in
    -s|start)
        startNfsMonitor
        ;;
    -e|stop)
        stopNfsMonitor
        ;;
    -r|restart)
        restartNfsMonitor
        ;;
    -q|status)
        statusNfsMonitor
        ;;
    *)
        # The accepted word is "stop" (see above), so the usage text says so.
        echo $"Usage: $0 [-s|start|-e|stop|-r|restart|-q|status]"
        exit 1
        ;;
esac