1 | #!/bin/ksh |
---|
2 | # @(#)23 1.12.1.10 src/avs/fs/mmfs/samples/nfscluster/nfsmonitor, mmfs, avs_rgpfs24, rgpfs24s011a 3/20/07 16:46:39 |
---|
3 | # |
---|
4 | |
---|
5 | # HA-NFS monitoring |
---|
6 | # Usage: nfsmonitor [start|stop|restart|status] |
---|
7 | |
---|
# Monitoring levels a service can be configured with
LEVEL0=0    # NO_MONITORING
LEVEL1=1    # ALERT_ONLY
LEVEL2=2    # RESTART (not applicable for all services)
LEVEL3=3    # FAILOVER

# Tunables: a value that is already set (even to the empty string) is kept,
# otherwise the default given here is assigned.
: ${MONITOR_INTERVAL=15}
: ${MONITOR_NETWORK=3}
: ${MONITOR_PORTMAP=3}
: ${MONITOR_NFSD=3}
: ${MONITOR_MOUNTD=3}
: ${MONITOR_STATD=3}
: ${MONITOR_SSHD=3}

# Timing parameters, in seconds
NFS_RPC_ACT_SAMPLE_INTERVAL=3
RESTART_TIMEOUT=7

# Maximum number of restart attempts for a restartable service
NUMBER_OF_RESTARTS=3

# Status codes returned by the functions in this file (internal use only)
SERVICE_RUNNING=1
SERVICE_NOT_RUNNING=2
SERVICE_UNKNOWN=3

# Load the shared NFS helper functions. When they are missing the script
# prints a diagnostic and stops (with exit status 0).
if [ -f /var/mmfs/etc/nfsfuncs ]; then
  . /var/mmfs/etc/nfsfuncs
else
  echo "$0: Can't find NFS functions in /var/mmfs/etc"
  exit 0
fi
---|
39 | |
---|
# Report that a monitored service is inactive: log the message through msg
# and, if the site provides /var/mmfs/etc/alert, pass the message to it.
# Usage: alert <service> <action message> [comment words...]
#   service         name of the inactive service
#   action message  text describing what the monitor does about it
#   comment words   optional extra detail, appended in parentheses
# The alert script is invoked as: /var/mmfs/etc/alert <message> <GPFS_IP>
alert() {
  service=$1
  actionmsg=$2
  shift 2
  # Variables are global here, so clear the comment on every call; otherwise
  # the comment of a previous alert would be repeated in this one.
  comment=
  [ -n "$*" ] && comment="($*)"
  # Use the action text that was actually passed in ($actionmsg).
  message="Monitoring detected $service is inactive, $actionmsg${comment:+ $comment}"
  msg "$message"
  [ ! -e /var/mmfs/etc/alert ] && return
  /var/mmfs/etc/alert "$message" "$GPFS_IP"
}
---|
52 | |
---|
# Raise an alert for a service that is configured for alert-only handling.
# Usage: alertmsg <service> [comment words...]
alertmsg() {
  service=$1
  shift
  # Quote the arguments so comment text is passed through unchanged
  # (no word splitting or pathname expansion).
  alert "$service" "no action taken as configured" "$@"
}
---|
58 | |
---|
# Raise an alert announcing that a node failure (failover) is being initiated
# because of the given service.
# Usage: failovermsg <service> [comment words...]
failovermsg() {
  service=$1
  shift
  # Quote the arguments so comment text is passed through unchanged
  # (no word splitting or pathname expansion).
  alert "$service" "node failure initiated as configured" "$@"
}
---|
64 | |
---|
# Forcefully terminate every process running the NFS init script
# (/etc/init.d/nfsserver when that file exists, /etc/init.d/nfs otherwise).
# Used to clean up after a start/restart attempt that appears to hang.
# The current shell and its parent are excluded from the search.
nfsToolKill() {
  NFSTOOL=/etc/init.d/nfs
  if [ -f /etc/init.d/nfsserver ]; then
    NFSTOOL=/etc/init.d/nfsserver
  fi
  nfsToolBase=${NFSTOOL##*/}
  # Look the script up by full path first, then by its base name.
  nfsToolPid=$(pidof -o $$ -o $PPID -o %PPID -x $NFSTOOL ||
               pidof -o $$ -o $PPID -o %PPID -x $nfsToolBase)
  debugmsg2 "The pids of $NFSTOOL are $nfsToolPid"
  if [[ -z $nfsToolPid ]]; then
    return
  fi
  kill -9 $nfsToolPid
}
---|
82 | |
---|
# Decide whether a service may be restarted, based on its MONITOR_<service>
# level.
# Usage: checkRestart <SERVICE>   (e.g. NFSD, MOUNTD, SSHD)
# Returns: NUMBER_OF_RESTARTS when the level is LEVEL2 or higher, else 0.
checkRestart() {
  service=$1
  # Indirect lookup of the variable MONITOR_<service>
  eval "service_level=\$MONITOR_$service"
  debugmsg2 "$service level is $service_level"
  if [ $service_level -ge $LEVEL2 ]; then
    return $NUMBER_OF_RESTARTS
  fi
  return 0
}
---|
92 | |
---|
# Translate the result of checkStatus for a service into this file's codes.
# Usage: getStatus <service>
# Returns: SERVICE_RUNNING when checkStatus succeeds (status 0),
#          SERVICE_NOT_RUNNING otherwise.
getStatus() {
  service=$1
  checkStatus $service
  status=$?
  if [ $status -ne 0 ]; then
    debugmsg "$service is not running (status $status)"
    return $SERVICE_NOT_RUNNING
  fi
  debugmsg2 "$service is running"
  return $SERVICE_RUNNING
}
---|
107 | |
---|
# Fail this node over: stop NFS, stop GPFS, then terminate the monitor.
# Does not return to the caller.
invokeFailover() {
  debugmsg "Invoking failover..."
  # Create the /tmp/ha-nfs-reboot marker file (errors are discarded)
  debuglog2 touch /tmp/ha-nfs-reboot 2> /dev/null
  # NFS is stopped gracefully first so clients do not get ESTALE
  stop.nfs
  # Stopping the GPFS daemon on this node is what triggers the failover
  debugmsg "Stopping GPFS..."
  /etc/init.d/gpfs stop
  exit $?
}
---|
118 | |
---|
# Fail this node over and reboot it: stop NFS, stop GPFS, reboot, then
# terminate the monitor. Does not return to the caller.
invokeFailoverReboot() {
  debugmsg "Invoking failover with reboot..."
  # NFS is stopped gracefully first so clients do not get ESTALE
  stop.nfs
  # Stopping the GPFS daemon on this node is what triggers the failover
  debugmsg "Stopping GPFS before reboot..."
  /etc/init.d/gpfs stop
  reboot
  exit $?
}
---|
129 | |
---|
130 | ################# |
---|
131 | # Monitoring nfsd |
---|
132 | ################# |
---|
133 | |
---|
# Detect nfsd RPC activity by sampling /proc/net/rpc/nfsd twice,
# NFS_RPC_ACT_SAMPLE_INTERVAL seconds apart, and comparing the per-operation
# counters on the proc2 and proc3 lines.
# Returns: SERVICE_RUNNING  if any counter increased between the samples
#          SERVICE_UNKNOWN  if the proc file is missing or no counter
#                           increased (no conclusion can be drawn)
detectNfsdActivity() {
  # Field labels for the proc2/proc3 lines. The first two fields (line name
  # and counter count) are not operation counters and are skipped below.
  set -A v2procs NAME COUNT NULL GETATTR SETATTR ROOT LOOKUP READLINK \
                 READ WRCACHE WRITE CREATE REMOVE RENAME \
                 LINK SYMLINK MKDIR RMDIR READDIR FSSTAT
  set -A v3procs NAME COUNT NULL GETATTR SETATTR LOOKUP ACCESS READLINK \
                 READ WRITE CREATE MKDIR SYMLINK MKNOD REMOVE RMDIR RENAME \
                 LINK READDIR READDIRPLUS FSSTAT FSINFO PATHCONF COMMIT

  procfile=/proc/net/rpc/nfsd
  if [ ! -f $procfile ]; then
    msg "Monitoring could not find /proc/net/rpc/nfsd"
    return $SERVICE_UNKNOWN
  fi

  # Take two samples of the v2/v3 counters, one interval apart
  set -A v2procs1 $(grep -w proc2 $procfile)
  set -A v3procs1 $(grep -w proc3 $procfile)
  debugmsg2 "Sleeping for $NFS_RPC_ACT_SAMPLE_INTERVAL"
  sleep $NFS_RPC_ACT_SAMPLE_INTERVAL
  set -A v2procs2 $(grep -w proc2 $procfile)
  set -A v3procs2 $(grep -w proc3 $procfile)

  # Compare the samples. A counter that is absent (no proc2/proc3 line, or a
  # shorter line than expected) is treated as 0 so the subtraction is always
  # well-formed.
  p=2   # skipping name and count fields
  n=${#v2procs[@]}
  while [ $p -lt $n ]; do
    activity=$(( ${v2procs2[$p]:-0} - ${v2procs1[$p]:-0} ))
    if [ $activity -gt 0 ]; then
      debugmsg2 "nfsd activity detected: $activity ${v2procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
      return $SERVICE_RUNNING
    fi
    p=$((p+1))
  done

  p=2   # skipping name and count fields
  n=${#v3procs[@]}
  while [ $p -lt $n ]; do
    activity=$(( ${v3procs2[$p]:-0} - ${v3procs1[$p]:-0} ))
    if [ $activity -gt 0 ]; then
      debugmsg2 "nfsd activity detected: $activity ${v3procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
      return $SERVICE_RUNNING
    fi
    p=$((p+1))
  done

  # no activity detected
  debugmsg2 "Could not detect nfsd activity using /proc"
  return $SERVICE_UNKNOWN
}
---|
185 | |
---|
# Try to start nfsd, up to the number of attempts allowed by checkRestart
# (0 attempts when MONITOR_NFSD is below LEVEL2).
# After each attempt the function waits RESTART_TIMEOUT seconds and checks
# the nfsd status; on success it returns immediately. After a failed attempt
# any lingering NFS init-script process is killed before retrying.
# When every attempt has failed (or none was allowed): if MONITOR_NFSD is
# LEVEL3 a failover is initiated, otherwise only an alert is raised.
# Notes carried over from the original author:
#  - portmap has to be running in order to restart nfsd.
#  - the /etc/init.d/nfs (RH) | nfsserver (SUSE) utility is used because it
#    stops the service before restarting it, which makes sure nfsd
#    re-registers with portmap (e.g. when nfsd is restarted after portmap),
#    and it also reloads /etc/exports.
startNfsd() {
  checkRestart NFSD
  numberOfRestarts=$?
  attemptNo=0
  debugmsg2 "About to start nfsd (up to $numberOfRestarts times)"
  while [ $numberOfRestarts -gt 0 ]; do
    attemptNo=$((attemptNo+1))
    debugmsg2 "In startnfsd, attempt number = $attemptNo"
    nfsService start
    # give the restart/start chance to complete
    sleep $RESTART_TIMEOUT
    getStatus nfsd
    status=$?
    debugmsg2 "The nfsd status after $attemptNo attempts to start is $status"
    [[ $status == $SERVICE_RUNNING ]] && return
    # start attempt has failed/hangs -> kill the process and retry
    debugmsg "start nfsd failed/hangs, about to kill the start process."
    nfsToolKill
    # Count this attempt against the loop variable; without this the loop
    # never ends and the failover/alert below is never reached.
    numberOfRestarts=$((numberOfRestarts-1))
  done

  # All start attempts have failed: act on the configured level
  if [[ $MONITOR_NFSD -eq $LEVEL3 ]]; then
    failovermsg nfsd
    invokeFailover
  else
    alertmsg nfsd
  fi
}
---|
226 | |
---|
# Probe nfsd by sending a null RPC to NFS v3 over UDP on this host, using
# rpcinfo (run through debuglog2).
# Returns: SERVICE_RUNNING when the probe succeeds,
#          SERVICE_NOT_RUNNING otherwise.
nfsdNullRpcTest() {
  hostname=$(hostname)
  if debuglog2 rpcinfo -u $hostname nfs 3; then
    debugmsg2 "Sent nullrpc to nfsd v3 using rpcinfo, the service is up and running"
    return $SERVICE_RUNNING
  fi
  msg "Sent nullrpc to nfsd v3 using rpcinfo, the service is not available (it maybe because portmap was started after nfsd, or that rpcinfo is not installed)"
  return $SERVICE_NOT_RUNNING
}
---|
241 | |
---|
# Run the nfsd checks and act according to the monitoring level.
#  1. Process check: if nfsd is not running, try to start it and stop here
#     (the deeper checks are left for the next monitoring round).
#  2. RPC activity check: if activity is seen, nfsd is healthy.
#  3. Null RPC check (only when step 2 was inconclusive): if it fails,
#     try to start nfsd.
monitorNfsd() {
  ifGPFSDownExit $GPFS_IP

  # Step 1: is the nfsd process there at all?
  getStatus nfsd
  runStatus=$?

  if [[ $runStatus == $SERVICE_NOT_RUNNING ]]; then
    startNfsd
    return
  fi
  if [[ $runStatus != $SERVICE_RUNNING ]]; then
    return 0
  fi

  # Step 2: look for RPC activity
  debugmsg2 "Perform more tests, to make sure that nfsd is functioning"
  detectNfsdActivity
  if [[ $? != $SERVICE_UNKNOWN ]]; then
    return 0
  fi

  # Step 3: no conclusion from the activity check, send a null RPC
  nfsdNullRpcTest
  if [[ $? == $SERVICE_NOT_RUNNING ]]; then
    startNfsd
  fi
}
---|
277 | |
---|
278 | #################### |
---|
279 | # Monitoring mountd |
---|
280 | #################### |
---|
# Try to restart mountd, up to the number of attempts allowed by
# checkRestart (0 attempts when MONITOR_MOUNTD is below LEVEL2).
# Each attempt calls startMountd, waits RESTART_TIMEOUT seconds and checks
# /usr/sbin/rpc.mountd; on success the function returns. After a failed
# attempt the restart process is killed before retrying.
# When every attempt has failed: if MONITOR_MOUNTD is LEVEL3 a failover is
# initiated, otherwise only an alert is raised.
restartMountd() {
  checkRestart MOUNTD
  numberOfRestarts=$?
  attemptNo=0
  debugmsg2 "About to restart mountd (up to $numberOfRestarts times as configured, note: 0 means that the configuration level is lower than LEVEL2)"
  until [ $numberOfRestarts -le 0 ]; do
    attemptNo=$((attemptNo+1))
    startMountd
    # PID of the most recent background job
    # NOTE(review): presumably startMountd launches it; confirm in nfsfuncs
    restartPID=$!
    debugmsg2 "restartPID=$restartPID"
    # let the restart finish before looking at the result
    sleep $RESTART_TIMEOUT
    debugmsg2 "Try to restart mountd (attempt $attemptNo), checking the status:"
    getStatus /usr/sbin/rpc.mountd
    if [ $? -eq $SERVICE_RUNNING ]; then
      debugmsg2 "Succeeded to restart the mountd service at $attemptNo attempt"
      return
    fi
    # the attempt failed or hangs: kill the restart process, then retry
    debugmsg "Restarting mountd failed/hangs, about to kill the restart process."
    if [ ! -e /sbin/startproc ]; then
      nfsToolKill # FIX
    else
      debuglog kill -9 $restartPID
    fi
    numberOfRestarts=$((numberOfRestarts-1))
  done

  # Every restart attempt has failed: act on the configured level
  msg "Failed to restart the mountd (tried $attemptNo times as configured)"
  if [[ $MONITOR_MOUNTD == $LEVEL3 ]]; then
    failovermsg mountd
    invokeFailover
  else
    alertmsg mountd
  fi
}
---|
320 | |
---|
321 | |
---|
# Entry point for mountd monitoring: check /usr/sbin/rpc.mountd and, when it
# is not running, hand over to restartMountd (which applies the configured
# monitoring level).
monitorMountd() {
  ifGPFSDownExit $GPFS_IP
  getStatus /usr/sbin/rpc.mountd
  if [ $? -eq $SERVICE_NOT_RUNNING ]; then
    restartMountd
  fi
}
---|
336 | |
---|
337 | ######################## |
---|
338 | # Monitoring the network |
---|
339 | ######################## |
---|
340 | |
---|
# Monitor the network used for NFS serving. Tests performed:
#  TEST1: every interface carrying an NFS address is present in the
#         mmgetifconf output and its link is connected.
#  TEST2: every NFS IP address of this node, and of the failed nodes it has
#         taken over, is configured; missing addresses are brought up.
#  TEST3: the default gateway answers a ping.
# Failures in TEST1/TEST3 are handled by nwFailoverCondition (alert or
# failover depending on MONITOR_NETWORK).
monitorNetwork() {
  ifGPFSDownExit $GPFS_IP

  # TEST1: make sure that all interfaces used for nfs serving are connected
  nfsIfs=$(getNfsIFs $GPFS_IP)
  if [[ -z $nfsIfs ]]; then
    msg "No configured NFS IP addresses detected on any of the node's interfaces"
    nwFailoverCondition "no configured nfs interfaces"
  else
    for eth in $nfsIfs; do
      tmp=$(mmgetifconf | grep -w -- "$eth" | awk '{print $1}')
      if [[ -z $tmp ]]; then
        nwFailoverCondition "interface is down"
      fi

      checkLinkStatus $eth || nwFailoverCondition "link is not connected"
    done
  fi

  # TEST2: check that all NFS IP addresses are enabled.
  # The address is matched as a fixed string on word boundaries; a plain
  # regex match would let 10.0.0.1 be "found" in a line for 10.0.0.10 and
  # hide an address that is actually down.
  nfsIPs=$(getNfsIPs $GPFS_IP)
  for ip in $nfsIPs; do
    mmgetifconf | grep -qwF -- "$ip" && continue
    debugmsg "monitor detected $ip is down, restarting"
    ifUp $ip
  done

  # Now check that all NFS IP addresses for failover nodes are enabled
  nfsIPs=
  for ip in $(getFailedNodes $GPFS_IP); do
    nfsIPs="$nfsIPs $(getNfsIPs $ip)"
  done
  for ip in $nfsIPs; do
    mmgetifconf | grep -qwF -- "$ip" && continue
    [ "$(IPaddr $ip monitor)" = "OK" ] && continue
    debugmsg "monitor detected $ip is down, restarting"
    debuglog IPaddr $ip start
  done

  # TEST3: ping the gateway
  pingDefaultGateway
}
---|
392 | |
---|
# Print, one per line, the base interface of every NFS IP address configured
# for the given GPFS IP address (for an alias such as eth0:1 the base
# interface eth0 is printed).
# Usage: getNfsIFs <gpfs ip>
# Addresses that are not assigned to any interface are reported through
# debugmsg and skipped. When the node has no NFS addresses at all,
# nwFailoverCondition is invoked.
getNfsIFs () {
  thisGpfsIP=$1
  eth=""
  # NFS addresses configured for this GPFS address
  nfsIPList=$(getNfsIPs $thisGpfsIP)
  # No entry for this node in nfs.nodes
  if [[ -z $nfsIPList ]]; then
    debugmsg "ALERT: No entry was found in nfs.nodes for this node (gpfs ip address:$thisGpfsIP)"
    nwFailoverCondition "no configured nfs interfaces"
  fi
  debugmsg2 "The list of ips is $nfsIPList"
  for nfsIP in $nfsIPList; do
    # Keep only the part before the first ':' (alias suffix removed)
    origEth=$(getEthInterface $nfsIP | awk -F: '{print $1}')
    if [[ -n $origEth ]]; then
      echo $origEth
      debugmsg2 "the actual interface for $nfsIP is $origEth"
    else
      debugmsg "ALERT: The nfs ip address $nfsIP is not assigned an interface"
    fi
  done
}
---|
416 | |
---|
# Print the interface that carries a given IP address, according to the
# output of mmgetifconf (one "interface ip mask" line per interface).
# Usage: getEthInterface <ip>
# Prints an empty line when no interface has that address. If several lines
# carry the address, the last one wins.
# NOTE: The original function was copied from Marc and was changed.
# May require future integration.
getEthInterface() {
  eth=""
  # Feed the mmgetifconf output to the loop through a here-document: the
  # loop still runs in the current shell (so $eth survives), and no
  # predictable /tmp file or extra file descriptor is left behind.
  while read iface ip mask; do
    # Quoted right-hand side: compare literally, not as a pattern
    if [[ $ip == "$1" ]]; then
      eth=$iface
    fi
  done <<EOF
$(mmgetifconf)
EOF
  # eth may be empty if there is no interface associated with this ip address
  echo $eth
}
---|
437 | |
---|
# Locate the default gateway (routes flagged UG in "route -n") and ping it.
# Nothing is done when there is no gateway, or when the gateway address is
# one of this machine's own addresses. When the ping fails,
# nwFailoverCondition is invoked (alert or failover as configured).
pingDefaultGateway() {
  gwIP=$(route -n | awk '/UG/ {print $2}')
  [[ -z $gwIP ]] && return

  # Make sure the local machine is not set as the default gateway.
  # The mmgetifconf output is read from a here-document, so returning from
  # inside the loop cannot leave a temporary file or an open descriptor
  # behind.
  while read iface ip mask; do
    [[ $ip == "$gwIP" ]] && return
  done <<EOF
$(mmgetifconf)
EOF

  # try to ping the gateway
  ping -c 1 -w 5 $gwIP > /dev/null
  outPing=$?
  if [ $outPing -ne 0 ]; then
    msg "Failed to ping the gateway at $gwIP (err $outPing)"
    nwFailoverCondition "can't ping the gateway"
  else
    debugmsg2 "Succeeded to ping the gateway at $gwIP (ping returns $outPing)"
  fi
}
---|
463 | |
---|
# React to a network problem according to MONITOR_NETWORK: at LEVEL3 raise a
# failover alert and initiate failover, at any other level only raise an
# alert.
# Usage: nwFailoverCondition <message>
nwFailoverCondition() {
  message=$1
  debugmsg2 nwFailoverCondition $message
  if [[ $MONITOR_NETWORK -ne $LEVEL3 ]]; then
    alertmsg network $message
  else
    failovermsg network $message
    invokeFailover
  fi
}
---|
475 | |
---|
476 | |
---|
477 | ######################## |
---|
478 | # Monitoring portmap |
---|
479 | ######################## |
---|
# Check that /sbin/portmap is running. When it is not: at LEVEL1 only an
# alert is raised; at any other level a failover with reboot is initiated.
# TODO: we could test whether this node is mounting anything, and if not,
# restart portmap and re-register the nfs processes with it.
# Currently, if this machine is mounting anything, lockd does not
# re-register with portmap.
monitorPortmap() {
  getStatus /sbin/portmap
  if [ $? -ne $SERVICE_NOT_RUNNING ]; then
    return 0
  fi
  if [[ $MONITOR_PORTMAP -eq $LEVEL1 ]]; then
    alertmsg portmap
  else
    failovermsg portmap
    invokeFailoverReboot
  fi
}
---|
500 | |
---|
501 | ###################################### |
---|
502 | # Monitoring locking (lockd and statd) |
---|
503 | ###################################### |
---|
504 | |
---|
# Entry point for lock-service monitoring (lockd and statd). Both services
# are governed by MONITOR_STATD.
#  - lockd not running: failover at LEVEL3, otherwise alert only.
#  - statd (checked only when /sbin/rpc.statd exists) not running:
#    restart at LEVEL2 or higher, otherwise alert only.
monitorLocking() {
  ifGPFSDownExit $GPFS_IP

  # lockd
  getStatus lockd
  if [ $? -eq $SERVICE_NOT_RUNNING ]; then
    if [[ $MONITOR_STATD -eq $LEVEL3 ]]; then
      failovermsg lockd
      invokeFailover
    else
      alertmsg lockd
    fi
  fi

  # statd
  if [ ! -f /sbin/rpc.statd ]; then
    return 0
  fi
  getStatus /sbin/rpc.statd
  if [ $? -eq $SERVICE_NOT_RUNNING ]; then
    if [[ $MONITOR_STATD -ge $LEVEL2 ]]; then
      restartStatd
    else
      alertmsg statd
    fi
  fi
}
---|
540 | |
---|
541 | |
---|
# Make a single attempt to restart statd: kill any running rpc.statd, start
# the nfslock init script, wait RESTART_TIMEOUT seconds and check the result.
# If statd is still not running: failover at LEVEL3, otherwise alert only.
# Does nothing when /sbin/rpc.statd does not exist.
restartStatd() {
  if [ ! -f /sbin/rpc.statd ]; then
    return
  fi

  # Kill an existing statd first (important for registering with portmap)
  debuglog kill -9 $(/sbin/pidof -x /sbin/rpc.statd)
  debuglog /etc/init.d/nfslock start
  sleep $RESTART_TIMEOUT
  getStatus /sbin/rpc.statd
  if [ $? -ne $SERVICE_NOT_RUNNING ]; then
    return 0
  fi
  if [[ $MONITOR_STATD -eq $LEVEL3 ]]; then
    failovermsg statd
    invokeFailover
  else
    alertmsg statd
  fi
}
---|
564 | |
---|
565 | ###################################### |
---|
566 | # Monitoring rsh/ssh daemon |
---|
567 | ###################################### |
---|
568 | |
---|
# Try to restart the remote-shell service (name supplied by rshService), up
# to the number of attempts allowed by checkRestart (0 attempts when
# MONITOR_SSHD is below LEVEL2).
# Each attempt runs "/etc/init.d/<service> restart" in the background with
# its output captured in /tmp/<service>_restart.out, waits
# attemptNo*RESTART_TIMEOUT seconds and checks the service status; on
# success the function returns. After a failed attempt the restart process
# is killed before retrying.
# When every attempt has failed: if MONITOR_SSHD is LEVEL3 a failover is
# initiated, otherwise only an alert is raised.
startSshd() {
  checkRestart SSHD
  numberOfRestarts=$?
  service=$(rshService)
  # Braces are required: "$service_restart" would name a different
  # (unset) variable and the log would end up in /tmp/.out
  restartLog=/tmp/${service}_restart.out
  attemptNo=0
  debugmsg2 "About to restart sshd (up to $numberOfRestarts times as configured, note: 0 means that the configuration level is lower than LEVEL2)"
  while [[ $numberOfRestarts -gt 0 ]]; do
    attemptNo=$((attemptNo+1))
    /etc/init.d/$service restart > $restartLog 2>&1 &
    restartPID=$!
    # wait longer on every further attempt
    sleep $((attemptNo*RESTART_TIMEOUT))
    debugmsg2 "Try to restart $service (attempt $attemptNo), checking the status:"
    getStatus $service
    restartStatus=$?
    debugmsg2 "The $service status after the $attemptNo attempt of restart is $restartStatus"
    if [[ $restartStatus == $SERVICE_RUNNING ]]; then
      return
    fi
    debugmsg "Restarting $service failed/hangs, about to kill the restart process. The output of restart attempt is in $restartLog"
    kill -9 $restartPID 2>&1
    # Count this attempt against the loop variable; without this the loop
    # never ends and the failover/alert below is never reached.
    numberOfRestarts=$((numberOfRestarts-1))
  done

  # Failed to restart service, check for failover configuration parameters
  msg "Failed to restart the $service process (tried $attemptNo times as configured)"
  if [[ $MONITOR_SSHD == $LEVEL3 ]]; then
    failovermsg $service
    invokeFailover
  else
    alertmsg $service
  fi
}
---|
602 | |
---|
# Entry point for remote-shell daemon monitoring: check the service named by
# rshService and, when it is not running, hand over to startSshd (which
# applies the configured monitoring level).
monitorSshd() {
  ifGPFSDownExit $GPFS_IP
  service=$(rshService)
  getStatus $service
  if [ $? -eq $SERVICE_NOT_RUNNING ]; then
    startSshd
  fi
}
---|
616 | |
---|
617 | ###################################### |
---|
618 | # Monitoring gpfs daemon |
---|
619 | ###################################### |
---|
# GPFS daemon check: when ifGPFSDownExit reports a non-zero status, raise a
# failover alert for GPFS and terminate the monitor.
monitorGPFS() {
  if ifGPFSDownExit $GPFS_IP; then
    return 0
  fi
  failovermsg GPFS
  exit
}
---|
627 | |
---|
628 | ###################################### |
---|
629 | # Main |
---|
630 | ###################################### |
---|
# Main monitoring loop. Runs forever: sleeps MONITOR_INTERVAL seconds, then
# runs every check whose MONITOR_<x> level enables it. The checks themselves
# decide between alert, restart and failover.
nfsMonitor() {
  GPFS_IP=$(myGPFSIP)

  while :; do
    sleep $MONITOR_INTERVAL

    # GPFS
    # NOTE(review): this gate compares against LEVEL3 while every other
    # gate compares against LEVEL0 - confirm whether that is intended.
    if (( MONITOR_GPFS > LEVEL3 )); then
      debugmsg2 "==========GPFS monitoring==============="
      monitorGPFS
      debugmsg2 "done monitoring GPFS"
    fi

    # Network
    if (( MONITOR_NETWORK > LEVEL0 )); then
      debugmsg2 "==========NW monitoring==============="
      monitorNetwork
      debugmsg2 "done monitoring the network"
    fi

    # portmap
    # All rpc services must be registered with portmap for new clients to
    # reach them. portmap is tested only once per round; if it fails
    # afterwards, the services may be unavailable to new clients even
    # though they are running, until portmap is restarted and the rpc
    # processes re-register with it.
    if (( MONITOR_PORTMAP > LEVEL0 )); then
      debugmsg2 "===========portmap monitoring=============="
      monitorPortmap
      debugmsg2 "done monitoring portmap"
    fi

    # nfsd
    # Tests: (1) the nfsd process is running, (2) rpc-nfs activity is
    # seen, (3) a null rpc to the nfsd service succeeds.
    # Order: run (1); if the process is not running, take action.
    # Otherwise run (2); if there is activity, nfsd is fine. If no activity
    # is detected run (3); on failure take action, on success nfsd is fine.
    # Action: if nfsd is configured to be restarted it is restarted. If all
    # restart attempts fail the node is considered dead for nfs serving;
    # when nfsd is configured for failover the node is failed over to
    # another node and a user level alert is invoked.
    if (( MONITOR_NFSD > LEVEL0 )); then
      debugmsg2 "==========nfsd monitoring==============="
      monitorNfsd
      debugmsg2 "done monitoring nfsd"
    fi

    # mountd
    # Test: the mountd process is running. If it is not and mountd is
    # configured to be restarted, it is restarted. If all restart attempts
    # fail the node is considered dead for nfs serving; when mountd is
    # configured for failover the node is failed over to another node and
    # an alert is sent.
    if (( MONITOR_MOUNTD > LEVEL0 )); then
      debugmsg2 "============mountd monitoring============="
      monitorMountd
      debugmsg2 "done monitoring mountd"
    fi

    # Locking (lockd and statd)
    # Tests: (1) the lockd process is running, (2) the statd process is
    # running. If lockd is not running the node is considered dead for nfs
    # serving; when locking is configured for failover the node is failed
    # over and an alert is sent. If statd is not running it is restarted
    # when configured so; if that restart fails the node is considered dead
    # for nfs serving and, if configured, failed over with an alert.
    if (( MONITOR_STATD > LEVEL0 )); then
      debugmsg2 "==========statd monitoring==============="
      monitorLocking
      debugmsg2 "done monitoring statd"
    fi

    # sshd
    # Test: the sshd process is running. If it is not and sshd is
    # configured to be restarted, it is restarted. If all restart attempts
    # fail the node is considered dead for ssh serving; when sshd is
    # configured for failover the node is failed over to another node and
    # an alert is sent.
    if (( MONITOR_SSHD > LEVEL0 )); then
      debugmsg2 "============sshd monitoring============="
      monitorSshd
      debugmsg2 "done monitoring sshd"
    fi

  done
}
---|
752 | |
---|
# Stop a running monitor: look up processes running ./nfsmonitor (excluding
# this shell and its parent) and kill them. Only a debug warning is emitted
# when no such process is found.
stopNfsMonitor() {
  nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
  if [[ -z $nfsMonToolPid ]]; then
    debugmsg2 "Warning: Couldn't find the monitoring process to stop"
    return
  fi
  kill -9 $nfsMonToolPid
  msg "Monitoring has stopped."
}
---|
762 | |
---|
# Launch the monitoring loop as a background job and log that it started.
startNfsMonitor() {
  nfsMonitor &
  msg "Monitoring has started."
}
---|
767 | |
---|
# Restart monitoring: stop any running monitor, then start a new one.
restartNfsMonitor() {
  stopNfsMonitor
  startNfsMonitor
}
---|
772 | |
---|
# Report through debugmsg whether a process running ./nfsmonitor exists
# (this shell and its parent are excluded from the lookup).
statusNfsMonitor() {
  nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
  if [[ -n $nfsMonToolPid ]]; then
    debugmsg "nfsmonitor is running"
  else
    debugmsg "nfsmonitor is not running"
    return
  fi
}
---|
782 | |
---|
783 | ################################# |
---|
784 | # Main program |
---|
785 | ################################# |
---|
# Command dispatch. "stop" is the documented name of the stop action; "end"
# is accepted as well because earlier usage text advertised it.
case "$1" in
  -s|start)
    startNfsMonitor
    ;;
  -e|stop|end)
    stopNfsMonitor
    ;;
  -r|restart)
    restartNfsMonitor
    ;;
  -q|status)
    statusNfsMonitor
    ;;
  *)
    # Plain double quotes: the $"..." form is not understood by every ksh
    echo "Usage: $0 [-s|start|-e|stop|-r|restart|-q|status]"
    exit 1
    ;;
esac
---|