Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

nfsmonitor @ 148

Last change on this file since 148 was 16, checked in by rock, 17 years ago

Property svn:executable set to ``*
File size: 24.0 KB

Line
1	#!/bin/ksh
2	# @(#)23 1.12.1.10 src/avs/fs/mmfs/samples/nfscluster/nfsmonitor, mmfs, avs_rgpfs24, rgpfs24s011a 3/20/07 16:46:39
3	#
4
5	# HA-NFS monitoring
6	# Usage: nfsmonitor [-s|start|-e|stop|-r|restart|-q|status]
7
# Monitor levels: how far the monitor may escalate for a given service
LEVEL0=0 # NO_MONITORING
LEVEL1=1 # ALERT_ONLY
LEVEL2=2 # RESTART (not applicable for all services)
LEVEL3=3 # FAILOVER

# Tunables. A value that is already set in the environment (even to the
# empty string) is kept; otherwise the default shown here is assigned.
: "${MONITOR_INTERVAL=15}"
: "${MONITOR_NETWORK=3}"
: "${MONITOR_PORTMAP=3}"
: "${MONITOR_NFSD=3}"
: "${MONITOR_MOUNTD=3}"
: "${MONITOR_STATD=3}"
: "${MONITOR_SSHD=3}"

# All times below are in seconds
NFS_RPC_ACT_SAMPLE_INTERVAL=3
RESTART_TIMEOUT=7

# Internal flags
NUMBER_OF_RESTARTS=3

# Return values of the status helpers (used only within this file)
SERVICE_RUNNING=1
SERVICE_NOT_RUNNING=2
SERVICE_UNKNOWN=3

# The helper library is mandatory: without it, report and leave (status 0).
[ -f /var/mmfs/etc/nfsfuncs ] || {
  echo "$0: Can't find NFS functions in /var/mmfs/etc"
  exit 0
}
. /var/mmfs/etc/nfsfuncs
39
# Report that a monitored service is inactive: the message is logged through
# msg and, when the optional hook /var/mmfs/etc/alert exists, the hook is
# invoked with the same message and $GPFS_IP.
# Usage: alert <service> <action message> [comment words...]
#   service         - name of the service found inactive
#   action message  - what the monitor does about it
#   comment words   - optional; joined and appended in parentheses
# Note: the assignments below are plain (global) shell variables.
alert() {
  service=$1
  actionmsg=$2
  shift 2
  # Start from an empty comment so that text from an earlier alert cannot
  # leak into this message.
  comment=""
  [ -n "$*" ] && comment="($*)"
  message="Monitoring detected $service is inactive, $actionmsg $comment"
  msg "$message"
  # The customer-provided alert script is optional.
  [ ! -e /var/mmfs/etc/alert ] && return
  /var/mmfs/etc/alert "$message" "$GPFS_IP"
}
52
# Raise an alert for a service whose configured level is "alert only".
# Usage: alertmsg <service> [comment words...]
# The arguments are forwarded quoted so that they are neither re-split nor
# glob-expanded on their way to alert.
alertmsg() {
  service=$1
  shift
  alert "$service" "no action taken as configured" "$@"
}
58
# Raise an alert announcing that a node failover is being initiated.
# Usage: failovermsg <service> [comment words...]
# The arguments are forwarded quoted so that they are neither re-split nor
# glob-expanded on their way to alert.
failovermsg() {
  service=$1
  shift
  alert "$service" "node failure initiated as configured" "$@"
}
64
# Kill (SIGKILL) every process that is running the NFS init script.
# Used when a start/restart attempt appears to hang.
# The init script is /etc/init.d/nfsserver when that file exists, otherwise
# /etc/init.d/nfs. The lookup is done by full path first and by base name
# second; this shell, its parent and (pidof's %PPID) the caller are omitted.
nfsToolKill() {
  NFSTOOL=/etc/init.d/nfs
  [ -f /etc/init.d/nfsserver ] && NFSTOOL=/etc/init.d/nfsserver
  nfsToolBase=${NFSTOOL##*/}
  nfsToolPid=$(pidof -o $$ -o $PPID -o %PPID -x $NFSTOOL ||
    pidof -o $$ -o $PPID -o %PPID -x $nfsToolBase)
  debugmsg2 "The pids of $NFSTOOL are $nfsToolPid"
  # Nothing to do when no such process was found.
  if [[ -n $nfsToolPid ]]; then
    kill -9 $nfsToolPid
  fi
}
82
# Tell the caller how many restart attempts a service is entitled to.
# Usage: checkRestart <SERVICE>   (suffix of a MONITOR_<SERVICE> variable)
# Returns: NUMBER_OF_RESTARTS when the service level is LEVEL2 or higher,
#          0 otherwise (the service is not configured for restart).
checkRestart() {
  service=$1
  # Indirect lookup of MONITOR_<SERVICE>.
  eval "service_level=\$MONITOR_$service"
  debugmsg2 "$service level is $service_level"
  if [ $service_level -ge $LEVEL2 ]; then
    return $NUMBER_OF_RESTARTS
  fi
  return 0
}
92
# Translate the exit code of checkStatus into this file's status constants.
# Usage: getStatus <service>
# Returns: SERVICE_RUNNING when checkStatus exits 0,
#          SERVICE_NOT_RUNNING for any other exit code.
getStatus() {
  service=$1
  checkStatus $service
  status=$?
  if [ $status -ne 0 ]; then
    debugmsg "$service is not running (status $status)"
    return $SERVICE_NOT_RUNNING
  fi
  debugmsg2 "$service is running"
  return $SERVICE_RUNNING
}
107
# Fail this node over: stop NFS, stop GPFS, then leave the script.
# Does not return to the caller.
invokeFailover() {
  debugmsg "Invoking failover..."

  # NFS is stopped gracefully first so that clients do not get ESTALE.
  debuglog2 touch /tmp/ha-nfs-reboot 2> /dev/null
  stop.nfs

  # Stopping the GPFS daemon on this node is what triggers the failover.
  debugmsg "Stopping GPFS..."
  /etc/init.d/gpfs stop

  exit
}
118
# Fail this node over and reboot it: stop NFS, stop GPFS, reboot, then leave
# the script. Does not return to the caller.
invokeFailoverReboot() {
  debugmsg "Invoking failover with reboot..."

  # NFS is stopped gracefully first so that clients do not get ESTALE.
  stop.nfs

  # Stopping the GPFS daemon on this node is what triggers the failover.
  debugmsg "Stopping GPFS before reboot..."
  /etc/init.d/gpfs stop

  reboot
  exit
}
129
130	#################
131	# Monitoring nfsd
132	#################
133
# Sample /proc/net/rpc/nfsd twice, NFS_RPC_ACT_SAMPLE_INTERVAL seconds apart,
# and compare the per-operation counters of the proc2 and proc3 lines to
# detect nfsd rpc activity.
# Returns: SERVICE_RUNNING  if any counter increased between the samples,
#          SERVICE_UNKNOWN  if the proc file is missing or no counter moved
#                           (no conclusion can be drawn).
# A counter that is absent from a sample (for example because the file has
# no proc2 line) is treated as 0 instead of breaking the arithmetic.
detectNfsdActivity() {
  # Field names, in the order the fields are read from the proc2/proc3 lines.
  set -A v2procs NAME COUNT NULL GETATTR SETATTR ROOT LOOKUP READLINK \
    READ WRCACHE WRITE CREATE REMOVE RENAME \
    LINK SYMLINK MKDIR RMDIR READDIR FSSTAT
  set -A v3procs NAME COUNT NULL GETATTR SETATTR LOOKUP ACCESS READLINK \
    READ WRITE CREATE MKDIR SYMLINK MKNOD REMOVE RMDIR RENAME \
    LINK READDIR READDIRPLUS FSSTAT FSINFO PATHCONF COMMIT

  procfile=/proc/net/rpc/nfsd
  if [ ! -f $procfile ]; then
    msg "Monitoring could not find /proc/net/rpc/nfsd"
    return $SERVICE_UNKNOWN
  fi

  # Sample v2/v3 activity (using proc) in the next NFS_RPC_ACT_SAMPLE_INTERVAL sec
  set -A v2procs1 $(grep -w proc2 $procfile)
  set -A v3procs1 $(grep -w proc3 $procfile)
  debugmsg2 "Sleeping for $NFS_RPC_ACT_SAMPLE_INTERVAL"
  sleep $NFS_RPC_ACT_SAMPLE_INTERVAL
  set -A v2procs2 $(grep -w proc2 $procfile)
  set -A v3procs2 $(grep -w proc3 $procfile)

  # Compare the v2 samples field by field.
  p=2 # skipping name and count fields
  n=${#v2procs[@]}
  while [ $p -lt $n ]; do
    activity=$(( ${v2procs2[$p]:-0} - ${v2procs1[$p]:-0} ))
    if [ $activity -gt 0 ]; then
      debugmsg2 "nfsd activity detected: $activity ${v2procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
      return $SERVICE_RUNNING
    fi
    p=$((p+1))
  done

  # Compare the v3 samples field by field.
  p=2 # skipping name and count fields
  n=${#v3procs[@]}
  while [ $p -lt $n ]; do
    activity=$(( ${v3procs2[$p]:-0} - ${v3procs1[$p]:-0} ))
    if [ $activity -gt 0 ]; then
      debugmsg2 "nfsd activity detected: $activity ${v3procs[$p]} operations in $NFS_RPC_ACT_SAMPLE_INTERVAL sec"
      return $SERVICE_RUNNING
    fi
    p=$((p+1))
  done

  # no activity detected
  debugmsg2 "Could not detect nfsd activity using /proc"
  return $SERVICE_UNKNOWN
}
185
# Start nfsd, retrying up to the number of times checkRestart grants
# (0 when MONITOR_NFSD is below LEVEL2, in which case no start is attempted).
# After each attempt the function waits RESTART_TIMEOUT seconds and asks
# getStatus whether nfsd is running; on success it returns immediately.
# A failed or hanging attempt is cleaned up with nfsToolKill before the next
# try. When no attempt succeeded, failover is invoked if MONITOR_NFSD equals
# LEVEL3, otherwise only an alert is raised.
# Note (from the original author): portmap has to be running for nfsd to
# start, and the init script is used because it stops the service before
# starting it, so nfsd re-registers with portmap and /etc/exports is reloaded.
startNfsd() {
  checkRestart NFSD # This call takes care of MONITOR_NFSD < LEVEL2 (numberOfRestarts=0)
  numberOfRestarts=$?
  attemptNo=0
  debugmsg2 "About to start nfsd (up to $numberOfRestarts times)"
  while [ $numberOfRestarts -gt 0 ]; do
    attemptNo=$((attemptNo+1))
    debugmsg2 "In startnfsd, attempt number = $attemptNo"
    nfsService start
    # give the restart/start chance to complete
    sleep $RESTART_TIMEOUT
    getStatus nfsd
    status=$?
    debugmsg2 "The nfsd status after $attemptNo attempts to start is $status"
    [[ $status == $SERVICE_RUNNING ]] && return
    # start attempt has failed/hangs -> kill the process and retry
    debugmsg "start nfsd failed/hangs, about to kill the start process."
    nfsToolKill
    # Consume one attempt; the loop counter itself must shrink, otherwise the
    # loop never terminates while nfsd stays down.
    numberOfRestarts=$((numberOfRestarts-1))
  done

  # If we get here, it means that all start attempts have failed
  # (or none was allowed): check the failover configuration.
  if [[ $MONITOR_NFSD -eq $LEVEL3 ]]; then
    failovermsg nfsd
    invokeFailover
  else
    alertmsg nfsd
  fi
}
226
# Probe nfsd with a NULL RPC: rpcinfo is asked (over UDP) for program "nfs"
# version 3 on this host.
# Returns: SERVICE_RUNNING when the rpcinfo call succeeds,
#          SERVICE_NOT_RUNNING otherwise.
nfsdNullRpcTest() {
  hostname=$(hostname)
  if debuglog2 rpcinfo -u $hostname nfs 3; then
    debugmsg2 "Sent nullrpc to nfsd v3 using rpcinfo, the service is up and running"
    return $SERVICE_RUNNING
  fi
  msg "Sent nullrpc to nfsd v3 using rpcinfo, the service is not available (it maybe because portmap was started after nfsd, or that rpcinfo is not installed)"
  return $SERVICE_NOT_RUNNING
}
241
# Run the nfsd checks in order and react according to the monitoring level:
#   1. process check (getStatus nfsd)  - not running -> startNfsd
#   2. rpc activity in /proc           - activity seen -> done
#   3. NULL RPC probe                  - fails -> startNfsd, passes -> done
monitorNfsd() {
  ifGPFSDownExit $GPFS_IP

  # Check that the nfsd process is running
  getStatus nfsd
  runStatus=$?

  case $runStatus in
    $SERVICE_NOT_RUNNING)
      # When nfsd was down and the start succeeds, the process is assumed to
      # be healthy; the deeper tests (rpc activity, null rpc) only run on the
      # next monitoring cycle.
      startNfsd
      return ;;
    $SERVICE_RUNNING)
      debugmsg2 "Perform more tests, to make sure that nfsd is functioning"
      detectNfsdActivity
      case $? in
        $SERVICE_RUNNING) return ;;
        $SERVICE_UNKNOWN)
          nfsdNullRpcTest
          case $? in
            $SERVICE_RUNNING) return ;;
            $SERVICE_NOT_RUNNING) startNfsd ;;
          esac
          ;;
      esac
      ;;
  esac
}
277
278	####################
279	# Monitoring mountd
280	####################
# Restart mountd, retrying up to the number of times checkRestart grants
# (0 when MONITOR_MOUNTD is below LEVEL2). After each attempt the function
# waits RESTART_TIMEOUT seconds and checks /usr/sbin/rpc.mountd; it returns
# as soon as mountd is seen running. When every attempt failed, failover is
# invoked if MONITOR_MOUNTD matches LEVEL3, otherwise only an alert is raised.
restartMountd() {
  checkRestart MOUNTD
  numberOfRestarts=$?
  attemptNo=0
  debugmsg2 "About to restart mountd (up to $numberOfRestarts times as configured, note: 0 means that the configuration level is lower than LEVEL2)"
  until [ $numberOfRestarts -le 0 ]; do
    attemptNo=$((attemptNo+1))
    startMountd
    # NOTE(review): $! is the most recent background job of this shell;
    # presumably startMountd launches one - confirm against nfsfuncs.
    restartPID=$!
    debugmsg2 "restartPID=$restartPID"

    # Let the restart settle, then look at mountd again.
    sleep $RESTART_TIMEOUT
    debugmsg2 "Try to restart mountd (attempt $attemptNo), checking the status:"
    getStatus /usr/sbin/rpc.mountd
    if [ $? -eq $SERVICE_RUNNING ]; then
      debugmsg2 "Succeeded to restart the mountd service at $attemptNo attempt"
      return
    fi

    # The attempt failed or hangs: get rid of it before trying again.
    debugmsg "Restarting mountd failed/hangs, about to kill the restart process."
    if [ ! -e /sbin/startproc ]; then
      nfsToolKill # FIX
    else
      debuglog kill -9 $restartPID
    fi
    numberOfRestarts=$((numberOfRestarts-1))
  done

  # Reaching this point means no restart attempt succeeded.
  msg "Failed to restart the mountd (tried $attemptNo times as configured)"
  case $MONITOR_MOUNTD in
    $LEVEL3)
      failovermsg mountd
      invokeFailover
      ;;
    *)
      alertmsg mountd
      ;;
  esac
}
320
321
# Entry point for mountd monitoring: when /usr/sbin/rpc.mountd is reported
# as not running, hand over to restartMountd; otherwise do nothing.
monitorMountd() {
  ifGPFSDownExit $GPFS_IP
  getStatus /usr/sbin/rpc.mountd
  case $? in
    $SERVICE_NOT_RUNNING) restartMountd ;;
    $SERVICE_RUNNING) return ;;
  esac
}
336
337	########################
338	# Monitoring the network
339	########################
340
# Monitor the network. Tests performed:
#   TEST1 - every interface used for NFS serving is listed by mmgetifconf
#           and its link is connected (checkLinkStatus);
#   TEST2 - every NFS IP address of this node, and of the nodes reported by
#           getFailedNodes, is enabled; a missing one is brought up again;
#   TEST3 - the default gateway answers a ping.
# Failures of TEST1 are routed through nwFailoverCondition.
# Interface names and IP addresses are matched as fixed whole words, so that
# e.g. 10.0.0.1 is not considered present just because 10.0.0.10 is.
monitorNetwork() {
  ifGPFSDownExit $GPFS_IP

  # TEST1: make sure that all interfaces that are used for nfs serving are connected
  nfsIfs=$(getNfsIFs $GPFS_IP)
  if [[ -z $nfsIfs ]]; then
    msg "No configured NFS IP addresses detected on any of the node's interfaces"
    nwFailoverCondition "no configured nfs interfaces"
  else
    for eth in $nfsIfs; do
      tmp=$(mmgetifconf | grep -wF -- "$eth" | awk '{print $1}')
      if [[ -z $tmp ]]; then
        nwFailoverCondition "interface is down"
      fi

      checkLinkStatus $eth
      if [ $? -ne 0 ]; then
        nwFailoverCondition "link is not connected"
      fi
    done
  fi

  # TEST2: check that all NFS IP addresses are enabled
  nfsIPs=$(getNfsIPs $GPFS_IP)
  for ip in $nfsIPs; do
    mmgetifconf | grep -qwF -- "$ip"
    [ $? -eq 0 ] && continue
    debugmsg "monitor detected $ip is down, restarting"
    ifUp $ip
  done

  # Now check that all NFS IP addresses for failover nodes are enabled
  nfsIPs=
  for ip in $(getFailedNodes $GPFS_IP); do
    nfsIPs="$nfsIPs $(getNfsIPs $ip)"
  done
  for ip in $nfsIPs; do
    mmgetifconf | grep -qwF -- "$ip"
    [ $? -eq 0 ] && continue
    [ "$(IPaddr $ip monitor)" = "OK" ] && continue
    debugmsg "monitor detected $ip is down, restarting"
    debuglog IPaddr $ip start
  done

  # TEST3: ping the gateway
  pingDefaultGateway
}
392
# Print, one per line, the base interface of every NFS IP address that is
# configured for the given GPFS IP address.
# Usage: getNfsIFs <gpfs ip>
# An address that has no interface is reported through debugmsg and skipped.
# NOTE(review): the caller in this file runs this function inside $(...), so
# an exit triggered from nwFailoverCondition would presumably end only that
# subshell - confirm.
getNfsIFs () {
  thisGpfsIP=$1
  eth=""

  # NFS addresses that belong to this GPFS address.
  nfsIPList=$(getNfsIPs $thisGpfsIP)
  if [[ -z $nfsIPList ]]; then
    # No entry for this node in nfs.nodes.
    debugmsg "ALERT: No entry was found in nfs.nodes for this node (gpfs ip address:$thisGpfsIP)"
    nwFailoverCondition "no configured nfs interfaces"
  fi
  debugmsg2 "The list of ips is $nfsIPList"

  for nfsIP in $nfsIPList; do
    # Reduce an alias to its base interface (eth0:1 -> eth0).
    origEth=$(getEthInterface $nfsIP | awk -F: '{print $1}')
    if [[ -n $origEth ]]; then
      echo $origEth
      debugmsg2 "the actual interface for $nfsIP is $origEth"
    else
      debugmsg "ALERT: The nfs ip address $nfsIP is not assigned an interface"
    fi
  done
}
416
# Print the interface that carries a given IP address.
# Usage: getEthInterface <ip address>
# mmgetifconf is read as lines of "<interface> <ip> <mask>"; the interface of
# the last line whose second field equals the address is printed. An empty
# line is printed when no interface carries the address.
# The listing is filtered through a pipe, so no temporary file under /tmp and
# no extra file descriptor is left behind.
getEthInterface() {
  eth=$(mmgetifconf | awk -v ip="$1" '$2 == ip { iface = $1 } END { print iface }')
  # eth may be empty if there is no interface associated with this ip address
  printf '%s\n' "$eth"
}
437
# Locate and ping the default gateway.
# The gateway is the second field of the first "route -n" line that contains
# "UG". Nothing is done when no such line exists, or when the gateway
# address is one of this machine's own addresses (second field of an
# mmgetifconf line). A failed ping is reported through msg and passed to
# nwFailoverCondition, which alerts or fails over as configured.
# The interface listing is filtered through a pipe, so every return path
# leaves no temporary file and no open descriptor behind.
pingDefaultGateway() {
  # Only the first match is used: ping is given exactly one target.
  gwIP=$(route -n | awk '/UG/ {print $2; exit}')
  [[ -z $gwIP ]] && return

  # Make sure the local machine is not set as the default gateway
  if mmgetifconf | awk -v gw="$gwIP" '$2 == gw { found = 1 } END { exit !found }'; then
    return
  fi

  # try to ping the gateway
  ping -c 1 -w 5 $gwIP > /dev/null
  outPing=$?
  if [ $outPing -ne 0 ]; then
    msg "Failed to ping the gateway at $gwIP (err $outPing)"
    nwFailoverCondition "can't ping the gateway"
  else
    debugmsg2 "Succeeded to ping the gateway at $gwIP (ping returns $outPing)"
  fi
}
463
# React to a network problem: fail over when MONITOR_NETWORK equals LEVEL3,
# otherwise only raise an alert.
# Usage: nwFailoverCondition <description of the problem>
nwFailoverCondition() {
  message=$1
  debugmsg2 nwFailoverCondition $message
  if (( MONITOR_NETWORK == LEVEL3 )); then
    failovermsg network $message
    invokeFailover
    return
  fi
  alertmsg network $message
}
475
476
477	########################
478	# Monitoring portmap
479	########################
# Check that /sbin/portmap is running. When it is not: raise an alert only
# if MONITOR_PORTMAP equals LEVEL1, otherwise announce a failover and invoke
# the failover-with-reboot path.
# TODO (kept from the original): when this node mounts nothing, portmap
# could be restarted and the nfs processes re-registered with it; currently
# lockd does not re-register with portmap if this machine mounts anything.
monitorPortmap() {
  getStatus /sbin/portmap
  case $? in
    $SERVICE_NOT_RUNNING)
      if (( MONITOR_PORTMAP == LEVEL1 )); then
        alertmsg portmap
      else
        failovermsg portmap
        invokeFailoverReboot
      fi
      ;;
    $SERVICE_RUNNING)
      return
      ;;
  esac
}
500
501	######################################
502	# Monitoring locking (lockd and statd)
503	######################################
504
# Entry point for lock-service monitoring (lockd and statd). Both services
# are governed by MONITOR_STATD.
#   lockd down -> failover when MONITOR_STATD equals LEVEL3, else alert.
#   statd down -> restartStatd when MONITOR_STATD is LEVEL2 or higher, else
#                 alert. statd is only looked at when /sbin/rpc.statd exists.
monitorLocking() {
  ifGPFSDownExit $GPFS_IP

  # Monitor lockd.
  getStatus lockd
  case $? in
    $SERVICE_NOT_RUNNING)
      if (( MONITOR_STATD == LEVEL3 )); then
        failovermsg lockd
        invokeFailover
      else
        alertmsg lockd
      fi
      ;;
    $SERVICE_RUNNING)
      ;;
  esac

  # Monitor statd
  if [ -f /sbin/rpc.statd ]; then
    getStatus /sbin/rpc.statd
    case $? in
      $SERVICE_NOT_RUNNING)
        if (( MONITOR_STATD >= LEVEL2 )); then
          restartStatd
        else
          alertmsg statd
        fi
        ;;
      $SERVICE_RUNNING)
        ;;
    esac
  fi
}
540
541
# Attempt a single restart of statd through /etc/init.d/nfslock.
# Does nothing when /sbin/rpc.statd is not installed. Any statd process that
# still exists is killed first (important for registering with portmap);
# the kill is skipped when pidof finds no such process, so kill is never
# invoked without a PID. After RESTART_TIMEOUT seconds the status is checked:
# if statd is still down, failover is invoked when MONITOR_STATD equals
# LEVEL3, otherwise an alert is raised.
restartStatd() {
  [ ! -f /sbin/rpc.statd ] && return

  # Kill the statd process if exists (important for registering with portmap)
  statdPids=$(/sbin/pidof -x /sbin/rpc.statd)
  if [ -n "$statdPids" ]; then
    debuglog kill -9 $statdPids
  fi
  debuglog /etc/init.d/nfslock start
  sleep $RESTART_TIMEOUT
  getStatus /sbin/rpc.statd
  case $? in
    $SERVICE_RUNNING)
      ;;
    $SERVICE_NOT_RUNNING)
      if [[ $MONITOR_STATD -eq $LEVEL3 ]]; then
        failovermsg statd
        invokeFailover
      else
        alertmsg statd
      fi
      ;;
  esac
}
564
565	######################################
566	# Monitoring rsh/ssh daemon
567	######################################
568
# Restart the remote-shell daemon named by rshService, retrying up to the
# number of times checkRestart grants (0 when MONITOR_SSHD is below LEVEL2).
# Each attempt runs the init script in the background with its output saved
# in /tmp/<service>_restart.out, waits attemptNo*RESTART_TIMEOUT seconds and
# then checks the service status; the function returns as soon as the
# service is running. A failed or hanging attempt is killed before the next
# try. When no attempt succeeded, failover is invoked if MONITOR_SSHD matches
# LEVEL3, otherwise only an alert is raised.
startSshd() {
  checkRestart SSHD # This call takes care of MONITOR_SSHD < LEVEL2 (numberOfRestarts=0)
  numberOfRestarts=$?
  service=$(rshService)
  # Braces are required: "$service_restart" would name a different variable.
  restartLog=/tmp/${service}_restart.out
  attemptNo=0
  debugmsg2 "About to restart sshd (up to $numberOfRestarts times as configured, note: 0 means that the configuration level is lower than LEVEL2)"
  while [[ $numberOfRestarts -gt 0 ]]; do
    attemptNo=$((attemptNo+1))
    /etc/init.d/$service restart > $restartLog 2>&1 &
    restartPID=$!
    # Later attempts get proportionally more time to complete.
    sleep $((attemptNo*RESTART_TIMEOUT))
    debugmsg2 "Try to restart $service (attempt $attemptNo), checking the status:"
    getStatus $service
    restartStatus=$?
    debugmsg2 "The $service status after the $attemptNo attempt of restart is $restartStatus"
    if [[ $restartStatus == $SERVICE_RUNNING ]]; then
      return
    fi
    debugmsg "Restarting $service failed/hangs, about to kill the restart process. The output of restart attempt is in $restartLog"
    kill -9 $restartPID 2>&1
    # Consume one attempt; the loop counter itself must shrink, otherwise the
    # loop never terminates while the service stays down.
    numberOfRestarts=$((numberOfRestarts-1))
  done
  # Failed to restart service, check for failover configuration parameters
  msg "Failed to restart the $service process (tried $attemptNo times as configured)"
  if [[ $MONITOR_SSHD == $LEVEL3 ]]; then
    failovermsg $service
    invokeFailover
  else
    alertmsg $service
  fi
}
602
# Entry point for remote-shell daemon monitoring: when the service named by
# rshService is reported as not running, hand over to startSshd.
monitorSshd() {
  ifGPFSDownExit $GPFS_IP
  service=$(rshService)
  getStatus $service
  case $? in
    $SERVICE_NOT_RUNNING) startSshd ;;
    $SERVICE_RUNNING) return ;;
  esac
}
616
617	######################################
618	# Monitoring gpfs daemon
619	######################################
# GPFS check: when ifGPFSDownExit reports a non-zero status, announce the
# failover and leave the script.
monitorGPFS() {
  ifGPFSDownExit $GPFS_IP || {
    failovermsg GPFS
    exit
  }
}
627
628	######################################
629	# Main
630	######################################
# Main monitoring loop. Runs forever: every MONITOR_INTERVAL seconds each
# service whose MONITOR_<service> level is above LEVEL0 gets its monitor
# function called, in the fixed order GPFS, network, portmap, nfsd, mountd,
# locking, sshd.
nfsMonitor() {
  GPFS_IP=$(myGPFSIP)

  while :; do
    sleep $MONITOR_INTERVAL

    # GPFS monitoring
    # NOTE(review): the threshold here is LEVEL3 (not LEVEL0 as for the other
    # services) - confirm this is intended.
    if (( MONITOR_GPFS > LEVEL3 )); then
      debugmsg2 "==========GPFS monitoring==============="
      monitorGPFS
      debugmsg2 "done monitoring GPFS"
    fi

    # Network monitoring
    if (( MONITOR_NETWORK > LEVEL0 )); then
      debugmsg2 "==========NW monitoring==============="
      monitorNetwork
      debugmsg2 "done monitoring the network"
    fi

    # Portmap monitoring
    # Every rpc service has to be registered with portmap before new clients
    # can reach it. portmap is tested only once per cycle; if it fails later,
    # the services may be unavailable to new clients even though they are
    # running, until portmap is restarted and the rpc processes re-register
    # with it.
    if (( MONITOR_PORTMAP > LEVEL0 )); then
      debugmsg2 "===========portmap monitoring=============="
      monitorPortmap
      debugmsg2 "done monitoring portmap"
    fi

    # nfsd monitoring (only when configured)
    # Tests:
    #   (test 1) the nfsd process is running
    #   (test 2) there is rpc-nfs activity
    #   (test 3) nfsd answers a null rpc
    # Order: perform (test 1); if the process is not running, go to Action.
    # Otherwise perform (test 2); if there is nfs activity, go to Done.
    # Otherwise perform (test 3); on failure go to Action, on success Done.
    # Action: if nfsd is configured to be restarted, restart it. If every
    #   restart attempt fails, the node is declared "dead" for nfs serving;
    #   if nfsd is configured as "failover" the node is failed over to
    #   another node and a user level alert is invoked.
    # Done: nfsd is up and running, continue.
    if (( MONITOR_NFSD > LEVEL0 )); then
      debugmsg2 "==========nfsd monitoring==============="
      monitorNfsd
      debugmsg2 "done monitoring nfsd"
    fi

    # mountd monitoring (only when configured)
    # (test 1) the mountd process is running; if so, done.
    # Action: if mountd is not running and is configured to be restarted,
    #   restart it. If every restart attempt fails, the node is declared
    #   "dead" for nfs serving; if mountd is configured as "failover" the
    #   node is failed over to another node and an alert is sent.
    if (( MONITOR_MOUNTD > LEVEL0 )); then
      debugmsg2 "============mountd monitoring============="
      monitorMountd
      debugmsg2 "done monitoring mountd"
    fi

    # Locking monitoring: lockd and statd (only when configured)
    #   (test 1) the lockd process is running
    #   (test 2) the statd process is running
    # lockd down: the node is declared "dead" for nfs serving; if locking is
    #   configured as "failover" the node is failed over to another node and
    #   an alert is sent.
    # statd down: a restart takes place if statd is configured to be
    #   restarted. If the restart fails, the node is declared "dead" for nfs
    #   serving and, if configured, failed over to another node with an
    #   alert.
    if (( MONITOR_STATD > LEVEL0 )); then
      debugmsg2 "==========statd monitoring==============="
      monitorLocking
      debugmsg2 "done monitoring statd"
    fi

    # sshd monitoring (only when configured)
    # (test 1) the sshd process is running; if so, done.
    # Action: if sshd is not running and is configured to be restarted,
    #   restart it. If every restart attempt fails, the node is declared
    #   "dead" for ssh serving; if sshd is configured as "failover" the node
    #   is failed over to another node and an alert is sent.
    if (( MONITOR_SSHD > LEVEL0 )); then
      debugmsg2 "============sshd monitoring============="
      monitorSshd
      debugmsg2 "done monitoring sshd"
    fi

  done
}
752
# Stop monitoring: kill (SIGKILL) every process that pidof reports for
# ./nfsmonitor, leaving out this shell, its parent and (pidof's %PPID) the
# caller. A warning is logged when no such process is found.
stopNfsMonitor() {
  nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
  if [[ -n $nfsMonToolPid ]]; then
    kill -9 $nfsMonToolPid
    msg "Monitoring has stopped."
  else
    debugmsg2 "Warning: Couldn't find the monitoring process to stop"
  fi
}
762
# Start monitoring: launch the monitoring loop as a background job and
# report that it has been started.
startNfsMonitor() {
  nfsMonitor &
  msg "Monitoring has started."
}
767
# Restart monitoring: stop any running monitor, then start a new one.
restartNfsMonitor() {
  stopNfsMonitor
  startNfsMonitor
}
772
# Report, through debugmsg, whether pidof finds a running ./nfsmonitor
# process (this shell, its parent and pidof's %PPID are left out).
statusNfsMonitor() {
  nfsMonToolPid=$(pidof -o $$ -o $PPID -o %PPID -x ./nfsmonitor)
  if [[ -n $nfsMonToolPid ]]; then
    debugmsg "nfsmonitor is running"
  else
    debugmsg "nfsmonitor is not running"
  fi
}
782
783	#################################
784	# Main program
785	#################################
# Dispatch on the first command-line argument. Each action has a short and a
# long spelling; "end" is accepted as a synonym of "stop". Any other
# argument prints the usage line and exits with status 1.
case "$1" in
  -s|start)
    startNfsMonitor
    ;;
  -e|stop|end)
    stopNfsMonitor
    ;;
  -r|restart)
    restartNfsMonitor
    ;;
  -q|status)
    statusNfsMonitor
    ;;
  *)
    echo "Usage: $0 [-s|start|-e|stop|-r|restart|-q|status]"
    exit 1
    ;;
esac

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gpfs_3.1_ker2.6.20/lpp/mmfs/samples/nfscluster/nfsmonitor @ 148

Download in other formats: