#!/bin/ksh
# @(#)64 1.17.1.18 src/avs/fs/mmfs/samples/nfscluster/nfsfuncs, mmfs, avs_rgpfs24, rgpfs24s011a 3/7/07 20:57:04
#
# Common settings for the HA-NFS scripts: site definitions, search path,
# shared-storage layout and default option values.

# The site definitions are required; without them there is nothing to do.
[ -f /var/mmfs/etc/nfsdefs ] || {
    echo "$0: Can't find NFS defines(nfsdefs) in /var/mmfs/etc"
    exit 0
}
. /var/mmfs/etc/nfsdefs

# Search path for the programs used below, appended in this order:
#   standard programs, mmfs programs, HA (heartbeat) programs
PATH=$PATH:/sbin:/usr/sbin:/usr/bin:/bin:/usr/lpp/mmfs/bin:/var/mmfs/etc:/etc/ha.d/resource.d:/usr/lib/heartbeat
export PATH

# Shared storage for HA-NFS
# The directory structure is as follows:
#  shared/.ha
#      nfs                (mirrors /var/lib/nfs)
#          rmtab
#          node1
#              statd
#                  sm
#                  sm.bak
#          ...
#      recovery
#          node1
#              nodeX
#          ...
#
# Local storage for node1
#  /var/lib/nfs
#      rmtab   -> shared/.ha/nfs/rmtab
#    (RHEL):
#      statd   -> shared/.ha/nfs/node1/statd
#    (SLES):
#      sm      -> shared/.ha/nfs/node1/statd/sm
#      sm.bak  -> shared/.ha/nfs/node1/statd/sm.bak
#
SHARED_HA=${SHARED_ROOT}/.ha
SHARED_NFS=${SHARED_HA}/nfs
SHARED_RECOVERY=${SHARED_HA}/recovery

# For failover of locks to happen correctly, the lockmgrs on all GPFS nodes
# need to listen on different port numbers so clients are forced to
# re-establish socket connection with the takeover node for reclaims.
# Base port number for NLM (NFS lock manager); node i uses NLM_PORT + i.
: ${NLM_PORT:=10000}

# Distribution dependency (RHEL vs. SLES): where the ifcfg-* files live
IFPATH="/etc/sysconfig/network-scripts"
[ -d /etc/sysconfig/network ] && IFPATH="/etc/sysconfig/network"

typeset -i iptakeover=0         # does GPFS perform IP failover?
debug=0                         # debug level for messages to be logged?
notifyfix=0                     # SM_NOTIFY fix required for SLES?
monitor=1                       # monitoring of daemons required?
customLog=1                     # Log file specified
hardMount=1                     # Clients use "hard" NFS mounts

# NODELIST is quoted on purpose: when it is unset or empty the unquoted form
# collapses to "[ -f ]", which is true, and would enable IP takeover by mistake.
[ -f "$NODELIST" ] && iptakeover=1
[ -n "$DEBUG" ] && debug=$DEBUG
[ -n "$NOTIFYFIX" ] && notifyfix=1
[ -n "$MONITOR" ] && monitor=$MONITOR
[ -z "$LOGFILE" ] && LOGFILE=/var/mmfs/gen/mmfslog && customLog=0
[ -z "$NFSD_PROCS" ] && NFSD_PROCS=32

# Full path of the remote shell used between GPFS nodes (rsh unless GPFS_RSH)
if [ -n "$GPFS_RSH" ]; then
    GPFS_rshPath=$(which $GPFS_RSH)
else
    GPFS_rshPath=$(which rsh)
fi
export GPFS_rshPath

################################################################################
# Utility functions                                                            #
################################################################################

# Print a message on stdout and terminate the calling script with status 1.
die() {
    echo "$*"
    exit 1
}

# Usage: _log level command [args...]
# Run the command.  When $debug >= level, the command line (with a timestamp)
# and its output are appended to $LOGFILE; otherwise the output is discarded.
# The exit status is that of the command.  The arguments are expanded with
# $* and therefore word-split again before execution.
_log() {
    let level=$1; shift
    if [ $debug -ge $level ]; then
        echo "$(date): $*" >> $LOGFILE 2>&1
        $* >> $LOGFILE 2>&1
    else
        $* > /dev/null 2>&1
    fi
}

# Run a command, logging at level 0, 1 and 2 respectively.
log() {
    _log 0 $*
}

debuglog() {
    _log 1 $*
}

debuglog2() {
    _log 2 $*
}

# Usage: _msg level text...
# Append a timestamped message to $LOGFILE when $debug >= level.
# Returns non-zero when the message is suppressed; notify() uses that
# status to decide whether to run a follow-up command.
_msg() {
    level=$1; shift
    [ $debug -ge $level ] && echo "$(date): $*" >> $LOGFILE 2>&1
}

# Level-0 message: goes to the log file and to syslog (tag HA-NFS).
msg() {
    _msg 0 $*
    logger -t HA-NFS "$*"
}

debugmsg() {
    _msg 1 $*
}

debugmsg2() {
    _msg 2 $*
}

err() {
    msg "Error: $*"
}

warn() {
    msg "Warning: $*"
}

# File-system helpers; each runs its command through debuglog2.
_mkdir() {
    debuglog2 mkdir -m 0700 -p $*
}

_rmdir() {
    debuglog2 rm -rf $*
}

_unlink() {
    debuglog2 unlink $1
}

_cp() {
    debuglog2 cp -dpf $*
}

_mv() {
    debuglog2 mv -f $*
}

# Skip blank and comment lines (Ugh!)
# Returns 0 when $1 is empty or starts with '#' (after dropping one leading
# space), 1 otherwise.
invalid() {
    line=$1
    line=${line## }
    [[ "$line" = "" || "${line#\#}" != "$line" ]] && return 0
    return 1
}

# Return distribution: prints SLES_9, SLES_8, RH or an empty line depending
# on the contents of /etc/issue.
getDistro() {
    if grep -q "SUSE LINUX Enterprise Server 9" /etc/issue 2> /dev/null; then
        echo "SLES_9"
    elif grep -q "SUSE SLES 8" /etc/issue 2> /dev/null; then
        echo "SLES_8"
    elif grep -q "Fedora\|Red Hat" /etc/issue 2> /dev/null; then
        echo "RH"
    else
        echo ""
    fi
}

# Save old log file - use last modified time as the suffix.
# Only done for a custom log file (customLog=1) that exists.
rotatelog() {
    if [[ $customLog -eq 1 && -f $LOGFILE ]]; then
        ext=$(stat -c "%y" $LOGFILE)
        ext=${ext%.*}                       # get date and time
        #ext=${ext// /.}    # replace space with . - doesn't work with pdksh
        ext=$(echo $ext | sed 's/ /./g')    # replace space with .
        mv $LOGFILE $LOGFILE.$ext
    fi
}

################################################################################
# Network functions                                                            #
################################################################################

# Get IP address from hostname: use /etc/hosts first, then the resolver.
# Prints $1 unchanged when neither source yields an address.
ipaddr() {
    ip=$(grep -w "${1}" /etc/hosts | grep -v ^# | awk '{print $1}')
    if [ -z "$ip" ]; then
        # The result must land in $ip, otherwise the DNS answer is lost
        ip=$(host -n $1 | grep 'has address' | awk '{print $4}')
    fi
    if [ -n "$ip" ]; then
        echo $ip
    else
        echo $1
    fi
}

# Get host name from IP address: /etc/hosts first, then the resolver.
ipname() {
    name=$(grep -w $1 /etc/hosts | grep -v ^# | awk '{print $2}')
    if [ -z "$name" ]; then
        name=$(host -n $1 | grep -v 'not found:')
        name=${name##* }     # Last word is the host name
        name=${name%%.}      # Strip trailing dot
    fi
    echo $name
}

# Host name for an IP address with the domain part removed.
shortipname() {
    name=$(ipname $1)
    echo $name | awk -F. '{print $1}'
}

# Get matching subnet given two IP addresses
# Usage: getsubnet ip1 ip2 netmask
# Prints the (integer) network number when both addresses are on the same
# subnet under the mask; prints nothing otherwise or when an argument is
# missing.
getsubnet() {
    ip1=$1
    ip2=$2
    mask=$3
    typeset -i i1 i2 m1
    [[ -z "$ip1" || -z "$ip2" || -z "$mask" ]] && return

    # Split the dotted quads on '.', then put IFS back so the caller's
    # word splitting is not affected.
    gs_ifs=$IFS
    IFS=.
    set -- $ip1
    i1=$((($1<<24)+($2<<16)+($3<<8)+$4))
    set -- $ip2
    i2=$((($1<<24)+($2<<16)+($3<<8)+$4))
    set -- $mask
    m1=$((($1<<24)+($2<<16)+($3<<8)+$4))
    IFS=$gs_ifs

    if [[ $((i1&m1)) == $((i2&m1)) ]]; then
        echo $((i1&m1))
    fi
}

# Get configuration file for a given IP address from IFPATH
# Note: Only the first file that matches the given IP is returned
getifcfg() {
    echo $(grep -lw "^IPADDR.*='$1'" ${IFPATH}/ifcfg-* 2> /dev/null | head -n1)
}

# Get interface name given its IP address
getifname() {
    iface=$(getifcfg $1)
    iface=${iface##*/}  # Strip path
    iface=${iface#*-}   # Strip ifcfg-
    [ -n "$iface" ] && iface=$(getcfg-interface -- $iface) # FIX: SLES only
    echo $iface
}

# Bring up interface corresponding to a given IP address
ifUp() {
    # Check if it is already configured and up
    #if [ -n "$(ifconfig | grep -wo $1)" ]; then
        iface=$(getifname $1)
        debuglog ifup $iface
    #fi

    # Send an arp to the default gateway just in case...
    gwIP=$(route -n | awk '/UG/ {print $2}')
    # -w (as in ifDown) so that e.g. 10.0.0.1 does not also select 10.0.0.10
    iface=$(mmgetifconf | grep -w $1 | awk '{print $1}')
    if [ -n "$iface" ]; then
        for gw in $gwIP; do
            arping -q -c 5 -s $1 -I $iface $gw
        done
    fi
}

# Take down the interface that currently carries the given IP address
ifDown() {
    eth=$(mmgetifconf | grep -w $1 | awk '{print $1}')
    debugmsg "Invoking ifdown on $eth for ip $1"
    if [ -n "$eth" ]; then
        # FIX: SuSE only;
        # ifdown won't work on RedHat for an interface like eth0:0
        debuglog ifdown $eth
    fi
}

# Bring up "bond" interface
# The slave devices are taken from the BONDING_SLAVE* variables currently set
# in the shell (IPfailback sources the ifcfg file before calling this).
ifBondUp() {
    iface=$1
    debuglog modprobe bonding $BONDING_MODULE_OPTS
    debuglog ifconfig $iface up

    # Get all slave interfaces from hardware descriptions
    BSINTERFACES=""
    for i in $(set | egrep "^BONDING_SLAVE") ; do
        BONDING_SLAVE=${i##*=}
        [ -z "$BONDING_SLAVE" ] && continue
        BSIFACE=$(getcfg-interface -- $BONDING_SLAVE) # FIX: SLES only
        if [ $? != 0 ] ; then
            debugmsg "Could not get an interface for slave"
            continue
        fi
        # prepare only available slave devices
        if [ -d /sys/class/net/$BSIFACE ] ; then
            BSINTERFACES="$BSINTERFACES $BSIFACE"
        else
            debugmsg "Bonding Slave $BSIFACE is not available. Skipped"
        fi
    done

    # enslave the slave ifaces only once
    if [ -n "$BSINTERFACES" ]; then
        debuglog ifenslave $iface $BSINTERFACES
    fi
}

# Check if a given IP address is an alias (virtual)
# Returns 0 when some ifcfg-* file has an IPADDR<suffix>='ip' entry
# (at least one character after IPADDR).
isVirtualIP() {
    grep -qlw "^IPADDR..*='$1'" ${IFPATH}/ifcfg-* 2> /dev/null
    return $?
}

################################################################################
# Nodes list functions                                                         #
################################################################################

# Extract GPFS IP, iface and netmask from nodes file with the format:
# GPFS_IP[:eth:mask] NFS_IP1 NFS_IP2 ...
# Field accessors for the first column of a nodes-file line
# (GPFS_IP[:eth:mask]).
getip() {
    echo $1 | awk -F: '{print $1}'
}

getiface() {
    echo $1 | awk -F: '{print $2}'
}

getnetmask() {
    echo $1 | awk -F: '{print $3}'
}

# Get all NFS IP addresses from nodes file
# The file is read on descriptor 3, not stdin.
getAllNfsIPs() {
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
        # Skip empty and comment lines
        if invalid $gpfs_if; then
            continue
        fi
        echo $nfs_list
    done
}

# Get NFS IP addresses for a given GPFS IP address from nodes file
getNfsIPs() {
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
        # Skip empty and comment lines
        if invalid $gpfs_if; then
            continue
        fi
        gpfs_ip=$(getip $gpfs_if)
        if [ "$gpfs_ip" == "$1" ]; then
            debugmsg2 "getNfsIPs: $gpfs_ip $nfs_list"
            echo $nfs_list
            break
        fi
    done
}

# Get interface for a given NFS+GPFS address
# Usage: getEth gpfs_ip nfs_ip
# Prints the interface named in the nodes file for gpfs_ip; failing that, the
# first interface reported by mmgetifconf that shares a subnet with nfs_ip;
# failing that, an empty line.
getEth() {
    gpfs_ip=$1
    nfs_ip=$2
    eth=""
    debugmsg "getEth: gpfs_ip $gpfs_ip nfs_ip $nfs_ip"

    # First try to get eth from node list
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
        # Skip empty and comment lines
        if invalid $gpfs_if; then
            continue
        fi
        if [ "$(getip $gpfs_if)" == "$gpfs_ip" ]; then
            iface=$(getiface $gpfs_if)
            if [ -n "$iface" ]; then
                debugmsg "getEth: from $NODELIST $gpfs_ip interface $iface"
                echo $iface
                return
            fi
        fi
    done

    # Now try to get eth from list of interfaces
    tmp=/tmp/mmgetifconf.$$
    mmgetifconf > $tmp
    exec 3< $tmp
    while read -u3 iface ip mask; do
        subnet=$(getsubnet $ip $nfs_ip $mask)
        if [ -n "$subnet" ]; then
            debugmsg "getEth: from mmgetifconf $nfs_ip interface $iface"
            # Remove the scratch file on the success path as well
            exec 3<&-
            rm -f $tmp
            echo $iface
            return
        fi
    done
    exec 3<&-
    rm -f $tmp

    msg "getEth: not found $gpfs_ip interface"
    echo ""
}

# Get netmask for a given NFS+GPFS IP address
# Usage: getmask gpfs_ip nfs_ip
# Prints the mask from the nodes file, else the mask of the first local
# interface whose address has the same first three octets as nfs_ip, else
# 255.255.255.0.
getmask() {
    gpfs_ip=$1
    nfs_ip=$2

    # First try to get mask from node list
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
        # Skip empty and comment lines
        if invalid $gpfs_if; then
            continue
        fi
        if [ "$(getip $gpfs_if)" == "$gpfs_ip" ]; then
            mask=$(getnetmask $gpfs_if)
            if [ -n "$mask" ]; then
                debugmsg "getmask: from $NODELIST $gpfs_ip netmask $mask"
                echo $mask
                return
            fi
        fi
    done

    # Now try from list of interfaces
    #sub_ip=$(echo $nfs_ip | cut -d . -f1,2,3)
    sub_ip=${nfs_ip%.*}
    # Literal prefix match on the address column (iface ip mask), so that
    # 10.0.1 selects 10.0.1.x but not 10.0.10.x or 110.0.1.x
    mask=$(mmgetifconf | awk -v p="${sub_ip}." 'index($2, p) == 1 { print $3; exit }')
    if [ -n "$mask" ]; then
        debugmsg "getmask: from get_ifconf $gpfs_ip netmask $mask"
        echo $mask
        return
    fi

    msg "getmask: not found $gpfs_ip netmask default"
    echo "255.255.255.0"
}

# return the next node for a given node and a start point.
# Usage: getNextNode given start
# Prints the GPFS IP that follows "given" in the nodes file, wrapping from the
# last entry to the first.  Prints nothing when "given" is not listed or when
# the successor is "start" (the whole ring has been visited).
getNextNode() {
    given=$1
    start=$2
    gnn_first=""
    gnn_next=""
    gnn_found=0

    for gnn_ip in $(getAllGPFSIPs); do
        [ -z "$gnn_first" ] && gnn_first=$gnn_ip
        if [ $gnn_found -eq 1 ]; then
            gnn_next=$gnn_ip
            break
        fi
        [ "$gnn_ip" == "$given" ] && gnn_found=1
    done

    [ $gnn_found -eq 0 ] && return      # not found
    [ -z "$gnn_next" ] && gnn_next=$gnn_first   # given was last: wrap around
    [ "$gnn_next" == "$start" ] && return
    echo $gnn_next
}

################################################################################
# GPFS functions                                                               #
################################################################################

# Get GPFS IP addresses from nodes file
getAllGPFSIPs() {
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
        # Skip empty and comment lines
        if invalid $gpfs_if; then
            continue
        fi
        debugmsg2 "getAllGPFSIPs: $gpfs_if"
        echo $(getip $gpfs_if)
    done
}

# Get current node's GPFS IP address: the first nodes-file GPFS IP that
# appears in the local mmgetifconf output.
myGPFSIP() {
    for ip in $(getAllGPFSIPs); do
        my_ip=$(mmgetifconf | grep -w $ip)
        if [ "$my_ip" != "" ]; then
            echo $ip
            break
        fi
    done
}

# Returns 0 when path $1 starts with the mount point of a gpfs entry in
# /etc/fstab, 1 otherwise.
isGpfsFS() {
    # Get list of GPFS filesystems from /etc/fstab
    fses1="$(grep -w gpfs /etc/fstab | awk '{print $2}')"
    exp1=$1
    for fs1 in $fses1; do
        debugmsg isGpfsFS: exp=$exp1 fs=$fs1
        if [ "${exp1##$fs1}" != "$exp1" ]; then
            debugmsg isGpfsFS: return 0
            return 0
        fi
    done
    debugmsg isGpfsFS: $1 return 1
    return 1
}

# Mount GPFS filesystems that are to be NFS-exported
mountExportedFS() {
    # Get list of GPFS filesystems from /etc/fstab
    fses="$(grep -w gpfs /etc/fstab | awk '{print $2}')"

    # Get list of GPFS exports from /etc/exports
    exports="$(awk '{print $1}' /etc/exports | grep ^/ | sort | uniq)"

    for exp in $exports; do
        for fs in $fses; do
            if [ "${exp##$fs}" != "$exp" ]; then
                debuglog mount $fs
            fi
        done
    done
}

# Returns 0 when the mounted GPFS filesystem with device $1 (dev=$1 in
# /etc/mtab) contains at least one path listed in /etc/exports.
isExported() {
    # Get list of GPFS filesystems from /etc/mtab
    fs="$(grep -w "gpfs .*dev=$1" /etc/mtab | awk '{print $2}')"
    debugmsg isExported: $fs

    # Get list of GPFS exports from /etc/exports
    exports="$(awk '{print $1}' /etc/exports | grep ^/ | sort | uniq)"

    for exp in $exports; do
        if [ "${exp##$fs}" != "$exp" ]; then
            debugmsg isExported: $fs return 0
            return 0
        fi
    done
    debugmsg isExported: $fs return 1
    return 1
}

# Mounts the exported GPFS filesystems, then returns 0 when some export lies
# on a mounted GPFS filesystem or is not on GPFS at all; 1 otherwise.
isAnyExported() {
    mountExportedFS
    # Get list of mounted GPFS filesystems from /etc/mtab
    fses="$(grep " gpfs .*dev=" /etc/mtab | awk '{print $2}')"
    debugmsg isAnyExported: $fses

    # Get list of GPFS exports from /etc/exports
    exports="$(awk '{print $1}' /etc/exports | grep ^/ | sort | uniq)"

    for exp in $exports; do
        for fs in $fses; do
            debugmsg isAnyExported: exp=$exp fs=$fs
            if [ "${exp##$fs}" != "$exp" ]; then
                debugmsg isAnyExported: return 0
                return 0
            fi
        done
        isGpfsFS $exp
        rc=$?
        # export if not gpfs
        if [ $rc -ne 0 ]; then
            debugmsg isAnyExported: $exp is not GPFS return 0
            return 0
        fi
    done
    debugmsg isAnyExported: return 1
    return 1
}

# Returns 0 when path $1 lies on a GPFS filesystem listed in /etc/mtab.
isMounted() {
    # Get list of GPFS filesystems from /etc/mtab
    fses="$(grep " gpfs .*dev=" /etc/mtab | awk '{print $2}')"
    debugmsg isMounted: $fses

    exp=$1
    for fs in $fses; do
        if [ "${exp##$fs}" != "$exp" ]; then
            debugmsg isMounted: return 0
            return 0
        fi
    done
    debugmsg isMounted: return 1
    return 1
}

# Returns 0 when SHARED_ROOT lies on the mounted GPFS filesystem with
# device $1.
isSharedRoot() {
    # Get filesystem from /etc/mtab
    fs=$(grep -w "gpfs .*dev=$1" /etc/mtab | awk '{print $2}')
    if [ "${SHARED_ROOT##$fs}" != "$SHARED_ROOT" ]; then
        return 0
    fi
    return 1
}

# Run mmdsh command
# Usage: mmdshcmd targets command...
mmdshcmd() {
    debugmsg "mmdsh -vL $*"
    mmdsh -vL $*
}

# Run mmdsh command and return exit code correctly
# Usage: mmdshcmdRC targets command [parms...]
# The command runs through "mmremote onbehalf2"; the return code is taken
# from the suffix of a /var/mmfs/tmp/hanfs.<rc> file found afterwards.
mmdshcmdRC() {
    debugmsg "mmdsh -vL $*"
    typeset -i rc=0
    myIP=$(myGPFSIP)
    targets=$1
    cmd=../../../../$2   # relative to /usr/lpp/mmfs/bin
    shift 2
    parms=$*
    remoteVerb=hanfs

    # FIX: Use the following from mmglobfuncs
    tmpDir=/var/mmfs/tmp/
    mmremote=/usr/lpp/mmfs/bin/mmremote
    MMMODE=LC
    NO_LINK=_NO_LINK_

    rm -f $tmpDir/$remoteVerb.*
    mmdsh -vL $targets $mmremote onbehalf2 $myIP $remoteVerb $MMMODE $NO_LINK $cmd $parms
    rcInfo=$(ls $tmpDir$remoteVerb.* 2> /dev/null | sort -rn | head -1)
    rm -f $tmpDir$remoteVerb.*
    if [ -n "$rcInfo" ]; then
        rc=${rcInfo#$tmpDir$remoteVerb\.}
    fi
    return $rc
}

# Run command on all GPFS nodes
mmdshAll() {
    gpfsIPs=$(getAllGPFSIPs)
    gpfsIPs=$(echo $gpfsIPs | sed 's/ /,/g')
    mmdshcmd $gpfsIPs $*
}

# stop another node
# Starts the remote stop in the background and returns at once.
stopNode() {
    [ -z "$1" ] && return 1
    cmd="/var/mmfs/etc/nfsmonitor -e && /var/mmfs/etc/stop.nfs"
    mmdshcmd $1 "$cmd" &
    return 0
}

# Return 0 (success) if node $1 is reported "up" by "mmfsadm dump cfgmgr";
# 1 otherwise
isNodeUp() {
    [ -z "$1" ] && return 1
    #status=$(tsstatus -m | grep -w "$1")
    #status=$(mmgetstate -k -N $1 | grep -w "active")
    #status=$(mmdshcmd $1 "/usr/lpp/mmfs/bin/mmremote mmGetState | grep -w active")
    #debugmsg "mmgetstate $1: $status"
    #[ -n "$status" ] && return 0
    mmfsadm dump cfgmgr | grep -q "$1 .* up "
    return $?
}

# Exit if GPFS is "down" on this node: when no mmfsd process is running,
# stop NFS, run "mmfsadm cleanup" and exit with status 1.
ifGPFSDownExit() {
    [ -n "$(pidof mmfsd)" ] && return 0
    stop.nfs
    mmfsadm cleanup
    exit 1
}

################################################################################
# Configuration functions                                                      #
################################################################################

# Check status of a service
# Uses checkproc when /sbin/checkproc exists, otherwise status() from
# /etc/rc.d/init.d/functions; returns 3 when neither is available.
checkStatus() {
    if [ -e /sbin/checkproc ]; then
        opts=""
        if [[ "$1" == "nfsd" || "$1" == "lockd" ]]; then
            opts="-n"
        fi
        checkproc $opts $1 > /dev/null 2>&1
        return $?
    elif [ -f /etc/rc.d/init.d/functions ]; then
        # Sourcing the init functions may change PATH; restore ours
        savedpath=$PATH
        . /etc/rc.d/init.d/functions
        PATH=$savedpath
        status $1 > /dev/null 2>&1
        return $?
    else
        return 3
    fi
}

# Get service for communicating between GPFS nodes
rshService() {
    if [ -n "$GPFS_RSHD" ]; then
        echo "$GPFS_RSHD"
    elif [ "$GPFS_rshPath" == "/usr/bin/rsh" ]; then
        echo "xinetd"
    elif [ "$GPFS_rshPath" == "/usr/bin/ssh" ]; then
        echo "sshd"
    else
        debugmsg "Unsupported service $GPFS_rshPath"
        echo ""
    fi
}

# Start rsh (or ssh) server for communication between GPFS nodes
# Returns 1 without starting anything when no service could be determined.
startRshd() {
    service=$(rshService)
    [ -z "$service" ] && return 1
    checkStatus $service
    if [ $? -ne 0 ]; then
        /etc/init.d/$service start
    fi
}

# Configure NLM ports
# Sets the lock manager port to NLM_PORT + node number and restarts the NFS
# services when the current port differs.  Returns 1 when the port could not
# be changed, 0 otherwise.
configNLMPorts() {
    # Determine which port to use for NLM from the node id
    # and ensure it is set
    typeset -i nlmport curport nodeid=0
    while [ $nodeid -eq 0 ]; do
        #nodeid=$(mmgetstate -k | awk -F: '{print $2}')
        #nodeid=$(mmlscluster | grep -w $1 | awk '{print $1}')
        nodeid=$(mmdsm dsmGetNodeNumber)
        [ $nodeid -eq 0 ] && warn "Cannot get nodeid for $1 from mmgetstate, retrying..."
    done
    nlmport=$NLM_PORT+$nodeid

    # Ensure that nfsd is loaded
    debuglog modprobe nfsd

    curport=$(rpcinfo -p 2> /dev/null | grep -m1 nlockmgr | awk '{print $4}')
    [ -z "$curport" ] && curport=$(sysctl -n fs.nfs.nlm_tcpport)
    if [ $curport -ne $nlmport ]; then
        debugmsg "Current NLM port used is $curport, should be $nlmport"
        debuglog sysctl -w fs.nfs.nlm_tcpport=$nlmport
        debuglog sysctl -w fs.nfs.nlm_udpport=$nlmport
        nfsService stop
        nfsService start

        # Check if the port got assigned correctly
        curport=$(rpcinfo -p 2> /dev/null | grep -m1 nlockmgr | awk '{print $4}')
        [ -z "$curport" ] && curport=$(sysctl -n fs.nfs.nlm_tcpport)
        if [ $curport -ne $nlmport ]; then
            err "Cannot change existing port $curport to $nlmport for HA-NFS. Terminating..."
            return 1
        fi
    fi
    return 0
}

# Get system boot time (btime from /proc/stat)
getBootTime() {
    grep -w btime /proc/stat | awk '{print $2}'
}

# Create a place to backup entries (statd/sm) different from /var/lib/nfs
# We need this because sending SM_NOTIFY messages on failover/failback wipes out
# client entries from /var/lib/nfs/sm and these do not get created again until
# next reboot
# This backup will be cleaned up on next reboot
# Usage: backupSmDir dir
backupSmDir() {
    typeset -i current_btime saved_btime
    if [ -f $1/btime ]; then
        current_btime=$(getBootTime)
        saved_btime=$(cat $1/btime)
        if [ $current_btime -gt $saved_btime ]; then
            # Erase backup smdir since a reboot has happened
            debugmsg "Erasing backup statd dirs in $1"
            _rmdir $1/sm $1/sm.bak
            echo $current_btime > $1/btime
        fi
    else
        _mkdir $1/sm $1/sm.bak
        # Save boot time so we can decide when to cleanup $1
        btime=$(getBootTime)
        debugmsg "Saving current boot time $btime in $1"
        echo $btime > $1/btime
    fi
}

# Keep the following data from /var/lib/nfs in shared space (GPFS)
# so all nodes have access to it for failover/failback purposes:
#   rmtab
#   sm
#   sm.bak
# Usage: shareSmDir my_gpfs_ip
# Returns 1 when a symlink could not be created, 0 otherwise.
shareSmDir() {
    myip=$1
    sh_rmtab=$SHARED_NFS/rmtab
    [ ! -f $sh_rmtab ] && touch $sh_rmtab && chmod 644 $sh_rmtab
    # No need to share rmtab. Its no longer used to validate NFS requests.
    # ln -sf $sh_rmtab /var/lib/nfs/rmtab

    sh_statd=$SHARED_NFS/$myip/statd
    _mkdir $sh_statd/sm $sh_statd/sm.bak

    if [ -e /var/lib/nfs/statd ]; then
        smdir=/var/lib/nfs/statd/sm
    else
        smdir=/var/lib/nfs/sm
    fi

    if [[ -d $smdir && ! -L $smdir ]]; then
        # Move stuff from local smdir to shared
        _mv $smdir/* $sh_statd/sm
        _mv ${smdir}.bak/* $sh_statd/sm.bak
        _rmdir $smdir ${smdir}.bak
    fi

    if [ ! -d $smdir ]; then
        ln -sf $sh_statd/sm $smdir
        if [ $? -ne 0 ]; then
            err "Failed to link $smdir to $sh_statd/sm"
            return 1
        fi
        ln -sf $sh_statd/sm.bak ${smdir}.bak
        if [ $? -ne 0 ]; then
            err "Failed to link ${smdir}.bak to $sh_statd/sm.bak"
            return 1
        fi
    fi

    if [ -e /var/lib/nfs/statd ]; then
        # Redhat requires rpcuser as uid/gid for statd stuff
        chown -R rpcuser.rpcuser $sh_statd
    fi
    return 0
}

# Remove the symlinks created by shareSmDir and put back empty local
# files/directories in their place
unshareSmDir() {
    if [ -e /var/lib/nfs/statd ]; then
        smdir=/var/lib/nfs/statd/sm
    else
        smdir=/var/lib/nfs/sm
    fi
    [ -L /var/lib/nfs/rmtab ] && _unlink /var/lib/nfs/rmtab && touch /var/lib/nfs/rmtab
    [ -L $smdir ] && _unlink $smdir && _mkdir $smdir
    [ -L ${smdir}.bak ] && _unlink ${smdir}.bak && _mkdir ${smdir}.bak
}

# Configure GPFS for HA-NFS - first time only
# Usage: configHA my_gpfs_ip
# Returns 1 when SHARED_ROOT is missing or not on GPFS, else the status of
# shareSmDir.
configHA() {
    myip=$1

    # Check if this is the first time we are configuring
    # FIX: check needed here?

    # Configure NLM ports
    # Note: This is now done by the startup script /etc/init.d/gpfs
    # configNLMPorts $myip

    # Check the shared directory is available and on GPFS
    if [ ! -d $SHARED_ROOT ]; then
        err "Cannot find shared directory $SHARED_ROOT"
        return 1
    fi
    df -Tl $SHARED_ROOT | grep -qw gpfs
    if [ $? -ne 0 ]; then
        err "$SHARED_ROOT found but is not on a GPFS filesystem"
        return 1
    fi
    debugmsg "Shared fs is $SHARED_ROOT"

    # Create shared data for HA-NFS (statd, rmtab) and recovery
    _mkdir $SHARED_NFS $SHARED_RECOVERY
    shareSmDir $myip
    return $?
}

################################################################################
# NFS functions                                                                #
################################################################################

# Start rpc.mountd with the options derived from /etc/sysconfig/nfs
# (MOUNTD_PORT, MOUNTD_NFS_V2, MOUNTD_NFS_V3).
startMountd() {
    savedpath=$PATH
    RPCMOUNTDOPTS=

    #Unrolling mountd part of /etc/init.d/nfs
    [ -f /etc/init.d/functions ] && . /etc/init.d/functions
    [ -f /etc/sysconfig/network ] && . /etc/sysconfig/network
    [ -f /etc/sysconfig/nfs ] && . /etc/sysconfig/nfs
    PATH=$savedpath

    [ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT"
    case $MOUNTD_NFS_V2 in
        no|NO) RPCMOUNTDOPTS="$RPCMOUNTDOPTS --no-nfs-version 2" ;;
    esac
    case $MOUNTD_NFS_V3 in
        no|NO) RPCMOUNTDOPTS="$RPCMOUNTDOPTS --no-nfs-version 3" ;;
    esac

    if [ -e /sbin/startproc ]; then
        debuglog startproc /usr/sbin/rpc.mountd $RPCMOUNTDOPTS
    else
        daemon rpc.mountd $RPCMOUNTDOPTS
    fi
}

# Control the NFS services
# Usage: nfsService start|stop|terminate|soft-restart
#   start        - start nfs and nfslock, fix the mountd port if MOUNTD_PORT
#                  is set, set the nfsd thread count, re-export
#   stop         - stop the nfs service only
#   terminate    - stop nfs and nfslock
#   soft-restart - set the nfsd thread count to 0 and back to NFSD_PROCS
nfsService() {
    nfslock=/etc/init.d/nfslock
    if [ -f /etc/init.d/nfsserver ]; then
        nfs=/etc/init.d/nfsserver
    else
        nfs=/etc/init.d/nfs
    fi

    case $1 in
    start)
        msg "Starting NFS services"
        sysctl -e -q -w fs.nfs.use_underlying_lock_ops=1
        debuglog $nfs start
        debuglog $nfslock start
        if [ -n "$MOUNTD_PORT" ]; then
            # Make sure mountd is bound to the right port if specified
            curport=$(rpcinfo -p 2> /dev/null | grep -m1 mountd | awk '{print $4}')
            if [ "$curport" != "$MOUNTD_PORT" ]; then
                pid=$(pidof rpc.mountd)
                debugmsg "Current mountd port is $curport, should be $MOUNTD_PORT. Stopping current rpc.mountd (pid $pid) and restarting with correct port."
                debuglog kill -9 $pid
                startMountd
            fi
        fi
        # Update number of nfsd processes
        debuglog rpc.nfsd $NFSD_PROCS
        # Reload exportfs anyway since starting nfs server may not do this
        # if it was already running
        exportfs -r
        ;;
    stop)
        msg "Stopping NFS services"
        debuglog $nfs stop
        ;;
    terminate)
        msg "Cleaning NFS services"
        debuglog $nfs stop
        debuglog $nfslock stop
        ;;
    soft-restart)
        debuglog rpc.nfsd 0
        debuglog sleep 1 # FIX: required?
        debuglog rpc.nfsd $NFSD_PROCS
        ;;
    *)
        echo "Usage: $0 start|stop|terminate|soft-restart"
        ;;
    esac
}

################################################################################
# NLM functions                                                                #
################################################################################

# Restart rpc.statd
# Note that we don't want to issue a "nfslock restart" directly since this would
# involve restarting lockd (which results in lock recovery) which we don't want
restartStatd() {
    if [ -f /sbin/rpc.statd ]; then
        # Kill the statd process if exists (important for registering with portmap)
        debuglog kill -9 $(/sbin/pidof -x /sbin/rpc.statd)
        # Start the statd process
        debuglog /etc/init.d/nfslock start
    fi
}

# Release all locks by sending a KILL signal to kernel lockd thread
resetLockd() {
    if [ -f /etc/init.d/nfsserver ]; then
        # SuSE
        debuglog /etc/init.d/nfslock stop
    else
        # Redhat fails to kill lockd to start grace period, so do it explicitly
        pid=$(ps -aef | grep -w "\[lockd\]" | awk '{print $2}')
        [ -n "$pid" ] && kill -9 $pid
    fi
}

# Check grace period support in kernel
# NOTE: the return value is the reverse of the usual convention:
#   1 - /proc/fs/nfsd/grace exists (dynamic grace period available)
#   0 - it does not
# Callers test for "-eq 1".
checkDynamicGrace() {
    debuglog2 mount -t nfsd nfsd /proc/fs/nfsd
    [ -f /proc/fs/nfsd/grace ] && return 1
    debugmsg2 "Cannot find /proc/fs/nfsd/grace, will restart lockd (and reclaim all locks) for failover/failback."
    return 0
}

# Start grace period
startGrace() {
    checkDynamicGrace
    if [ $? -eq 1 ]; then
        echo 1 > /proc/fs/nfsd/grace
    else
        # Kernel does not support starting grace period through /proc
        # Only thing to do is kill lockd
        msg "Dynamic enabling of grace period not supported in this kernel. Restarting lockd"
        resetLockd
    fi
}

# Send SM_NOTIFY message to client on server restart
# Usage: notify statd_dir [server IP]
notify() {
    opts=""
    if [ -e /sbin/sm-notify ]; then
        # SLES - kernel-space statd
        _cp $1/sm/* /var/lib/nfs/sm
        [ -n "$2" ] && opts="-m 1 -v $2"
        # "log ls" only runs when the level-2 message was actually written
        debugmsg2 "Notify clients: " && log ls $1/sm
        debuglog2 sm-notify $opts
    elif [ -e /sbin/rpc.statd ]; then
        # RHEL - user-space statd
        # rpc.statd(8): -P takes the state directory, -n the name/address
        [ -n "$2" ] && opts="-n $2"
        debugmsg2 "Notify clients: " && log ls $1/sm
        debuglog2 rpc.statd -N -P $1 $opts
    fi
}

# Send SM_NOTIFY message to client on all available interfaces
# SLES9 has a bug wherein the NFS client compares the hostname in the notify
# message against the hostname it registered during mount to determine whether
# to handle SM_NOTIFY requests coming from the server. We try to work around
# the problem by (optionally) sending notify messages on combinations of
# hostnames and IP address
# Usage: notifyClient host statd_dir
notifyClient() {
    debugmsg2 "notify host:$1, vip:$VIP, statddir:$2"
    #debugmsg "Notify on local name"
    #notify $2

    # If VIP is specified, always send notify on the VIP
    if [ -n "$VIP" ]; then
        debugmsg "SM_NOTIFY clients for VIP $VIP"
        notify $2 $VIP
        [ $notifyfix -eq 0 ] && return
    fi

    shortname=$(shortipname $1)
    debugmsg "Notify for host $shortname"
    notify $2 $shortname
    [ $notifyfix -eq 0 ] && return

    host=$(ipname $1)
    if [[ -n "$host" && "$host" != "$shortname" ]]; then
        debugmsg "Notify for host.domain $host"
        notify $2 $host
    fi

    ip=$(ipaddr $1)
    if [[ -n "$ip" && "$ip" != "$host" && "$ip" != "$shortname" ]]; then
        debugmsg "Notify for IP $ip"
        notify $2 $ip
    fi
}

################################################################################
# Failover functions                                                           #
################################################################################

# Notify the NFS clients of node $1 so that they reclaim their NLM locks.
# Works on a private copy of the node's statd/sm directory in /tmp/statd and
# restores the shared copy afterwards.
startReclaim() {
    gpfs_ip=$1
    smdir=${SHARED_NFS}/$gpfs_ip/statd/sm
    statedir=/tmp/statd
    msg "Reclaim of NLM locks initiated for node $gpfs_ip"
    _mkdir $statedir/sm $statedir/sm.bak
    _cp $smdir/* $statedir/sm.bak
    if [ $iptakeover -eq 1 ]; then
        # One pass per NFS address.  The substitution must stay unquoted:
        # quoted, the whole list becomes a single word and the second
        # address ends up as notifyClient's statd directory.
        for nfsip in $(getNfsIPs $gpfs_ip); do
            _cp $statedir/sm.bak/* $statedir/sm
            notifyClient $nfsip $statedir
        done
    else
        # get VIP from loopback
        ip=$(mmgetifconf | grep -w 'lo' | awk '{print $2}')
        _cp $statedir/sm.bak/* $statedir/sm
        if [[ -n "$ip" && "$ip" != "$gpfs_ip" ]]; then
            notifyClient $ip $statedir
        fi
    fi
    # restore list of lock users
    _cp $statedir/sm.bak/* $smdir
}

# Find the next entry after one with node that failed and use it as the
# takeover node. If no more line wrap around to the top. Check that the node is
# up, if not use the next entry. There can be few NFS external IP address for
# each GPFS IP.
# Does not return until a node that is up has been found (retries every 10s).
selectNode() {
    failed_node=$1
    typeset -i next=0
    gpfs_ip=""
    while true; do
        exec 3< $NODELIST
        while read -u3 gpfs_if nfs_list; do
            # Skip empty and comment lines
            if invalid $gpfs_if; then
                continue
            fi
            gpfs_ip=$(getip $gpfs_if)
            debugmsg2 "selectNode: GPFS IP: $gpfs_ip, NFS IP:$nfs_list"
            if [ $next -eq 1 ]; then
                [ "$gpfs_ip" == "$failed_node" ] && continue
                isNodeUp $gpfs_ip
                if [ $? -eq 0 ]; then
                    debugmsg "takeover node is $gpfs_ip"
                    echo $gpfs_ip
                    return
                else
                    debugmsg "selectNode: takeover_node $gpfs_ip is down"
                    continue
                fi
            else
                [ "$gpfs_ip" == "$failed_node" ] && next=1
            fi
        done

        # Didn't find a takeover node, so start from top searching for new node
        debugmsg "selectNode: start from top"
        exec 3< $NODELIST
        while read -u3 gpfs_if nfs_list; do
            # Skip empty and comment lines
            if invalid $gpfs_if; then
                continue
            fi
            gpfs_ip=$(getip $gpfs_if)
            debugmsg2 "selectNode: GPFS IP: $gpfs_ip, NFS IP: $nfs_list"
            if [ "$gpfs_ip" != "$failed_node" ]; then
                isNodeUp $gpfs_ip
                if [ $? -eq 0 ]; then
                    debugmsg "takeover node is $gpfs_ip"
                    echo $gpfs_ip
                    return
                else
                    debugmsg "selectNode: takeover_node $gpfs_ip is down"
                    continue
                fi
            fi
        done
        sleep 10
    done
}

# Variant of selectNode built on getNextNode: walk the ring of nodes after
# the failed one until a node that is up is found; sleep 10s and start over
# whenever the ring is exhausted.
selectNode2() {
    failed=$1
    candidate=$(getNextNode $failed $failed)
    while true; do
        while [ -z "$candidate" ]; do
            sleep 10
            candidate=$(getNextNode $failed $failed)
        done
        isNodeUp $candidate
        if [ $? -eq 0 ]; then
            echo $candidate
            return
        fi
        candidate=$(getNextNode $candidate $failed)
    done
}

# Take over one NFS address of a failed node.
# Usage: recoverNode failed_nfs_ip failed_gpfs_ip
# Returns 0 when the address is configured locally afterwards, 1 otherwise.
recoverNode() {
    failed_nfs_ip=$1
    failed_gpfs_ip=$2
    debugmsg "start recoverNode $failed_nfs_ip"

    [ -z "$failed_nfs_ip" ] && return 1

    if [ -z "$(ifconfig | grep -wo $failed_nfs_ip)" ]; then
        # Poll for up to 20s while "IPaddr ... monitor" still reports OK
        typeset -i numberOfRetries=20 attemptNo=0
        while [[ $attemptNo -lt $numberOfRetries && "$(IPaddr $failed_nfs_ip monitor)" == "OK" ]]; do
            attemptNo=$attemptNo+1
            sleep 1
        done

        # call stonith exit
        if [ $attemptNo -eq $numberOfRetries ]; then
            if [ -e /var/mmfs/etc/stonith ]; then
                debugmsg "call /var/mmfs/etc/stonith with $failed_gpfs_ip $failed_nfs_ip"
                /var/mmfs/etc/stonith $failed_gpfs_ip $failed_nfs_ip
                debugmsg "back from call to /var/mmfs/etc/stonith"
            fi
        fi

        typeset -i numberOfRetries=15 attemptNo=0
        while [[ $attemptNo -lt $numberOfRetries && "$(IPaddr $failed_nfs_ip monitor)" == "OK" ]]; do
            attemptNo=$attemptNo+1
            sleep 1
        done

        if [ $attemptNo -eq $numberOfRetries ]; then
            # Somebody else has failed_nfs_ip - maybe the failed node is not down?
            msg "Error: some other host already has address $failed_nfs_ip. Recovery will not happen."
            return 1
        fi
    fi

    debugmsg "recoverNode $failed_nfs_ip"
    debuglog IPaddr $failed_nfs_ip start
    eth=$(mmgetifconf | grep -w $failed_nfs_ip | awk '{print $1}')
    debugmsg "Checking if interface for ip $failed_nfs_ip is up"
    if [ -n "$eth" ]; then
        return 0
    fi
    debugmsg "No interface for ip $failed_nfs_ip is up"
    return 1
}

# Take over the NFS addresses of a failed node.
# Usage: IPtakeover my_gpfs_ip failed_gpfs_ip
# The takeover is recorded as the file $SHARED_RECOVERY/<me>/<failed>; if that
# file has disappeared by the time the addresses are acquired, the takeover is
# treated as canceled and the addresses are dropped again.
IPtakeover() {
    me=$1
    failed=$2
    typeset -i do_reclaim=0

    [ "$me" == "$failed" ] && return

    msg "Initiating IP takeover of $failed due to node failure"
    _mkdir ${SHARED_RECOVERY}/$me
    debuglog touch ${SHARED_RECOVERY}/$me/$failed
    stopNode $failed

    nfsIPs=$(getNfsIPs $failed)
    debugmsg "IPtakeover ips: $nfsIPs"
    for ip in $nfsIPs; do
        # Takeover IP and issue gratuitous ARP to the clients for the node
        # that failed so that clients can reconnect to the new address
        recoverNode $ip $failed
        [ $? -eq 0 ] && do_reclaim=1
    done

    if [ $do_reclaim -ne 0 ]; then
        # got the IP, check if we are still the node to do takeover
        if [ ! -f $SHARED_RECOVERY/$me/$failed ]; then
            # drop the interface
            nfsips=$(getNfsIPs $failed)
            debugmsg "Node $failed recovery canceled"
            for ip in $nfsips; do
                ifDown $ip
            done
            return
        fi
    else
        # did not get IP, takeover failed, remove the entry
        _unlink ${SHARED_RECOVERY}/$me/$failed
        return
    fi

    debugmsg "IPtakeover: File contents:"
    debugmsg $(ls -R ${SHARED_RECOVERY}/$me)
    _cp ${SHARED_NFS}/$failed/statd/sm/* ${SHARED_NFS}/$me/statd/sm
    checkDynamicGrace
    [ $? -eq 1 ] && startReclaim $failed
}

# Use mii-diag, mii-tool or ethtool to detect network link status
# Usage: netdiag [interface]   (defaults to $eth when no argument is given)
# Return 0 if link beat detected, 1 if invalid (no device), 2 if no link beat
# FIX: If none of the tools exist, return 2 since there is no way to detect
# link status. This means caller is responsible for handling the error
# correctly.
netdiag() {
    nd_if=${1:-$eth}

    # We trust mii-diag works for all interfaces;
    # if it exists, return its status
    tool=$(which mii-diag 2> /dev/null)
    if [ -n "$tool" ]; then
        debuglog2 $tool -s $nd_if
        return $?
    fi

    # mii-diag doesn't exist - try both mii-tool and ethtool
    tool=$(which mii-tool 2> /dev/null)
    if [ -n "$tool" ]; then
        output=$($tool $nd_if 2> /dev/null)
        if [ $? -eq 0 ]; then
            # Unquoted on purpose: the output is flattened to one line and
            # its last word is the link state
            status=$(echo $output | awk '{print $NF}')
            [ "$status" == "ok" ] && return 0
        fi
    fi

    tool=$(which ethtool 2> /dev/null)
    if [ -n "$tool" ]; then
        output=$($tool $nd_if 2> /dev/null)
        if [ $? -eq 0 ]; then
            status=$(echo $output | grep "Link detected" | awk '{print $NF}')
            # Quoted: $status is empty when no "Link detected" line exists
            [ "$status" == "yes" ] && return 0
        fi
    fi

    # We reach here either if no tool exists or if there is an error
    return 2
}

# Returns 0 if ready, 1 otherwise
# Usage: checkLinkStatus interface
# Tries netdiag up to three times, five seconds apart.
checkLinkStatus() {
    eth=$1

    # Check if the interface is down
    debuglog2 ifconfig $eth
    [ $? -ne 0 ] && return 1

    typeset -i tries=3 n=0
    while [ $n -lt $tries ]; do
        n=$n+1
        debugmsg2 "Checking link status for $eth - attempt $n"
        netdiag $eth
        if [ $? -eq 0 ]; then
            debugmsg2 "Tested the link for $eth, and it is connected"
            return 0
        else
            debugmsg2 "Tested the link for $eth, and it is NOT connected"
            sleep 5
            continue
        fi
    done
    return 1
}

# Nodes that currently have a recovery directory under SHARED_RECOVERY
getRecoveryNodes() {
    ls -A $SHARED_RECOVERY 2> /dev/null
}

# Failed nodes recorded under the recovery directory of node $1
getFailedNodes() {
    ls -A $SHARED_RECOVERY/$1 2> /dev/null
}

# Fail this node's NFS addresses back from whichever node took them over.
# Reads the global $myip (this node's GPFS IP); takes no arguments.
IPfailback() {
    # Find failover IP address
    failover_ips=""
    for recovery_node in $(getRecoveryNodes); do
        debugmsg2 "Checking if $recovery_node is recovery node"
        for failed_node in $(getFailedNodes $recovery_node); do
            debugmsg "Failed node is $failed_node"
            if [ "$failed_node" == "$myip" ]; then
                failover_ips="$failover_ips $recovery_node"
            fi
        done
    done
    if [ -z "$failover_ips" ]; then
        debugmsg "No failback is needed"
        return
    fi
    debugmsg "Failover nodes are $failover_ips"

    # if recovery node doesn't hold my external IP then no failback
    tmp=""
    nfsIPs=$(getNfsIPs $myip)
    for nfsip in $nfsIPs; do
        for failover_ip in $failover_ips; do
            tmp=$(mmdshcmd $failover_ip "/usr/lpp/mmfs/bin/mmgetifconf | grep $nfsip")
            debugmsg2 check for ip=$nfsip on $failover_ip tmp=$tmp
            [ -n "$tmp" ] && break 2
        done
    done

    if [ -z "$tmp" ]; then
        # remove my node from the recovery node list
        debugmsg "IP is not in use"
    else
        # wait for any of the NFS IP to be up, but try all IPs at least once
        let up=0
        while [ $up -eq 0 ]; do
            for ip in $nfsIPs; do
                iface=$(getifname $ip)
                # if this is a virtual address, the interface is already up
                isVirtualIP $ip
                RC=$?
                if [ $RC -ne 0 ]; then
                    debugmsg2 wait for $ip
                    ifcfg=$(getifcfg $ip)
                    debugmsg2 "Sourcing $ifcfg"
                    # "." with no file name is an error; skip when not found
                    [ -n "$ifcfg" ] && . $ifcfg
                    if [ "$BONDING_MASTER" == "yes" ]; then
                        ifBondUp $iface
                    else
                        debuglog ifconfig $iface up
                    fi
                fi
                checkLinkStatus $iface
                if [ $? -eq 0 ]; then
                    debugmsg2 "Testing the link for $iface, and it is connected"
                    if ! arping -q -c 2 -w 3 -D -I $iface $ip ; then
                        debugmsg2 $ip is ready
                        up=1
                    fi
                fi
                [ $RC -ne 0 ] && ifconfig $iface down
            done
            [ $up -eq 0 ] && sleep 5
        done
    fi

    # remove my node from the recovery node list
    debuglog rm $SHARED_RECOVERY/*/$myip > /dev/null 2>&1

    # save list of lock users
    debugmsg "Failover IP is $failover_ip"
    smdir=/tmp/statd/sm
    _mkdir $smdir ${smdir}.bak
    _cp ${SHARED_NFS}/$failover_ip/statd/sm/* ${smdir}.bak

    # Get recovery node to free my IP addresses
    mmdshAll "/var/mmfs/etc/nfsgrace $myip" >> $LOGFILE 2>&1
    checkDynamicGrace
    if [ $? -eq 1 ]; then
        mmdshcmd $failover_ip "/var/mmfs/etc/nfsnodeback $failover_ip $myip" >> $LOGFILE 2>&1
    else
        gpfsIPs=$(getAllGPFSIPs)
        for ip in $gpfsIPs; do
            debugmsg2 "ip $ip, myip $myip"
            if [ "$ip" != "$myip" ]; then
                mmdshcmd $ip "/var/mmfs/etc/nfsnodeback $ip $failover_ip $myip" >> $LOGFILE 2>&1
            fi
        done
    fi
}