#!/bin/ksh
# @(#)64	1.17.1.18  src/avs/fs/mmfs/samples/nfscluster/nfsfuncs, mmfs, avs_rgpfs24, rgpfs24s011a 3/7/07 20:57:04
#

# Load the site-specific HA-NFS settings. Without them nothing below can be
# configured, so report the problem and stop (note: with exit status 0).
[ -f /var/mmfs/etc/nfsdefs ] || {
    echo "$0: Can't find NFS defines(nfsdefs) in /var/mmfs/etc"
    exit 0
}

. /var/mmfs/etc/nfsdefs

# Extend the command search path, in this order: standard system programs,
# the GPFS (mmfs) programs, and the heartbeat/HA resource scripts.
PATH=$PATH:/sbin:/usr/sbin:/usr/bin:/bin:/usr/lpp/mmfs/bin:/var/mmfs/etc:/etc/ha.d/resource.d:/usr/lib/heartbeat
export PATH

# Shared storage for HA-NFS
# The directory structure is as follows:
# shared/.ha
#     nfs (mirrors /var/lib/nfs)
#         rmtab
#         node1
#	      statd
#	          sm
#	          sm.bak
#	  ...
#     recovery
#	  node1
#	      nodeX
#	  ...
#
# Local storage for node1
# /var/lib/nfs
#	rmtab  -> shared/.ha/nfs/rmtab
# (RHEL):
#	statd  -> shared/.ha/nfs/node1/statd
# (SLES):
#	sm     -> shared/.ha/nfs/node1/statd/sm
#	sm.bak -> shared/.ha/nfs/node1/statd/sm.bak
#
SHARED_HA="$SHARED_ROOT/.ha"		# root of the shared HA-NFS state
SHARED_NFS="$SHARED_HA/nfs"		# mirrors /var/lib/nfs
SHARED_RECOVERY="$SHARED_HA/recovery"	# takeover bookkeeping

# For lock failover to work correctly, the NFS lock manager (NLM) on every
# GPFS node must listen on a different port, so that clients are forced to
# re-establish their socket connection with the takeover node for reclaims.
# NLM_PORT is the base port number (default 10000 when unset or empty);
# for node i, nlmport = NLM_PORT + i.
: ${NLM_PORT:=10000}

# Distribution dependency (RHEL vs SLES): directory holding the ifcfg-* files.
# /etc/sysconfig/network is used when it exists as a directory; otherwise
# fall back to /etc/sysconfig/network-scripts.
IFPATH="/etc/sysconfig/network-scripts"
[ -d /etc/sysconfig/network ] && IFPATH="/etc/sysconfig/network"

# Feature flags and tunables; the defaults below may be overridden by the
# variables NODELIST, DEBUG, NOTIFYFIX, MONITOR, LOGFILE and NFSD_PROCS.
# NOTE: the trailing comment ends the typeset command, so only iptakeover
# gets the integer attribute; the lines that follow are plain assignments.
typeset -i	iptakeover=0 	# does GPFS perform IP failover?
		debug=0 	# debug level for messages to be logged?
		notifyfix=0	# SM_NOTIFY fix required for SLES?
		monitor=1	# monitoring of daemons required?
		customLog=1	# Log file specified
		hardMount=1	# Clients use "hard" NFS mounts

# NODELIST must be quoted: unquoted, an unset or empty value collapses the
# test to "[ -f ]", which is true and would wrongly enable IP takeover.
[ -f "$NODELIST" ]   && iptakeover=1
[ -n "$DEBUG" ]      && debug=$DEBUG
[ -n "$NOTIFYFIX" ]  && notifyfix=1
[ -n "$MONITOR" ]    && monitor=$MONITOR
# No LOGFILE given: use the default log and remember that it is not custom
# (rotatelog only rotates custom logs).
[ -z "$LOGFILE" ]    && LOGFILE=/var/mmfs/gen/mmfslog && customLog=0
[ -z "$NFSD_PROCS" ] && NFSD_PROCS=32

# Full path of the remote shell used between GPFS nodes: $GPFS_RSH when it is
# set and non-empty, plain "rsh" otherwise. Exported for child processes.
GPFS_rshPath=$(which ${GPFS_RSH:-rsh})
export GPFS_rshPath

################################################################################
#				Utility functions			       #
################################################################################

die() {
    # Print all arguments as one message on stdout, then terminate the
    # shell with exit status 1.
    echo "$*"; exit 1
}

# Usage: _log level command [args...]
# Run the command. When the global debug level is at least "level", a
# timestamped copy of the command line and everything the command prints
# (stdout and stderr) are appended to $LOGFILE; otherwise the command's
# output is discarded. The command's exit status is returned either way.
_log() {
    let level=$1
    shift
    if [ $debug -ge $level ]; then
        {
            echo "$(date): $*"
            $*
        } >> $LOGFILE 2>&1
    else
        $* > /dev/null 2>&1
    fi
}

# Usage: log command [args...]
# Run a command through _log at level 0 (its command line and output go to
# $LOGFILE whenever debug >= 0). Returns the command's exit status.
# The unquoted $* re-splits the arguments on IFS before they reach _log.
log() {
    _log 0 $*
}

# Usage: debuglog command [args...]
# Run a command through _log at level 1: logged to $LOGFILE when debug >= 1,
# output discarded otherwise. Returns the command's exit status.
debuglog() {
    _log 1 $*
}

# Usage: debuglog2 command [args...]
# Run a command through _log at level 2: logged to $LOGFILE when debug >= 2,
# output discarded otherwise. Returns the command's exit status.
debuglog2() {
    _log 2 $*
}

# Usage: _msg level text...
# Append a timestamped message to $LOGFILE when the global debug level is at
# least "level". Returns non-zero when the message is suppressed; some
# callers chain on that status with &&.
_msg() {
    level=$1
    shift
    [ $debug -ge $level ] || return
    echo "$(date): $*" >> $LOGFILE 2>&1
}

# Usage: msg text...
# Record a message twice: in $LOGFILE via _msg at level 0, and in syslog via
# logger with the tag HA-NFS (the syslog copy does not depend on the debug
# level).
msg() {
    _msg 0 $*
    logger -t HA-NFS "$*"
}

# Usage: debugmsg text...
# Append a message to $LOGFILE only when debug >= 1 (see _msg).
# Returns non-zero when the message is suppressed.
debugmsg() {
    _msg 1 $*
}

# Usage: debugmsg2 text...
# Append a message to $LOGFILE only when debug >= 2 (see _msg).
# Returns non-zero when the message is suppressed.
debugmsg2() {
    _msg 2 $*
}

# Usage: err text...
# Report an error through msg (log file and syslog), prefixed with "Error: ".
err() {
    msg "Error: $*"
}

# Usage: warn text...
# Report a warning through msg (log file and syslog), prefixed with
# "Warning: ".
warn() {
    msg "Warning: $*"
}

# Usage: _mkdir dir...
# Create the directories (with missing parents, mode 0700). Runs through
# debuglog2, so the command and its output reach $LOGFILE only when
# debug >= 2 and are discarded otherwise.
_mkdir() {
    debuglog2 mkdir -m 0700 -p $*
}

# Usage: _rmdir path...
# Recursively and forcibly remove the given paths (rm -rf, not rmdir).
# Runs through debuglog2 (logged only when debug >= 2).
_rmdir() {
    debuglog2 rm -rf $*
}

# Usage: _unlink path
# Remove a single file or symlink with unlink; only the first argument is
# used. Runs through debuglog2 (logged only when debug >= 2).
_unlink() {
    debuglog2 unlink $1
}

# Usage: _cp source... dest
# Copy with "cp -dpf" (-d, -p and -f as documented by cp). Runs through
# debuglog2, so errors such as an unmatched glob are silent unless
# debug >= 2.
_cp() {
    debuglog2 cp -dpf $*
}

# Usage: _mv source... dest
# Move with "mv -f". Runs through debuglog2, so errors are silent unless
# debug >= 2.
_mv() {
    debuglog2 mv -f $*
}

# Usage: invalid word
# Returns 0 when the word should be skipped, i.e. it is empty or starts with
# '#' (after dropping at most one leading blank); returns 1 otherwise.
# Leaves the examined text in the global variable "line".
invalid() {
    line=${1# }
    case "$line" in
        ''|'#'*) return 0 ;;
    esac
    return 1
}

# Usage: getDistro [issue_file]
# Identify the Linux distribution from the login banner (issue_file defaults
# to /etc/issue). Prints one of:
#   SLES_9  banner contains "SUSE LINUX Enterprise Server 9"
#   SLES_8  banner contains "SUSE SLES 8"
#   RH      banner contains "Fedora" or "Red Hat"
# and an empty line when nothing matches.
# (The two SLES labels used to be swapped relative to the banner text.)
getDistro() {
    # Reuse the positional parameter so no extra global variable is created.
    set -- "${1:-/etc/issue}"
    if grep -q "SUSE LINUX Enterprise Server 9" "$1"; then
        echo "SLES_9"
    elif grep -q "SUSE SLES 8" "$1"; then
        echo "SLES_8"
    elif grep -q -e "Fedora" -e "Red Hat" "$1"; then
        echo "RH"
    else
        echo ""
    fi
}

# Set the old log file aside, renaming it to $LOGFILE.<date>.<time> where the
# suffix is the file's last-modification time. Only a custom log file
# (customLog=1) that exists is rotated.
rotatelog() {
    [[ $customLog -eq 1 && -f $LOGFILE ]] || return 0
    ext=$(stat -c "%y" $LOGFILE)
    ext=${ext%.*}			# keep date and time only
    # ${ext// /.} would be shorter but does not work with pdksh
    ext=$(echo $ext | tr ' ' '.')	# replace space with .
    mv $LOGFILE $LOGFILE.$ext
}

################################################################################
#				Network functions			       #
################################################################################

# Usage: ipaddr hostname
# Print the IP address for a host name. /etc/hosts is consulted first (first
# column of every non-comment line containing the name as a word); if that
# yields nothing the "host" resolver is asked. When neither source knows the
# name, the argument itself is printed unchanged.
ipaddr() {
    ip=$(grep -w "${1}" /etc/hosts | grep -v ^# | awk '{print $1}')
    if [ -z "$ip" ]; then
	# The resolver answer must land in "ip"; it used to be stored in an
	# unrelated variable, which silently disabled this fallback.
	ip=$(host -n $1 | grep 'has address' | awk '{print $4}')
    fi
    if [ -n "$ip" ]; then
	echo $ip
    else
	echo $1
    fi
}

# Usage: ipname ip
# Print the host name for an IP address: the second column of the matching
# non-comment /etc/hosts line(s), or else the last word of the "host" reply
# with its trailing dot removed. Prints an empty line when nothing is found.
ipname() {
    name=$(grep -w $1 /etc/hosts | awk '!/^#/ {print $2}')
    if [ -n "$name" ]; then
	echo $name
	return
    fi
    name=$(host -n $1 | grep -v 'not found:')
    name=${name##* }	# last word is the host name
    name=${name%%.}	# strip trailing dot
    echo $name
}

# Usage: shortipname ip
# Print the unqualified host name for an IP address: whatever ipname reports,
# cut at the first dot.
shortipname() {
    name=$(ipname $1)
    echo ${name%%.*}
}

# Usage: getsubnet ip1 ip2 netmask
# All three arguments are dotted quads. When ip1 and ip2 fall into the same
# subnet under the netmask, print that subnet as a plain integer
# ((a<<24)+(b<<16)+(c<<8)+d); otherwise print nothing.
#
# The octets are split with tr rather than by assigning IFS=., which used to
# stay in effect for the calling shell after the function returned. "set --"
# also prevents an empty argument from turning into a bare "set" (which
# lists all variables).
getsubnet() {
    ip1=$1
    ip2=$2
    mask=$3

    typeset -i i1 i2 m1

    set -- $(echo $ip1 | tr '.' ' ')
    i1=$((($1<<24)+($2<<16)+($3<<8)+$4)) # comment to fix hilit

    set -- $(echo $ip2 | tr '.' ' ')
    i2=$((($1<<24)+($2<<16)+($3<<8)+$4)) # comment to fix hilit

    set -- $(echo $mask | tr '.' ' ')
    m1=$((($1<<24)+($2<<16)+($3<<8)+$4)) # comment to fix hilit

    if [[ $((i1&m1)) == $((i2&m1)) ]]; then
	echo $((i1&m1))
    fi
}

# Usage: getifcfg ip
# Print the path of the interface configuration file under $IFPATH that
# assigns the given IP address (a line of the form IPADDR...='ip').
# Only the first matching file is reported; an empty line is printed when
# there is none.
getifcfg() {
    echo $(grep -lw "^IPADDR.*='$1'" ${IFPATH}/ifcfg-* 2> /dev/null | sed -n '1p')
}

# Usage: getifname ip
# Print the interface name for an IP address: take the configuration file
# found by getifcfg, drop its directory and the leading "ifcfg-", and let
# getcfg-interface translate the remainder (FIX: SLES only).
# Prints an empty line when no configuration file matches.
getifname() {
    iface=$(getifcfg $1)
    iface=${iface##*/}	# strip path
    iface=${iface#*-}	# strip ifcfg-
    if [ -n "$iface" ]; then
	iface=$(getcfg-interface -- $iface)
    fi
    echo $iface
}

# Usage: ifUp ip
# Bring up the interface configured for the given IP address, then send ARP
# requests from that address to every default gateway so that neighbours
# refresh their ARP caches.
ifUp() {
    # ifup is issued unconditionally; an "already configured and up" check
    # based on ifconfig used to be here and is intentionally disabled.
    iface=$(getifname $1)
    debuglog ifup $iface

    # Send an arp to the default gateway just in case...
    gwIP=$(route -n | awk '/UG/ {print $2}')
    # Match the address as a whole word (as ifDown and recoverNode do); a
    # plain substring match also picks up e.g. 10.0.0.10 for 10.0.0.1 and
    # hands arping several interface names.
    iface=$(mmgetifconf | grep -w $1 | awk '{print $1}')
    if [ -n "$iface" ]; then
        for ip in $gwIP; do
            arping -q -c 5 -s $1 -I $iface $ip
        done
    fi
}

# Usage: ifDown ip
# Take down the interface that currently carries the given IP address
# according to mmgetifconf. Does nothing when no interface has it.
ifDown() {
    eth=$(mmgetifconf | grep -w $1 | awk '{print $1}')
    debugmsg "Invoking ifdown on $eth for ip $1"

    [ -z "$eth" ] && return
    # FIX: SuSE only;
    # ifdown won't work on RedHat for an interface like eth0:0
    debuglog ifdown $eth
}

# Usage: ifBondUp bond_iface
# Bring up a "bond" interface: load the bonding module (with
# $BONDING_MODULE_OPTS), activate the bond device, then enslave every
# available slave named by a BONDING_SLAVE* shell variable.
ifBondUp() {
    iface=$1
    debuglog modprobe bonding $BONDING_MODULE_OPTS
    debuglog ifconfig $iface up

    # Collect the slave interfaces from the hardware descriptions
    BSINTERFACES=""
    for i in $(set | egrep "^BONDING_SLAVE") ; do
	BONDING_SLAVE=${i##*=}
	[ -z "$BONDING_SLAVE" ] && continue
	# FIX: SLES only
	BSIFACE=$(getcfg-interface -- $BONDING_SLAVE) || {
	    debugmsg "Could not get an interface for slave"
	    continue
	}
	# Only slave devices that are actually present are used
	if [ ! -d /sys/class/net/$BSIFACE ] ; then
	    debugmsg "Bonding Slave $BSIFACE is not available. Skipped"
	    continue
	fi
	BSINTERFACES="$BSINTERFACES $BSIFACE"
    done
    # Enslave all collected interfaces with a single call
    [ -z "$BSINTERFACES" ] || debuglog ifenslave $iface $BSINTERFACES
}

# Usage: isVirtualIP ip
# Returns 0 when the IP address is configured as an alias (virtual) address,
# i.e. some $IFPATH/ifcfg-* file assigns it through a suffixed variable such
# as IPADDR_1='ip'; returns grep's non-zero status otherwise.
isVirtualIP() {
    grep -qlw "^IPADDR..*='$1'" ${IFPATH}/ifcfg-* 2> /dev/null
}

################################################################################
#			Nodes list functions				       #
################################################################################

# The first column of the nodes file has the form GPFS_IP[:eth:mask], and a
# whole line reads:
#     GPFS_IP[:eth:mask] NFS_IP1 NFS_IP2 ...
# Usage: getip GPFS_IP[:eth:mask]
# Print the GPFS IP part (first colon-separated field).
getip() {
    echo $1 | awk 'BEGIN { FS = ":" } { print $1 }'
}

# Usage: getiface GPFS_IP[:eth:mask]
# Print the interface part (second colon-separated field); prints an empty
# line when the field is absent.
getiface() {
    echo $1 | awk 'BEGIN { FS = ":" } { print $2 }'
}

# Usage: getnetmask GPFS_IP[:eth:mask]
# Print the netmask part (third colon-separated field); prints an empty line
# when the field is absent.
getnetmask() {
    echo $1 | awk 'BEGIN { FS = ":" } { print $3 }'
}

# Print the NFS IP address list of every node in $NODELIST, one line per
# node (everything after the first column). Blank and comment lines are
# skipped. The file is read through file descriptor 3.
getAllNfsIPs() {
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
	invalid $gpfs_if && continue
	echo $nfs_list
    done
}

# Usage: getNfsIPs gpfs_ip
# Print the NFS IP addresses listed in $NODELIST for the given GPFS IP
# address (first matching line only); prints nothing when the address is not
# listed. The file is read through file descriptor 3.
getNfsIPs() {
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
	# Skip empty and comment lines
	invalid $gpfs_if && continue
	gpfs_ip=$(getip $gpfs_if)
	[ "$gpfs_ip" == "$1" ] || continue
	debugmsg2 "getNfsIPs: $gpfs_ip $nfs_list"
	echo $nfs_list
	break
    done
}

# Usage: getEth gpfs_ip nfs_ip
# Print the network interface to use for an NFS address.
#   1. If the $NODELIST entry for gpfs_ip names an interface
#      (GPFS_IP:eth:mask), that interface is printed.
#   2. Otherwise the first interface reported by mmgetifconf whose address is
#      in the same subnet as nfs_ip (see getsubnet) is printed.
#   3. If neither succeeds, a message is logged and an empty line is printed.
getEth() {
    gpfs_ip=$1
    nfs_ip=$2
    eth=""

    debugmsg "getEth: gpfs_ip $gpfs_ip nfs_ip $nfs_ip"

    # First try to get eth from node list
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
        # Skip empty and comment lines
        if invalid $gpfs_if; then
            continue
        fi
	if [ "$(getip $gpfs_if)" == "$gpfs_ip" ]; then
	    iface=$(getiface $gpfs_if)
	    if [ -n "$iface" ]; then
		debugmsg "getEth: from $NODELIST $gpfs_ip interface $iface"
		echo $iface
		return
	    fi
	fi
    done

    # Now try to get eth from list of interfaces.
    # mktemp avoids a predictable file name in /tmp; the old PID-based name
    # is kept only as a fallback in case mktemp is unavailable.
    tmp=$(mktemp /tmp/mmgetifconf.XXXXXX 2> /dev/null) || tmp=/tmp/mmgetifconf.$$
    mmgetifconf > $tmp
    exec 3< $tmp
    # The data stays readable through fd 3 after the name is gone, so remove
    # the file right away: no return path below can leak it any more (the
    # early return on a match used to leave it behind).
    rm -f $tmp
    while read -u3 iface ip mask; do
	subnet=$(getsubnet $ip $nfs_ip $mask)
	if [ -n "$subnet" ]; then
	    debugmsg "getEth: from mmgetifconf $nfs_ip interface $iface"
	    echo $iface
	    return
	fi
    done

    msg "getEth: not found $gpfs_ip interface"
    echo ""
}

# Usage: getmask gpfs_ip nfs_ip
# Print the netmask to use for an NFS address, trying in order:
#   1. the mask given in the $NODELIST entry for gpfs_ip (GPFS_IP:eth:mask);
#   2. the mask of the first mmgetifconf line containing the first three
#      octets of nfs_ip;
#   3. the default 255.255.255.0 (a message is logged).
getmask() {
    gpfs_ip=$1
    nfs_ip=$2

    # First try to get mask from node list
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
	# Skip empty and comment lines
	invalid $gpfs_if && continue
	[ "$(getip $gpfs_if)" == "$gpfs_ip" ] || continue
	mask=$(getnetmask $gpfs_if)
	[ -n "$mask" ] || continue
	debugmsg "getmask: from $NODELIST $gpfs_ip netmask $mask"
	echo $mask
	return
    done

    # Now try from list of interfaces; drop the last octet of the NFS address
    # (same effect as: echo $nfs_ip | cut -d . -f1,2,3)
    sub_ip=${nfs_ip%.*}
    mask=$(mmgetifconf | grep -m1 $sub_ip | awk '{print $3}')
    if [ -z "$mask" ]; then
	msg "getmask: not found $gpfs_ip netmask default"
	echo "255.255.255.0"
	return
    fi
    debugmsg "getmask: from get_ifconf $gpfs_ip netmask $mask"
    echo $mask
}

# Usage: getNextNode given start
# Print the GPFS IP address that follows "given" in the node list
# (getAllGPFSIPs order), wrapping from the last entry to the first.
# Prints nothing when "given" is not in the list, or when the successor is
# "start" (i.e. the search has gone once around the ring).
# Side effect: the global array gpfsIPs is (re)filled with all GPFS IPs.
getNextNode() {
	given=$1
	start=$2
	typeset -i next n i=0
	set -A gpfsIPs $(getAllGPFSIPs)
	n=${#gpfsIPs[@]}

	# "$i", not a bare "i": the bare name only works where test evaluates
	# its operands as arithmetic expressions, and the other tests in this
	# function already use "$i".
	while [ $i -lt $n ]; do
		[ "${gpfsIPs[$i]}" == "$given" ] && break
		i=$i+1		# arithmetic thanks to typeset -i
	done
	[ $i -eq $n ] && return	# not found
	next=$i+1
	[ $next -eq $n ] && next=0	# wrap around
	[ "${gpfsIPs[$next]}" == "$start" ] && return
	echo ${gpfsIPs[$next]}
}


################################################################################
#				GPFS functions				       #
################################################################################

# Print the GPFS IP address (first column, without any :eth:mask suffix) of
# every node in $NODELIST, one per line. Blank and comment lines are
# skipped. The file is read through file descriptor 3.
getAllGPFSIPs() {
    exec 3< $NODELIST
    while read -u3 gpfs_if nfs_list; do
	invalid $gpfs_if && continue
	debugmsg2 "getAllGPFSIPs: $gpfs_if"
	echo $(getip $gpfs_if)
    done
}

# Print this node's GPFS IP address: the first address from the node list
# that also shows up (as a whole word) in the local mmgetifconf output.
# Prints nothing when none of them is configured locally.
myGPFSIP() {
    for ip in $(getAllGPFSIPs); do
	my_ip=$(mmgetifconf | grep -w $ip)
	[ -z "$my_ip" ] && continue
	echo $ip
	break
    done
}

# Usage: isGpfsFS path
# Returns 0 when the path starts with the mount point of a GPFS filesystem
# listed in /etc/fstab, 1 otherwise. The comparison is a plain prefix match.
isGpfsFS() {
    # Mount points of all GPFS filesystems in /etc/fstab
    fses1="$(grep -w gpfs /etc/fstab | awk '{print $2}')"
    exp1=$1
    for fs1 in $fses1; do
	debugmsg isGpfsFS: exp=$exp1 fs=$fs1
	# Stripping the mount point leaves the path unchanged: no prefix match
	[ "${exp1##$fs1}" = "$exp1" ] && continue
	debugmsg isGpfsFS: return 0
	return 0
    done
    debugmsg isGpfsFS: $1 return 1
    return 1
}

# Mount every GPFS filesystem (from /etc/fstab) that contains at least one
# path exported in /etc/exports. mount is invoked once per matching
# export/filesystem pair, through debuglog.
mountExportedFS() {
    # GPFS mount points from /etc/fstab
    fses="$(grep -w gpfs /etc/fstab | awk '{print $2}')"
    # Exported absolute paths from /etc/exports, without duplicates
    exports="$(awk '{print $1}' /etc/exports | grep ^/ | sort -u)"
    for exp in $exports; do
	for fs in $fses; do
	    # Prefix match: the export lives on (or below) this mount point
	    if [ "${exp##$fs}" != "$exp" ]; then
		debuglog mount $fs
	    fi
	done
    done
}

# Usage: isExported device
# Returns 0 when /etc/exports contains a path starting with the mount point
# that /etc/mtab lists for the GPFS device (matched on "dev=<device>"),
# 1 otherwise.
isExported() {
    # Mount point of the GPFS device according to /etc/mtab
    fs="$(grep -w "gpfs .*dev=$1" /etc/mtab | awk '{print $2}')"
    debugmsg isExported: $fs
    # Exported absolute paths from /etc/exports, without duplicates
    exports="$(awk '{print $1}' /etc/exports | grep ^/ | sort -u)"
    for exp in $exports; do
	[ "${exp##$fs}" = "$exp" ] && continue
	debugmsg isExported: $fs return 0
	return 0
    done
    debugmsg isExported: $fs return 1
    return 1
}

# Returns 0 when there is something to export: either an /etc/exports path
# that lies on a currently mounted GPFS filesystem, or an export that is not
# on GPFS at all (per isGpfsFS). Returns 1 otherwise.
# The exported GPFS filesystems are mounted first (mountExportedFS).
isAnyExported() {
    mountExportedFS
    # Mount points of the GPFS filesystems currently in /etc/mtab
    fses="$(grep " gpfs .*dev=" /etc/mtab | awk '{print $2}')"
    debugmsg isAnyExported: $fses
    # Exported absolute paths from /etc/exports, without duplicates
    exports="$(awk '{print $1}' /etc/exports | grep ^/ | sort -u)"
    for exp in $exports; do
	for fs in $fses; do
	    debugmsg isAnyExported: exp=$exp fs=$fs
	    [ "${exp##$fs}" = "$exp" ] && continue
	    debugmsg isAnyExported: return 0
	    return 0
	done
	isGpfsFS $exp
	rc=$?
	# export if not gpfs
	if [ $rc -ne 0 ]; then
	    debugmsg isAnyExported: $exp is not GPFS return 0
	    return 0
	fi
    done
    debugmsg isAnyExported: return 1
    return 1
}

# Usage: isMounted path
# Returns 0 when the path starts with the mount point of a GPFS filesystem
# currently listed in /etc/mtab, 1 otherwise.
isMounted() {
    # Mount points of the GPFS filesystems currently in /etc/mtab
    fses="$(grep " gpfs .*dev=" /etc/mtab | awk '{print $2}')"
    debugmsg isMounted: $fses
    exp=$1
    for fs in $fses; do
	[ "${exp##$fs}" = "$exp" ] && continue
	debugmsg isMounted: return 0
	return 0
    done
    debugmsg isMounted: return 1
    return 1
}

# Usage: isSharedRoot device
# Returns 0 when $SHARED_ROOT starts with the mount point that /etc/mtab
# lists for the GPFS device (matched on "dev=<device>"), 1 otherwise.
isSharedRoot() {
    fs=$(grep -w "gpfs .*dev=$1" /etc/mtab | awk '{print $2}')
    [ "${SHARED_ROOT##$fs}" != "$SHARED_ROOT" ] && return 0
    return 1
}

# Run mmdsh command
# Usage: mmdshcmd args...
# Logs the command line at debug level 1, then runs "mmdsh -vL" with the
# given arguments. The unquoted $* re-splits the arguments on IFS, so a
# quoted multi-word argument reaches mmdsh as separate words.
# Returns mmdsh's exit status.
mmdshcmd() {
    debugmsg "mmdsh -vL $*"
    mmdsh -vL $*
}

# Run mmdsh command and return exit code correctly
# Usage: mmdshcmdRC targets cmd [parms...]
# Runs cmd on the target nodes through "mmremote onbehalf2" and returns the
# highest return code reported back, or 0 when nothing was reported.
# Return codes are picked up from marker files named
# /var/mmfs/tmp/hanfs.<rc>.
# NOTE(review): presumably those files are written on this node by
# mmremote onbehalf2 -- confirm.
mmdshcmdRC() {
    debugmsg "mmdsh -vL $*"

    typeset -i rc=0
    myIP=$(myGPFSIP)
    targets=$1
    cmd=../../../../$2  	# relative to /usr/lpp/mmfs/bin
    shift 2
    parms=$*

    remoteVerb=hanfs
    # FIX: Use the following from mmglobfuncs
    tmpDir=/var/mmfs/tmp/
    mmremote=/usr/lpp/mmfs/bin/mmremote
    MMMODE=LC
    NO_LINK=_NO_LINK_

    # Start from a clean slate so stale markers cannot be mistaken for results
    rm -f $tmpDir$remoteVerb.*
    mmdsh -vL $targets $mmremote onbehalf2 $myIP $remoteVerb $MMMODE $NO_LINK $cmd $parms
    # Reduce every marker to its numeric suffix BEFORE sorting. Sorting the
    # full path names numerically gives every line the key 0, so the order
    # was decided by plain text comparison and e.g. rc 9 won over rc 10.
    rcInfo=$(ls $tmpDir$remoteVerb.* 2> /dev/null | sed 's/.*\.//' | sort -rn | head -1)
    rm -f $tmpDir$remoteVerb.*
    if [ -n "$rcInfo" ]; then
        rc=$rcInfo
    fi
    return $rc
}

# Usage: mmdshAll command [args...]
# Run a command on all GPFS nodes: the node list from getAllGPFSIPs is turned
# into a comma-separated target list and passed to mmdshcmd.
# Leaves that list in the global variable gpfsIPs.
mmdshAll() {
    gpfsIPs=$(echo $(getAllGPFSIPs) | sed 's/ /,/g')
    mmdshcmd $gpfsIPs $*
}

# Usage: stopNode node
# Ask another node to stop its HA-NFS services: runs
# "nfsmonitor -e && stop.nfs" there through mmdshcmd, in the background
# (the result is not waited for). Returns 1 when no node is given, else 0.
stopNode() {
    if [ -z "$1" ]; then
	return 1
    fi
    cmd="/var/mmfs/etc/nfsmonitor -e && /var/mmfs/etc/stop.nfs"
    mmdshcmd $1 "$cmd" &
    return 0
}

# Usage: isNodeUp node
# Returns 0 (success) when "mmfsadm dump cfgmgr" has a line that shows the
# given node followed by " up "; non-zero otherwise, including when no node
# is given.
# Earlier approaches (tsstatus -m, mmgetstate -k, mmremote mmGetState) were
# tried and are no longer used.
isNodeUp() {
    if [ -z "$1" ]; then
	return 1
    fi
    mmfsadm dump cfgmgr | grep -q "$1 .* up "
}

# Exit if GPFS is down on this node.
# Returns 0 while an mmfsd process is running. Otherwise NFS is stopped
# (stop.nfs), "mmfsadm cleanup" is run and the shell exits with status 1.
ifGPFSDownExit() {
    if [ -z "$(pidof mmfsd)" ]; then
	stop.nfs
	mmfsadm cleanup
	exit 1
    fi
    return 0
}

################################################################################
#			Configuration functions				       #
################################################################################

# Usage: checkStatus service
# Check whether a service is running and return the status reported by the
# distribution's tool:
#   - /sbin/checkproc when present (with -n for nfsd and lockd);
#   - else the "status" function from /etc/rc.d/init.d/functions;
#   - else 3, because the state cannot be determined.
checkStatus() {
    if [ -e /sbin/checkproc ]; then
	opts=""
	case "$1" in
	    nfsd|lockd) opts="-n" ;;
	esac
	checkproc $opts $1 > /dev/null 2>&1
	return $?
    fi
    if [ -f /etc/rc.d/init.d/functions ]; then
	# Sourcing the init functions may change PATH; put ours back
	savedpath=$PATH
	. /etc/rc.d/init.d/functions
	PATH=$savedpath
	status $1 > /dev/null 2>&1
	return $?
    fi
    return 3
}

# Print the name of the init service behind the remote shell used between
# GPFS nodes: $GPFS_RSHD when set, otherwise "xinetd" for /usr/bin/rsh and
# "sshd" for /usr/bin/ssh (judged by $GPFS_rshPath). Any other remote shell
# is unsupported: a debug message is logged and an empty line is printed.
rshService() {
    if [ -n "$GPFS_RSHD" ]; then
	echo "$GPFS_RSHD"
	return
    fi
    case "$GPFS_rshPath" in
	/usr/bin/rsh)
	    echo "xinetd" ;;
	/usr/bin/ssh)
	    echo "sshd" ;;
	*)
	    debugmsg "Unsupported service $GPFS_rshPath"
	    echo "" ;;
    esac
}

# Start the rsh (or ssh) server used for communication between GPFS nodes,
# unless checkStatus reports that the service is already running.
startRshd() {
    service=$(rshService)
    checkStatus $service || /etc/init.d/$service start
}

# Configure NLM ports
# Usage: configNLMPorts node     (the argument is only used in messages)
# Makes the NFS lock manager listen on port NLM_PORT + <GPFS node number>.
# If the current port differs, the sysctls fs.nfs.nlm_tcpport and
# fs.nfs.nlm_udpport are set and the NFS services are restarted.
# Returns 0 when the port is (now) correct, 1 when it could not be changed.
configNLMPorts() {
    # Determine which port to use for NLM from the node id
    # and ensure it is set
    typeset -i nlmport curport nodeid=0
    while [ $nodeid -eq 0 ]; do
        # Alternatives tried earlier: mmgetstate -k, mmlscluster
        nodeid=$(mmdsm dsmGetNodeNumber)
        if [ $nodeid -eq 0 ]; then
            warn "Cannot get nodeid for $1 from mmgetstate, retrying..."
            # Pause between attempts instead of spinning and flooding syslog
            sleep 1
        fi
    done
    nlmport=$NLM_PORT+$nodeid

    # Ensure that nfsd is loaded
    debuglog modprobe nfsd

    # Collect the port in a plain string variable first: curport carries the
    # integer attribute, so an empty rpcinfo answer assigned to it directly
    # becomes 0 and the "-z" test guarding the sysctl fallback never fires.
    port=$(rpcinfo -p 2> /dev/null | grep -m1 nlockmgr | awk '{print $4}')
    [ -z "$port" ] && port=$(sysctl -n fs.nfs.nlm_tcpport)
    curport=$port
    if [ $curport -ne $nlmport ]; then
	debugmsg "Current NLM port used is $curport, should be $nlmport"
	debuglog sysctl -w fs.nfs.nlm_tcpport=$nlmport
	debuglog sysctl -w fs.nfs.nlm_udpport=$nlmport
	nfsService stop
	nfsService start
	# Check that the port got assigned correctly
	port=$(rpcinfo -p 2> /dev/null | grep -m1 nlockmgr | awk '{print $4}')
	[ -z "$port" ] && port=$(sysctl -n fs.nfs.nlm_tcpport)
	curport=$port
	if [ $curport -ne $nlmport ]; then
	    err "Cannot change existing port $curport to $nlmport for HA-NFS. Terminating..."
	    return 1
	fi
    fi
    return 0
}

# Print the system boot time: the value of the "btime" line in /proc/stat.
getBootTime() {
    awk '$1 == "btime" {print $2}' /proc/stat
}

# Usage: backupSmDir dir
# Maintain a backup location for the statd entries (sm, sm.bak) outside
# /var/lib/nfs. It is needed because sending SM_NOTIFY on failover/failback
# wipes the client entries from /var/lib/nfs/sm, and they are not re-created
# until the next reboot.
# dir/btime remembers the boot time at which the backup was set up:
#   - no btime file yet: create dir/sm and dir/sm.bak and record the time;
#   - machine rebooted since (current boot time is newer): erase dir/sm and
#     dir/sm.bak and record the new boot time;
#   - otherwise nothing is touched.
backupSmDir() {
    typeset -i current_btime saved_btime

    if [ ! -f $1/btime ]; then
	_mkdir $1/sm $1/sm.bak
	# Save boot time so we can decide when to cleanup $1
	btime=$(getBootTime)
	debugmsg "Saving current boot time $btime in $1"
	echo $btime > $1/btime
	return
    fi

    current_btime=$(getBootTime)
    saved_btime=$(cat $1/btime)
    [ $current_btime -gt $saved_btime ] || return 0
    # Erase backup smdir since a reboot has happened
    debugmsg "Erasing backup statd dirs in $1"
    _rmdir $1/sm $1/sm.bak
    echo $current_btime > $1/btime
}

# Keep the following data from /var/lib/nfs in shared space (GPFS)
# so all nodes have access to it for failover/failback purposes:
#	rmtab
#	sm
#	sm.bak
# Usage: shareSmDir myip
#   myip names this node's directory under $SHARED_NFS.
# Returns 0 on success, 1 when one of the symlinks cannot be created.
shareSmDir() {
    myip=$1

    # Make sure the shared rmtab exists (created empty with mode 644)
    sh_rmtab=$SHARED_NFS/rmtab
    [ ! -f $sh_rmtab ] && touch $sh_rmtab && chmod 644 $sh_rmtab
#   No need to share rmtab. It's no longer used to validate NFS requests.
#   ln -sf $sh_rmtab /var/lib/nfs/rmtab

    # Per-node statd directories in shared space
    sh_statd=$SHARED_NFS/$myip/statd
    _mkdir $sh_statd/sm $sh_statd/sm.bak

    # The local statd state lives in /var/lib/nfs/statd/sm when
    # /var/lib/nfs/statd exists, in /var/lib/nfs/sm otherwise
    if [ -e /var/lib/nfs/statd ]; then
	smdir=/var/lib/nfs/statd/sm
    else
	smdir=/var/lib/nfs/sm
    fi
    # A real directory (not yet a symlink) means the state is still local
    if [[ -d $smdir && ! -L $smdir ]]; then
	# Move stuff from local smdir to shared
	_mv $smdir/* $sh_statd/sm
	_mv ${smdir}.bak/* $sh_statd/sm.bak
	_rmdir $smdir ${smdir}.bak
    fi
    # Replace the (now removed or missing) local directories by symlinks
    # into the shared space
    if [ ! -d $smdir ]; then
	ln -sf $sh_statd/sm $smdir
	if [ $? -ne 0 ]; then
            err "Failed to link $smdir to $sh_statd/sm"
            return 1
        fi
	ln -sf $sh_statd/sm.bak ${smdir}.bak
	if [ $? -ne 0 ]; then
            err "Failed to link ${smdir}.bak to $sh_statd/sm.bak"
            return 1
        fi
    fi
    if [ -e /var/lib/nfs/statd ]; then
	# Redhat requires rpcuser as uid/gid for statd stuff
	chown -R rpcuser.rpcuser $sh_statd
    fi
    return 0
}

# Undo shareSmDir: wherever /var/lib/nfs/rmtab, the statd sm directory or its
# .bak sibling is a symlink, remove the link and put an empty local file
# (rmtab) or directory (sm, sm.bak) in its place.
unshareSmDir() {
    # Same location rule as shareSmDir
    smdir=/var/lib/nfs/sm
    [ -e /var/lib/nfs/statd ] && smdir=/var/lib/nfs/statd/sm

    [ -L /var/lib/nfs/rmtab ] && _unlink /var/lib/nfs/rmtab && touch /var/lib/nfs/rmtab
    [ -L $smdir ] && _unlink $smdir && _mkdir $smdir
    [ -L ${smdir}.bak ] && _unlink ${smdir}.bak && _mkdir ${smdir}.bak
}

# Configure GPFS for HA-NFS - first time only
# Usage: configHA myip
# Verifies that $SHARED_ROOT exists and lies on a GPFS filesystem, creates
# the shared HA-NFS directories and moves this node's statd state into them
# (shareSmDir). Returns 1 on a failed check, else shareSmDir's status.
configHA() {
    myip=$1

    # FIX: is a "first time" check needed here?

    # Note: NLM ports are no longer configured here (configNLMPorts $myip);
    # this is now done by the startup script /etc/init.d/gpfs

    # The shared directory must be available and on GPFS
    [ -d $SHARED_ROOT ] || {
	err "Cannot find shared directory $SHARED_ROOT"
	return 1
    }
    df -Tl $SHARED_ROOT | grep -qw gpfs || {
	err "$SHARED_ROOT found but is not on a GPFS filesystem"
	return 1
    }
    debugmsg "Shared fs is $SHARED_ROOT"

    # Create shared data for HA-NFS (statd, rmtab) and recovery
    _mkdir $SHARED_NFS $SHARED_RECOVERY

    shareSmDir $myip
}

################################################################################
#				NFS functions				       #
################################################################################

# Start rpc.mountd with the options configured on this system
# (MOUNTD_PORT, MOUNTD_NFS_V2, MOUNTD_NFS_V3); this unrolls the mountd part
# of /etc/init.d/nfs.
startMountd() {
    # The files sourced below may change PATH; remember ours
    savedpath=$PATH
    RPCMOUNTDOPTS=
    #Unrolling mountd part of /etc/init.d/nfs
    [ -f /etc/init.d/functions ] && . /etc/init.d/functions
    [ -f /etc/sysconfig/network ] && . /etc/sysconfig/network
    [ -f /etc/sysconfig/nfs ] && . /etc/sysconfig/nfs
    PATH=$savedpath
    # Pin mountd to a fixed port when one is configured
    [ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT"
    # Switch off NFS v2 / v3 support in mountd when configured as "no"
    case $MOUNTD_NFS_V2 in
	no|NO)
	    RPCMOUNTDOPTS="$RPCMOUNTDOPTS --no-nfs-version 2" ;;
    esac
    case $MOUNTD_NFS_V3 in
	no|NO)
	    RPCMOUNTDOPTS="$RPCMOUNTDOPTS --no-nfs-version 3" ;;
    esac
    # Use startproc when /sbin/startproc exists, otherwise "daemon"
    # NOTE(review): "daemon" is presumably provided by the sourced
    # /etc/init.d/functions -- confirm
    if [ -e /sbin/startproc ]; then
	debuglog startproc /usr/sbin/rpc.mountd $RPCMOUNTDOPTS
    else
	daemon rpc.mountd $RPCMOUNTDOPTS
    fi
}

# Usage: nfsService start|stop|terminate|soft-restart
# Control the NFS server through the init scripts (/etc/init.d/nfsserver
# when it exists, /etc/init.d/nfs otherwise) and /etc/init.d/nfslock.
#   start         start NFS and the lock service, make sure mountd listens on
#                 $MOUNTD_PORT (if set), set the number of nfsd processes to
#                 $NFSD_PROCS and re-export everything
#   stop          stop the NFS server only (the lock service keeps running)
#   terminate     stop both the NFS server and the lock service
#   soft-restart  drop the nfsd processes to 0 and bring them back to
#                 $NFSD_PROCS
# Anything else prints a usage message.
nfsService() {
    nfslock=/etc/init.d/nfslock
    if [ -f /etc/init.d/nfsserver ]; then
	nfs=/etc/init.d/nfsserver
    else
	nfs=/etc/init.d/nfs
    fi

    case $1 in
	start)
	    msg "Starting NFS services"
	    sysctl -e -q -w fs.nfs.use_underlying_lock_ops=1
	    debuglog $nfs start
	    debuglog $nfslock start
	    if [ -n "$MOUNTD_PORT" ]; then
		# Make sure mountd is bound to the right port if specified
		curport=$(rpcinfo -p 2> /dev/null | grep -m1 mountd | awk '{print $4}')
		if [ "$curport" != "$MOUNTD_PORT" ]; then
		    pid=$(pidof rpc.mountd)
		    debugmsg "Current mountd port is $curport, should be $MOUNTD_PORT. Stopping current rpc.mountd (pid $pid) and restarting with correct port."
		    debuglog kill -9 $pid
		    startMountd
		fi
	    fi
	    # Update number of nfsd processes
	    debuglog rpc.nfsd $NFSD_PROCS

	    # Reload exportfs anyway since starting nfs server may not do this
	    # if it was already running
	    exportfs -r
	    ;;

	stop)
	    msg "Stopping NFS services"
	    debuglog $nfs stop
	    ;;

	terminate)
	    msg "Cleaning NFS services"
	    debuglog $nfs stop
	    debuglog $nfslock stop
	    ;;

	soft-restart)
	    debuglog rpc.nfsd 0
	    debuglog sleep 1	# FIX: required?
	    debuglog rpc.nfsd $NFSD_PROCS
	    ;;

	*)
	    # List the actions that really exist; the message used to offer
	    # "restart", which is not handled, and omitted two that are.
	    echo "Usage: $0 start|stop|terminate|soft-restart"
	    ;;
    esac
}

################################################################################
#				NLM functions				       #
################################################################################

# Restart rpc.statd
# "nfslock restart" is deliberately not used: it would also restart lockd,
# which triggers lock recovery, and that is not wanted here.
# Nothing is done when /sbin/rpc.statd does not exist.
restartStatd() {
    [ -f /sbin/rpc.statd ] || return 0
    # Kill the statd process if it exists (important for registering with
    # portmap)
    debuglog kill -9 $(/sbin/pidof -x /sbin/rpc.statd)
    # Start the statd process
    debuglog /etc/init.d/nfslock start
}

# Release all locks by resetting the kernel lock manager.
# SuSE (recognised by /etc/init.d/nfsserver): stop the nfslock service.
# Otherwise (Redhat): nfslock fails to kill lockd to start the grace period,
# so the [lockd] kernel thread is sent a KILL signal explicitly.
resetLockd() {
    if [ -f /etc/init.d/nfsserver ]; then
	debuglog /etc/init.d/nfslock stop
	return
    fi
    pid=$(ps -aef | grep -w "\[lockd\]" | awk '{print $2}')
    [ -n "$pid" ] && kill -9 $pid
}

# Check grace period support in kernel
# Mounts the nfsd filesystem on /proc/fs/nfsd, then looks for
# /proc/fs/nfsd/grace.
# NOTE the inverted convention: returns 1 when the grace file exists (dynamic
# grace period supported) and 0 when it does not.
checkDynamicGrace() {
    debuglog2 mount -t nfsd nfsd /proc/fs/nfsd
    if [ -f /proc/fs/nfsd/grace ]; then
	return 1
    fi
    debugmsg2 "Cannot find /proc/fs/nfsd/grace, will restart lockd (and reclaim all locks) for failover/failback."
    return 0
}

# Start grace period
# When the kernel offers /proc/fs/nfsd/grace (checkDynamicGrace returns 1),
# write 1 to it. Otherwise the only way to get a grace period is to reset
# lockd.
startGrace() {
    checkDynamicGrace
    case $? in
	1)
	    echo 1 > /proc/fs/nfsd/grace
	    ;;
	*)
	    # Kernel does not support starting grace period through /proc
	    msg "Dynamic enabling of grace period not supported in this kernel. Restarting lockd"
	    resetLockd
	    ;;
    esac
}

# Send SM_NOTIFY message to client on server restart
# Usage: notify statd_dir [server IP]
#   SLES (kernel-space statd, /sbin/sm-notify exists): the entries in
#   statd_dir/sm are copied to /var/lib/nfs/sm and sm-notify is run, with
#   "-m 1 -v <server>" when a server is given.
#   RHEL (user-space statd, /sbin/rpc.statd exists): "rpc.statd -N -n
#   statd_dir" is run, with "-P <server>" when a server is given.
# With debug >= 2 the client entries are listed in the log first.
notify() {
    opts=""
    if [ -e /sbin/sm-notify ]; then
	_cp $1/sm/* /var/lib/nfs/sm
	if [ -n "$2" ]; then
	    opts="-m 1 -v $2"
	fi
	debugmsg2 "Notify clients: " && log ls $1/sm
	debuglog2 sm-notify $opts
	return
    fi
    [ -e /sbin/rpc.statd ] || return 0
    if [ -n "$2" ]; then
	opts="-P $2"
    fi
    debugmsg2 "Notify clients: " && log ls $1/sm
    debuglog2 rpc.statd -N -n $1 $opts
}

# Send SM_NOTIFY message to client on all available interfaces
# Usage: notifyClient host statd_dir
# SLES9 has a bug wherein the NFS client compares the host name in the notify
# message against the host name it registered during mount to decide whether
# to handle SM_NOTIFY requests coming from the server. To work around it,
# notifications can (optionally, notifyfix != 0) be sent under several names:
# the VIP, the short host name, the fully qualified name and the IP address.
# With notifyfix = 0 only the first applicable one is sent.
notifyClient() {
    debugmsg2 "notify host:$1, vip:$VIP, statddir:$2"

    # (A notification on the local name is intentionally not sent.)

    # If VIP is specified, always send notify on the VIP
    if [ -n "$VIP" ]; then
	debugmsg "SM_NOTIFY clients for VIP $VIP"
	notify $2 $VIP
	if [ $notifyfix -eq 0 ]; then
	    return
	fi
    fi

    shortname=$(shortipname $1)
    debugmsg "Notify for host $shortname"
    notify $2 $shortname
    if [ $notifyfix -eq 0 ]; then
	return
    fi

    # Fully qualified name, unless it adds nothing new
    host=$(ipname $1)
    if [ -n "$host" ] && [ "$host" != "$shortname" ]; then
	debugmsg "Notify for host.domain $host"
	notify $2 $host
    fi

    # IP address, unless it equals one of the names already used
    ip=$(ipaddr $1)
    if [ -n "$ip" ] && [ "$ip" != "$host" ] && [ "$ip" != "$shortname" ]; then
	debugmsg "Notify for IP $ip"
	notify $2 $ip
    fi
}

################################################################################
#				Failover functions			       #
################################################################################

# Usage: startReclaim gpfs_ip
# Make the NFS clients of the given node reclaim their NLM locks.
# The node's shared statd entries ($SHARED_NFS/<gpfs_ip>/statd/sm) are staged
# in /tmp/statd, then SM_NOTIFY is sent (notifyClient):
#   - with IP takeover: once for every NFS IP address of the node;
#   - without: once for the address found on the loopback interface (the
#     VIP), unless that address is the GPFS address itself.
# The staged sm directory is refilled from sm.bak before every notification,
# and the entries are finally copied back to the shared directory.
startReclaim() {
    gpfs_ip=$1
    smdir=${SHARED_NFS}/$gpfs_ip/statd/sm
    statedir=/tmp/statd

    msg "Reclaim of NLM locks initiated for node $gpfs_ip"

    _mkdir $statedir/sm $statedir/sm.bak
    _cp $smdir/* $statedir/sm.bak

    if [ $iptakeover -eq 1 ]; then
	# The list must be expanded unquoted. Quoted, it is a single word:
	# the loop then runs exactly once (even for an empty list) and the
	# second address ends up as notifyClient's statd directory.
	for nfsip in $(getNfsIPs $gpfs_ip); do
	    _cp $statedir/sm.bak/* $statedir/sm
	    notifyClient $nfsip $statedir
	done
    else
	# get VIP from loopback
	ip=$(mmgetifconf | grep -w 'lo' | awk '{print $2}')
	_cp $statedir/sm.bak/* $statedir/sm
	if [[ -n "$ip"  && "$ip" != "$gpfs_ip" ]]; then
	    notifyClient $ip $statedir
	fi
    fi
    # restore list of lock users
    _cp $statedir/sm.bak/* $smdir
}

# Usage: selectNode failed_node
# Choose the takeover node for a failed node and print its GPFS IP address.
# The entry following the failed node in $NODELIST is preferred; nodes that
# are not up (isNodeUp) are skipped. When the end of the list is reached the
# search wraps around to the top. If no node is up at all, the whole search
# is repeated every 10 seconds until one is found.
# (Each GPFS IP may have several external NFS IP addresses on its line.)
selectNode() {
    failed_node=$1
    typeset -i next=0
    gpfs_ip=""

    while true; do
	# Pass 1: entries that come after the failed node
	exec 3< $NODELIST
	while read -u3 gpfs_if nfs_list; do
	    # Skip empty and comment lines
	    invalid $gpfs_if && continue
	    gpfs_ip=$(getip $gpfs_if)
	    debugmsg2 "selectNode: GPFS IP: $gpfs_ip, NFS IP:$nfs_list"
	    if [ $next -ne 1 ]; then
		# Still looking for the failed node's own entry
		[ "$gpfs_ip" == "$failed_node" ] && next=1
		continue
	    fi
	    [ "$gpfs_ip" == "$failed_node" ] && continue
	    if isNodeUp $gpfs_ip; then
		debugmsg "takeover node is $gpfs_ip"
		echo $gpfs_ip
		return
	    fi
	    debugmsg "selectNode: takeover_node $gpfs_ip is down"
	done

	# Pass 2: no takeover node found, so search again from the top
	debugmsg "selectNode: start from top"
	exec 3< $NODELIST
	while read -u3 gpfs_if nfs_list; do
	    # Skip empty and comment lines
	    invalid $gpfs_if && continue
	    gpfs_ip=$(getip $gpfs_if)
	    debugmsg2 "selectNode: GPFS IP: $gpfs_ip, NFS IP: $nfs_list"
	    [ "$gpfs_ip" != "$failed_node" ] || continue
	    if isNodeUp $gpfs_ip; then
		debugmsg "takeover node is $gpfs_ip"
		echo $gpfs_ip
		return
	    fi
	    debugmsg "selectNode: takeover_node $gpfs_ip is down"
	done
	sleep 10
    done
}

# Usage: selectNode2 failed_node
# Alternative takeover-node selection built on getNextNode: walk the node
# ring starting after the failed node and print the first node that is up.
# Whenever getNextNode has no candidate (ring exhausted or node unknown),
# wait 10 seconds and start over from the failed node.
selectNode2() {
	failed=$1
	candidate=$(getNextNode $failed $failed)
	while true; do
		until [ -n "$candidate" ]; do
			sleep 10
			candidate=$(getNextNode $failed $failed)
		done
		if isNodeUp $candidate; then
			echo $candidate
			return
		fi
		candidate=$(getNextNode $candidate $failed)
	done
}

# Usage: recoverNode failed_nfs_ip failed_gpfs_ip
# Take over one NFS IP address of a failed node on this node.
# Returns 0 when, after "IPaddr <ip> start", an interface carrying the
# address shows up in mmgetifconf; returns 1 when no address was given, when
# the address is still reported "OK" by "IPaddr <ip> monitor" after all
# retries, or when no interface carries the address afterwards.
# NOTE(review): IPaddr is presumably the heartbeat resource script found via
# PATH, and "OK" presumably means the address is still active somewhere --
# confirm.
recoverNode() {
    failed_nfs_ip=$1
    failed_gpfs_ip=$2
    debugmsg "start recoverNode $failed_nfs_ip"
    [ -z "$failed_nfs_ip" ] && return 1

    # Only wait/fence when the address is not already configured locally
    if [ -z "$(ifconfig | grep -wo $failed_nfs_ip)" ]; then
       # First wait: up to 20 one-second attempts while the monitor still
       # reports "OK" for the address
       typeset -i numberOfRetries=20 attemptNo=0
       while [[ $attemptNo -lt $numberOfRetries &&
	    "$(IPaddr $failed_nfs_ip monitor)" == "OK" ]]; do
	    attemptNo=$attemptNo+1
	    sleep 1
       done
       # All attempts used up: call the optional site-provided stonith exit
       # with the GPFS and NFS addresses of the failed node
       if [ $attemptNo -eq $numberOfRetries ]; then
          if [ -e /var/mmfs/etc/stonith ]; then
             debugmsg "call /var/mmfs/etc/stonith with $failed_gpfs_ip $failed_nfs_ip"
             /var/mmfs/etc/stonith $failed_gpfs_ip $failed_nfs_ip
             debugmsg "back from call to /var/mmfs/etc/stonith"
         fi
       fi
       # Second wait: up to 15 more one-second attempts
       typeset -i numberOfRetries=15 attemptNo=0
       while [[ $attemptNo -lt $numberOfRetries &&
	    "$(IPaddr $failed_nfs_ip monitor)" == "OK" ]]; do
	    attemptNo=$attemptNo+1
	    sleep 1
       done
       if [ $attemptNo -eq $numberOfRetries ]; then
          # Somebody else has failed_nfs_ip - maybe the failed node is not down?
          msg "Error: some other host already has address $failed_nfs_ip. Recovery will not happen."
          return 1
       fi
    fi

    # Bring the address up on this node
    debugmsg "recoverNode $failed_nfs_ip"
    debuglog IPaddr $failed_nfs_ip start

    # Success means some local interface now carries the address
    eth=$(mmgetifconf | grep -w $failed_nfs_ip | awk '{print $1}')
    debugmsg "Checking if interface for ip $failed_nfs_ip is up"
    if [ -n "$eth" ]; then
	return 0
    fi
    debugmsg "No interface for ip $failed_nfs_ip is up"
    return 1
}

# IPtakeover <me> <failed>
# Take over the NFS addresses of node <failed> on node <me>.
# A marker file ${SHARED_RECOVERY}/<me>/<failed> records the takeover; it is
# removed again when no address could be recovered. If the marker has
# disappeared after the addresses were acquired, the takeover is treated as
# canceled and the addresses are dropped via ifDown. Otherwise the failed
# node's statd sm entries are copied to this node and, when checkDynamicGrace
# returns 1, startReclaim is run for the failed node.
# Sets the globals me, failed, nfsIPs, nfsips and ip.
IPtakeover() {
    me=$1
    failed=$2
    typeset -i do_reclaim=0

    # Nothing to take over from ourselves
    if [ "$me" == "$failed" ]; then
        return
    fi

    msg "Initiating IP takeover of $failed due to node failure"

    _mkdir ${SHARED_RECOVERY}/$me
    debuglog touch ${SHARED_RECOVERY}/$me/$failed

    stopNode $failed
    nfsIPs=$(getNfsIPs $failed)
    debugmsg "IPtakeover ips: $nfsIPs"
    for ip in $nfsIPs; do
        # Takeover IP and issue gratuitous ARP to the clients for the node
        # that failed so that clients can reconnect to the new address
        if recoverNode $ip $failed; then
            do_reclaim=1
        fi
    done

    if [ $do_reclaim -eq 0 ]; then
        # did not get IP, takeover failed, remove the entry
        _unlink ${SHARED_RECOVERY}/$me/$failed
        return
    fi

    # got the IP, check if we are still the node to do takeover
    if [ ! -f $SHARED_RECOVERY/$me/$failed ]; then
        # drop the interface
        nfsips=$(getNfsIPs $failed)
        debugmsg "Node $failed recovery canceled"
        for ip in $nfsips; do
            ifDown $ip
        done
        return
    fi

    debugmsg "IPtakeover: File contents:"
    debugmsg $(ls -R ${SHARED_RECOVERY}/$me)
    _cp ${SHARED_NFS}/$failed/statd/sm/* ${SHARED_NFS}/$me/statd/sm
    checkDynamicGrace
    [ $? -eq 1 ] && startReclaim $failed
}

# netdiag [interface]
# Use mii-diag, mii-tool or ethtool to detect network link status.
# Arguments: $1 - interface to probe; when omitted, the global $eth is used
#                 (older callers set $eth instead of passing an argument).
# Return 0 if link beat detected, 1 if invalid (no device), 2 if no link beat
# FIX: If none of the tools exist, return 2 since there is no way to detect
#      link status. This means caller is responsible for handling the error
#      correctly.
# Sets the globals tool, output and status as scratch variables.
netdiag() {
    typeset nd_if=${1:-$eth}

    # We trust mii-diag works for all interfaces;
    # if it exists, return its status
    tool=$(which mii-diag 2> /dev/null)
    if [ -n "$tool" ]; then
        debuglog2 $tool -s $nd_if
        return $?
    fi

    # mii-diag doesn't exist - try both mii-tool and ethtool
    tool=$(which mii-tool 2> /dev/null)
    if [ -n "$tool" ]; then
        output=$($tool $nd_if 2> /dev/null)
        if [ $? -eq 0 ]; then
            # Unquoted on purpose: the output is flattened to one line and
            # its last word is the link state
            status=$(echo $output | awk '{print $NF}')
            [ "$status" == "ok" ] && return 0
        fi
    fi

    tool=$(which ethtool 2> /dev/null)
    if [ -n "$tool" ]; then
        output=$($tool $nd_if 2> /dev/null)
        if [ $? -eq 0 ]; then
            # Keep the line structure so only the "Link detected" line is
            # inspected, wherever it appears in the report
            status=$(echo "$output" | awk '/Link detected/ {print $NF}')
            # Quoted: status is empty when the report has no such line
            [ "$status" == "yes" ] && return 0
        fi
    fi

    # We reach here either if no tool exists or if there is an error
    return 2
}

# checkLinkStatus <interface>
# Returns 0 if ready, 1 otherwise.
# "Ready" means "ifconfig <interface>" succeeds and netdiag reports link on
# one of up to three attempts (5 seconds apart).
# Sets the global eth to the interface name; netdiag reads it.
checkLinkStatus() {
    eth=$1

    # Check if the interface is down
    debuglog2 ifconfig $eth || return 1

    typeset -i maxAttempts=3 attempt=0
    until [ $attempt -ge $maxAttempts ]; do
        attempt=$((attempt + 1))
        debugmsg2 "Checking link status for $eth - attempt $attempt"

        if netdiag $eth; then
            debugmsg2 "Tested the link for $eth, and it is connected"
            return 0
        fi

        debugmsg2 "Tested the link for $eth, and it is NOT connected"
        sleep 5
    done
    return 1
}

# getRecoveryNodes
# List the entries of $SHARED_RECOVERY, one per line (each entry is a node
# that has recorded a takeover). Prints nothing if the directory is missing.
getRecoveryNodes() {
    # Quoted so a shared root containing whitespace is listed as one path
    ls -A "$SHARED_RECOVERY" 2> /dev/null
}

# getFailedNodes <recovery-node>
# List the entries of $SHARED_RECOVERY/<recovery-node>, one per line (the
# nodes that <recovery-node> has recorded a takeover for). Prints nothing if
# the directory is missing.
getFailedNodes() {
    # Quoted so a shared root containing whitespace is listed as one path
    ls -A "$SHARED_RECOVERY/$1" 2> /dev/null
}


# IPfailback
# Run on a node identified by the global $myip to get its NFS addresses back
# from the node(s) that recorded a takeover for it under $SHARED_RECOVERY.
# Steps:
#   1. Collect the recovery nodes that list $myip as failed; return if none.
#   2. Ask each of them (mmdshcmd + mmgetifconf) whether it holds one of this
#      node's NFS addresses.
#   3. If one does, wait until at least one local NFS interface has link and
#      the address answers a duplicate-address arping on it.
#   4. Remove this node's recovery markers, save the recovery node's statd sm
#      entries to /tmp/statd/sm.bak, run nfsgrace on all nodes and finally
#      nfsnodeback on the recovery node (dynamic grace) or on every other
#      GPFS node.
# Uses the globals myip, LOGFILE, SHARED_RECOVERY and SHARED_NFS.
IPfailback() {
    # Find failover IP address
    failover_ips=""
    for recovery_node in $(getRecoveryNodes); do
        debugmsg2 "Checking if $recovery_node is recovery node"
        for failed_node in $(getFailedNodes $recovery_node); do
            debugmsg "Failed node is $failed_node"
            if [ "$failed_node" == "$myip" ]; then
                failover_ips="$failover_ips $recovery_node"
            fi
        done
    done

    if [ -z "$failover_ips" ]; then
        debugmsg "No failback is needed"
        return
    fi
    debugmsg "Failover nodes are $failover_ips"

    # if recovery node doesn't hold my external IP then no failback
    tmp=""
    # failover_ip is used after the loops; reset it so a stale value from an
    # earlier call can never leak in
    failover_ip=""
    nfsIPs=$(getNfsIPs $myip)
    for nfsip in $nfsIPs; do
        for failover_ip in $failover_ips; do
            # -w/-F: match the address as a whole literal word, so that
            # e.g. 10.0.0.1 does not match 10.0.0.11
            tmp=$(mmdshcmd $failover_ip "/usr/lpp/mmfs/bin/mmgetifconf | grep -wF $nfsip")
            debugmsg2 check for ip=$nfsip on $failover_ip tmp=$tmp
            [ -n "$tmp" ] && break 2
        done
    done
    if [ -z "$failover_ip" ]; then
        # No NFS address was checked (empty list): fall back to the last
        # recovery node, which is what the loops leave behind when no
        # recovery node holds an address
        for failover_ip in $failover_ips; do :; done
    fi

    if [ -z "$tmp" ]; then
        debugmsg "IP is not in use"
    else
        # wait for any of the NFS IP to be up, but try all IPs at least once
        up=0
        while [ $up -eq 0 ]; do
            for ip in $nfsIPs; do
                iface=$(getifname $ip)
                # if this is a virtual address, the interface is already up
                isVirtualIP $ip
                RC=$?
                if [ $RC -ne 0 ]; then
                    debugmsg2 wait for $ip
                    ifcfg=$(getifcfg $ip)

                    debugmsg2 "Sourcing $ifcfg"
                    # NOTE(review): BONDING_MASTER is not reset before
                    # sourcing, so a value from a previously sourced ifcfg
                    # may still be in effect - confirm this is intended
                    . $ifcfg

                    if [ "$BONDING_MASTER" == "yes" ]; then
                        ifBondUp $iface
                    else
                        debuglog ifconfig $iface up
                    fi
                fi
                checkLinkStatus $iface
                if [ $? -eq 0 ]; then
                    debugmsg2 "Testing the link for $iface, and it is connected"
                    # arping -D fails when another host answers for $ip,
                    # i.e. the address is reachable through this interface
                    if ! arping -q -c 2 -w 3 -D -I $iface $ip ; then
                        debugmsg2 $ip is ready
                        up=1
                    fi
                fi
                # Interfaces brought up only for the probe are taken down again
                [ $RC -ne 0 ] && ifconfig $iface down
            done
            [ $up -eq 0 ] && sleep 5
        done
    fi

    # remove my node from the recovery node list
    debuglog rm $SHARED_RECOVERY/*/$myip > /dev/null 2>&1

    # save the recovery node's statd sm entries (list of lock users)
    debugmsg "Failover IP is $failover_ip"
    smdir=/tmp/statd/sm
    _mkdir $smdir ${smdir}.bak
    _cp ${SHARED_NFS}/$failover_ip/statd/sm/* ${smdir}.bak

    # Get recovery node to free my IP addresses
    mmdshAll "/var/mmfs/etc/nfsgrace $myip" >> $LOGFILE 2>&1

    checkDynamicGrace
    if [ $? -eq 1 ]; then
        mmdshcmd $failover_ip "/var/mmfs/etc/nfsnodeback $failover_ip $myip" >> $LOGFILE 2>&1
    else
        gpfsIPs=$(getAllGPFSIPs)
        for ip in $gpfsIPs; do
            debugmsg2 "ip $ip, myip $myip"
            if [ "$ip" != "$myip" ]; then
                mmdshcmd $ip "/var/mmfs/etc/nfsnodeback $ip $failover_ip $myip" >> $LOGFILE 2>&1
            fi
        done
    fi
}

