#!/bin/ksh
################################################################################
#
# Module: gpfs.snap
#
# Description:
#   This script attempts to collect all of the data likely to be needed
#   when reporting a GPFS-related problem.
#
# Syntax:
#   gpfs.snap [-c "CmdString"] [-d OutputDirectory] [-p] [-x {1 | 2}] [-y | -z]
#             [-a | -W NodeFilename | -w NodeName[,NodeName...] |
#              -n NodeNumber[,NodeNumber...]]
#
#   -c "CmdString"      Run the command string on the specified nodes.
#                       If -c "CmdString" is used, the data collected is
#                       just the data for the specified command string;
#                       the standard data collected by gpfs.snap is not
#                       collected.  CmdString may consist of multiple
#                       commands separated by semi-colons.
#                       The d, p, x, y, and z flags have no effect when
#                       -c "CmdString" is specified.
#   -d OutputDirectory  Directory to be used for output.
#                       The default is /tmp/gpfs.snapOut
#   -p                  Skip the problem determination sequence
#                       (applies to master only).
#   -x 1                Check whether there is enough space but do not
#                       collect data.
#      2                Collect data only; do not check whether there is
#                       enough space.
#   -y                  Collect snaps only from nodes specified.
#   -z                  Collect data only from this node - no "master" data.
#
#   Node specification options:
#   -a                  Collect data on all nodes.  This is the default.
#                       Cannot be specified with -n, -w, or -W.
#   -n nodeNumList      Collect data on the nodes in the list of node numbers.
#                       Cannot be specified with -a or -W.
#   -w nodeList         Collect data on the nodes in the list of node names.
#                       Cannot be specified with -a or -W.
#   -W nodeFile         Collect data on the nodes in the file.
#                       Cannot be specified with -a, -n, or -w.
#
# Outputs:
#   If -d option is specified, this output file will be stored in the
#   user-specified directory.
#   If -d option is not specified, the output file will be put in the
#   /tmp/gpfs.snapOut directory.
#   When run without the -z flag, snaps from the nodes will all be collected
#   into a tar file named all.xxxxx.tar, where xxxxx is a timestamp.
#   Otherwise, the file name will be gpfs.snap.node_number.xxxxxxxx.out.tar.Z
#   (a compressed file), where xxxxxxxx is the timestamp for the time the
#   script was run.
#
#   The file contains output from the following commands:
#
#     lsdev -C                                  ALWAYS for AIX
#     lspv                                      ALWAYS for AIX
#     lsattr -El for all physical disks         ALWAYS for AIX
#     lsvg -o, lsvg -l, lsvg, ls -l /dev/VGs    ALWAYS for AIX
#     lsfs                                      ALWAYS for AIX
#
#   Files
#     /etc/fstab        (Linux)
#     /etc/filesystems  (AIX)
#     /var/adm/ras/mmfs.log.*
#     /var/mmfs/etc/*
#     /var/mmfs/gen/*
#     /var/mmfs/ssl/*
#     /var/mmfs/tmp/*
#
#   Miscellaneous commands
#     ps -edf           ALWAYS
#     errpt -a          ALWAYS
#     df -k             ALWAYS
#     lslpp -ha         ALWAYS
#     lssrc -a          ALWAYS
#     vmstat 5 5        ALWAYS
#     vmstat -s         ALWAYS
#
#   Network stuff (ALWAYS)
#     echo $NSorder
#     no -a
#     netstat -m
#     netstat -i -n
#     netstat -rn
#     netstat -D
#     entstat en*
#     tokstat tr*
#     ifconfig (on all adapters in hats groups)
#
# Dependencies:
#   The script must be run as root and requires rsh access to remote nodes.
#
################################################################################
#"@(#)44 1.31.1.4 src/avs/fs/mmfs/ts/admin/gpfs.snap.sh, mmfs, avs_rgpfs24, rgpfs24s012a 4/2/07 01:34:01"

# Record when the run started and which version of the script this is.
starttime=$(date)
VERSION=1.31.1.4

# Include global declarations and service routines.
. /usr/lpp/mmfs/bin/mmglobfuncs
. /usr/lpp/mmfs/bin/mmsdrfsdef

sourceFile="gpfs.snap.sh"
# Setting DEBUGgpfssnap in the environment turns on shell tracing.
[[ -n $DEBUGgpfssnap ]] && set -x
$mmTRACE_ENTER "$*"

# Global variables
mmlsmgr=/usr/lpp/mmfs/bin/mmlsmgr
outputDelimiter="######################################################################"

# Local work files.
# Names should be of the form:
#   fn=${tmpDir}fn.${mmcmd}.$$
trcFile=${tmpDir}trcFile.${mmcmd}.$$              # file replacement for trclist
newrcFile=${tmpDir}newrcFile.${mmcmd}.$$          # file replacement for newrclist
commaFile=${tmpDir}commaFile.${mmcmd}.$$          # file equivalent of commalist
nodefilecFile=${tmpDir}nodefilecFile.${mmcmd}.$$  # file replacement for nodefileclist
nodefile2=${tmpDir}nodefile2.${mmcmd}.$$          # file of reachable specified nodes

LOCAL_FILES=" $trcFile $newrcFile $commaFile $nodefilecFile $nodefile2 "


# Local functions

############################################################################
#
# Function:  Remove nodes from $nodefile and from the hostarray array.
#
# Input:     $1 - list of node names to be removed
#
# Output:    $nodefile is rewritten without the entries that match a word
#            of $1.  For each matching entry, the hostarray elements above
#            the entry's position in the file are shifted down by one and
#            the last element is unset.
#
############################################################################
function removefromlist
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGremovefromlist ]] && set -x
  $mmTRACE_ENTER "$*"

  llist=$($cat $nodefile)
  $rm ${nodefile}.tmp 2>/dev/null
  $touch ${nodefile}.tmp

  # k is the position of the current node within the original file.
  k=0
  for i in $llist
  do
    gotit=-1
    for j in $1
    do
      if [[ $i = $j ]]
      then
        gotit=$k
        break
      fi
    done

    if [[ $gotit = -1 ]]
    then
      # Not being removed; keep it.
      echo $i >> ${nodefile}.tmp
    else
      # Close the gap in hostarray by moving every later element down.
      l=$gotit
      ddone=0
      while [[ $ddone = 0 ]]
      do
        (( m = l + 1 ))
        if [[ -n ${hostarray[$m]} ]]
        then
          hostarray[$l]=${hostarray[$m]}
          (( l = l + 1 ))
        else
          unset hostarray[$l]
          ddone=1
        fi
      done
    fi
    (( k = k + 1 ))
  done

  $mv ${nodefile}.tmp ${nodefile}

}  #----- end of function removefromlist -----------------------


############################################################################
#
# Function:  Probe every node listed in $nodefile (ping, then a remote
#            hostname through mmdsh) and rebuild the node lists from the
#            nodes that answered.
#
# Input:     none (reads $nodefile, $myhname, $cflag)
#
# Output:    $nodefile    - the reachable nodes except the master node
#            $nodefile2   - all of the reachable nodes
#            /tmp/hostfile and hostarray - host names reported by the
#                           reachable nodes other than the master
#            already_failed - extended with the nodes that failed
#
# Notes:     Uses the fixed scratch files /tmp/err, /tmp/hostname and
#            /tmp/hostfile, and the global $tmpfile.
#
############################################################################
function checklist
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGchecklist ]] && set -x
  $mmTRACE_ENTER "$*"

  $rm $nodefile2 $tmpfile 2>/dev/null
  $touch $nodefile2 $tmpfile
  list=$($cat $nodefile)
  $rm /tmp/hostfile 2>/dev/null

  for i in $list
  do
    if [[ $i = $myhname ]]
    then
      continue
    fi

    bad=0
    $ping -c1 -w5 $i >/dev/null 2>/tmp/err
    if [[ $? = 0 ]]
    then
      $mmdsh -L $i K5MUTE=1 /bin/hostname >/tmp/hostname 2>/tmp/err &
      waitforit
      if [[ -s /tmp/hostname ]]
      then
        echo $i >> $tmpfile
        # mmdsh prefixes its output with the node name; field 2 is the
        # host name the node reported.
        thostname=$($cat /tmp/hostname | $awk '{print $2}')
        if [[ $thostname = $myhname ]]
        then
          continue
        fi
        echo $i >> $nodefile2
        $cat /tmp/hostname | $awk '{print $2}' >>/tmp/hostfile
      else
        bad="mmdsh"
      fi
    else
      bad="ping"
    fi

    if [[ $bad != 0 ]]
    then
      [[ -z $cflag ]] && \
        print "\nCannot collect data from $i. $bad failed:" |
          $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      $cat /tmp/err | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      already_failed="$already_failed $i"
    fi
  done
  $rm /tmp/err 2>/dev/null

  # Create two node files.
  #   $nodefile  contains all of the reachable nodes except the master node.
  #   $nodefile2 contains all of the reachable nodes.
  $mv $nodefile2 $nodefile 2>/dev/null
  $mv $tmpfile $nodefile2 2>/dev/null
  hlist=$($cat /tmp/hostfile 2>/dev/null)
  set -f ; set -A hostarray $hlist ; set +f

}  #----- end of function checklist ----------------------------


############################################################################
#
# Function:  Add nodes to $nodefile if they are not already there, have
#            not failed before, and can be reached.
#
# Input:     $1 - list of node names to be added
#
# Output:    New reachable nodes are appended to $nodefile and their
#            reported host names to hostarray.  Nodes that fail the ping
#            or mmdsh probe are reported and added to already_failed.
#
############################################################################
function addtolist
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGaddtolist ]] && set -x
  $mmTRACE_ENTER "$*"

  list=$($cat $nodefile)
  hlist=$($cat /tmp/hostfile 2>/dev/null)
  j=0
  for i in $1
  do
    if [[ $i = $myhname ]]
    then
      continue
    fi

    # Is the node already in the node file?
    gotit=0
    for j in $list
    do
      if [[ $i = $j ]]
      then
        gotit=1
        break
      fi
    done

    bad=0
    if [[ $gotit != 1 ]]
    then
      # Do not probe a node again if it already failed once.
      already_got=0
      for k in $already_failed
      do
        if [[ $i = $k ]]
        then
          already_got=1
          break;
        fi
      done
      if [[ $already_got = 1 ]]
      then
        continue
      fi

      $ping -c1 -w5 $i >/dev/null 2>/tmp/err
      if [[ $? = 0 ]]
      then
        $mmdsh -L $i K5MUTE=1 /bin/hostname >/tmp/hostname 2>/tmp/err &
        waitforit
        if [[ -s /tmp/hostname ]]
        then
          gotit=0
          j=0
          hname=$($cat /tmp/hostname | $awk '{print $2}')
          if [[ $hname = $myhname ]]
          then
            continue
          fi
          # The same host may be known under a different node name;
          # look for the reported host name in hostarray.
          while [[ -n ${hostarray[$j]} ]]
          do
            if [[ $hname = ${hostarray[$j]} ]]
            then
              gotit=1
              break
            fi
            (( j = j + 1 ))
          done
          if [[ $gotit = 0 ]]
          then
            hostarray[$j]=$hname
            echo $i >> $nodefile
          fi
        else
          bad="mmdsh"
        fi
      else
        bad="ping"
      fi

      if [[ $bad != 0 ]]
      then
        print "\nCannot collect data from $i. $bad failed:" |
          $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
        $cat /tmp/err | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
        already_failed="$already_failed $i"
      fi
    fi
  done
  $rm /tmp/err 2>/dev/null

}  #----- end of function addtolist ----------------------------


############################################################################
#
# Function:  Compare the estimated space requirement (total_bytes and
#            max_bytes accumulated by addit) with the free space in the
#            file system that holds $LOGDIR.
#
# Input:     none (reads total_bytes, max_bytes, master, nodefile,
#            rlist, glist and $BASELOGDIR/pass1outfile)
#
# Output:    Prints the estimate.  If there is not enough space, removes
#            $LOGDIR (and $NODESDIR on the master) and exits with 1.
#            On the master, nodes that reported insufficient space are
#            removed from the node lists.
#
############################################################################
function check_space
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_space ]] && set -x
  $mmTRACE_ENTER "$*"

  # The following extraction takes into account that the output
  # of df is arranged differently under Linux than under AIX.
  dfOutput=$($df -k $LOGDIR | $tail -n +2)
  if [[ $os = "AIX" ]]
  then
    FREE_SPACE=$(print $dfOutput | $awk '{print $3}')
  else
    FREE_SPACE=$(print $dfOutput | $awk '{print $4}')
  fi
  FREE_SPACE=$(expr $FREE_SPACE - 1)

  # Give ourselves a .1 safety margin.
  (( maxbytes = max_bytes + max_bytes / 10 ))
  (( total_bytes = total_bytes + total_bytes / 10 ))

  # These must be numeric comparisons: inside [[ ]] the < and > operators
  # compare strings, which orders e.g. 220000 after 1000000.
  if (( total_bytes < 1000000 ))
  then
    factor=30
  else
    if (( total_bytes < 2000000 ))
    then
      factor=25
    else
      factor=20
    fi
  fi

  # Guess how large the compressed file will be.
  (( zipped_bytes = total_bytes * factor / 100 ))
  (( adjusted_bytes = total_bytes + zipped_bytes ))
  if (( adjusted_bytes > maxbytes ))
  then
    maxbytes=$adjusted_bytes
  fi

  if [[ $master = 1 && -s $nodefile ]]
  then
    (( max_zipped_bytes = zipped_bytes * 2 ))  # when we tar our own Z file....
    notenough=$($grep "There is not enough space" $BASELOGDIR/pass1outfile | $cut -f1 -d :)
    for ii in $notenough
    do
      print "Node $ii reports it does not have enough space in ${BASELOGDIR}/${logdate}\nRemoving from list\n" |
        $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      removefromlist $ii
      removefromrlist $ii
    done

    for ii in $rlist
    do
      size=$($grep "compressed file" $BASELOGDIR/pass1outfile | $grep $ii | $awk '{print $7}')
      if [[ -n $size ]]
      then
        # zipped_bytes = max at end
        # tmp_max_zipped bytes is max at any point
        (( zipped_bytes = zipped_bytes + size ))
        (( tmp_max_zipped_bytes = zipped_bytes + size ))
        if [[ $tmp_max_zipped_bytes -gt $max_zipped_bytes ]]
        then
          max_zipped_bytes=$tmp_max_zipped_bytes
        fi
      fi
    done

    for ii in $glist
    do
      size=$($grep "requires about" $BASELOGDIR/pass1outfile | $grep $ii | $awk '{print $5}')
      if [[ -n $size ]]
      then
        (( zipped_bytes = zipped_bytes + size ))
        (( tmp_max_zipped_bytes = zipped_bytes + size ))
        if [[ $tmp_max_zipped_bytes -gt $max_zipped_bytes ]]
        then
          max_zipped_bytes=$tmp_max_zipped_bytes
        fi
      fi
    done

    if [[ "$max_zipped_bytes" -gt "$maxbytes" ]]
    then
      maxbytes=$max_zipped_bytes
    fi
  else
    print "compressed file will be about $zipped_bytes bytes"
  fi

  # df -k reports 1024-byte blocks.
  total_block=$(expr $maxbytes / 1024)
  echo "gpfs.snap requires about $maxbytes bytes" |
    $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  if [[ "$total_block" -gt "$FREE_SPACE" ]]
  then
    echo "gpfs.snap requires about $maxbytes bytes" >> ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
    echo "There is not enough space in ${BASELOGDIR}. Either increase\nthe filesystem size or choose a different filesystem with the -d option." |
      $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
    $rm -r ${LOGDIR} 2>/dev/null
    if [[ $master = 1 ]]
    then
      $rm -r $NODESDIR 2>/dev/null
    fi
    exit 1
  fi
  $rm $BASELOGDIR/pass1outfile 2>/dev/null

}  #----- end of function check_space --------------------------


############################################################################
#
# Function:  Handle the *_waiters files written by check_waiters.
#
# Input:     none (reads pass, aflag, rcFile, brcFile)
#
# Output:    Pass 1: Copy the local and remote *_waiters files into
#                    ${LOGDIR}/waiters, delete the remote copies, and
#                    account for the space of the directory.
#            Pass 2: Add every non-empty file in ${LOGDIR}/waiters to
#                    the tar file.
#
############################################################################
function check_waiters2
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_waiters2 ]] && set -x
  $mmTRACE_ENTER "$*"

  $mkdir -p ${LOGDIR}/waiters 2>/dev/null
  firstone=1
  if [[ $pass = 1 ]]
  then
    cp ${BASELOGDIR}/*_waiters ${LOGDIR}/waiters 2>/dev/null
    if [[ -n $aflag ]]
    then
      if [[ -s $rcFile ]]
      then
        $mmdsh -F $rcFile K5MUTE=1 $rcp ${BASELOGDIR}/\*_waiters $my_hostname:${LOGDIR}/waiters &
        waitforit NULL 60
        $mmdsh -F $rcFile K5MUTE=1 rm -f ${BASELOGDIR}/\*_waiters &
        waitforit NULL 60
      fi
    else
      if [[ -s $brcFile ]]
      then
        $mmdsh -F $brcFile K5MUTE=1 $rcp ${BASELOGDIR}/\*_waiters $my_hostname:${LOGDIR}/waiters &
        waitforit NULL 60
        $mmdsh -F $brcFile K5MUTE=1 rm -f ${BASELOGDIR}/\*_waiters &
        waitforit NULL 60
      fi
    fi  # end of if [[ -n $aflag ]]

    size=$(du -ks ${LOGDIR}/waiters | $cut -f1)
    if [[ -n $size ]]
    then
      addit $size ${LOGDIR}/waiters
    fi
  else
    list=$($ls ${LOGDIR}/waiters)
    for i in $list
    do
      if [[ -s ${LOGDIR}/waiters/${i} ]]
      then
        tarit waiters/${i}
      fi
    done
  fi  # end of if [[ $pass = 1 ]]

}  #----- end of function check_waiters2 -----------------------


############################################################################
#
# Function:  Handle the ${LOGDIR}/bad.*.files directories created by
#            check_files.
#
# Input:     none (reads pass)
#
# Output:    Pass 1: Account for the space used by each directory.
#            Pass 2: Add every file in each directory to the tar file.
#
############################################################################
function check_files2
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_files2 ]] && set -x
  $mmTRACE_ENTER "$*"

  dirlist=$($ls -d ${LOGDIR}/bad.*.files 2>/dev/null)
  for i in $dirlist
  do
    if [[ $pass = 1 ]]
    then
      size=$(du -ks $i | $cut -f1)
      addit $size $i
    else
      basedir=$(basename $i)
      filelist=$($ls $i)
      for j in $filelist
      do
        tarit $basedir/$j
      done
    fi
  done

}  #----- end of function check_files2 -------------------------


############################################################################
#
# Function:  Compare the checksum of a file across a set of nodes and
#            collect the copies that differ from the majority.
#
# Input:     $1 - path of the file to compare
#            $2 - file with the names of the nodes to check
#            $3 - (optional) name appended to the problem message
#
# Output:    If more than one distinct checksum is found, creates
#            ${LOGDIR}/bad.<name>.files, copies one majority ("good")
#            copy and every minority ("bad") copy into it, and reports
#            the minority nodes in ${BASELOGDIR}/problem.${my_hostname}.
#            For the mmsdrfs file a checksum is appended to sdrfssum.
#
############################################################################
function check_files
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_files ]] && set -x
  $mmTRACE_ENTER "$*"

  name=$(basename $1)
  if [[ ! -s $2 ]]
  then
    return
  fi

  $mmdsh -F $2 K5MUTE=1 sum $1 > $name.outfile &
  waitforit NULL 60

  # Count the distinct checksums (field 2 of the mmdsh output).
  diffs=$($sort -uk 2,2 $name.outfile | $wc -l)
  diffs=${diffs##*( )}
  if [[ $diffs -gt 1 ]]
  then
    # Find the checksum reported by the largest number of nodes.
    list=$($sort -uk 2,2 $name.outfile | $awk '{print $2}')
    most=0
    for i in $list
    do
      num=$($grep $i $name.outfile | $wc -l)
      if [[ $num -gt $most ]]
      then
        mostsum=$i
        most=$num
      fi
    done

    # NOTE(review): $base is not set anywhere in this function, while the
    # matching test in the else branch below uses $name - confirm which
    # variable is intended.
    if [[ $base = mmsdrfs ]]
    then
      sdrfssum="$sdrfssum $mostsum"
    fi

    $mkdir ${LOGDIR}/bad.${name}.files
    badlist=$($grep -v $mostsum $name.outfile | $cut -f1 -d :)
    goodlist=$($grep $mostsum $name.outfile | $cut -f1 -d :)

    # Keep one "good" copy: the local one if this node is in the
    # majority, otherwise the copy from the first majority node.
    echo $goodlist | $grep $my_hostname >/dev/null
    if [[ $? = 0 ]]
    then
      cp $1 ${LOGDIR}/bad.${name}.files/${name}.$my_hostname.good
    else
      goodnode=$(echo $goodlist | $awk '{print $1}')
      $mmdsh -L $goodnode K5MUTE=1 $rcp $1 $my_hostname:${LOGDIR}/bad.${name}.files/${name}.${goodnode}.good &
      waitforit NULL 60
    fi

    if [[ -n $3 ]]
    then
      print "\nThe following nodes $name files are different and are in the minority in $groupname $3" |
        $tee -a ${BASELOGDIR}/problem.${my_hostname}
    else
      print "\nThe following nodes $name files are different and are in the minority" |
        $tee -a ${BASELOGDIR}/problem.${my_hostname}
    fi
    echo "$badlist\n" | $tee -a ${BASELOGDIR}/problem.${my_hostname}

    for i in $badlist
    do
      $mmdsh -L $i K5MUTE=1 $rcp $1 $my_hostname:${LOGDIR}/bad.${name}.files/${name}.$i.bad &
      waitforit NULL 60
    done
  else
    if [[ $name = mmsdrfs ]]
    then
      # NOTE(review): this takes the first space-delimited field of the
      # mmdsh output, whereas the branch above takes the checksum from
      # field 2 - confirm that field 1 is what is wanted here.
      tsum=$($cat $name.outfile | $head -1 | $cut -f1 -d " ")
      sdrfssum="$sdrfssum $tsum"
    fi
  fi  # end of if [[ $diffs -gt 1 ]]

  $rm $name.outfile 2>/dev/null

}  #----- end of function check_files --------------------------


############################################################################
#
# Function:  Dump the mmfsadm waiters on the nodes in $nodefile2, report
#            tmMsgRevoke waiters and the longest waiters, and (unless
#            -y was given) add the nodes involved to the node list.
#
# Input:     none (reads nodefile2, brcFile, bglist, yflag, aflag)
#
# Output:    ${BASELOGDIR}/<host>_waiters on every node,
#            ${LOGDIR}/long_waiters.sorted on this node, messages in
#            gpfs.snap_info.${logdate}.out, and possibly new entries in
#            $nodefile (through addtolist).
#
############################################################################
function check_waiters
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_waiters ]] && set -x
  $mmTRACE_ENTER "$*"

  # Check this cluster.
  firstone=1

  # The hostname command substitutions are escaped so that they are
  # evaluated by the remote shell.  Unescaped, they expand on this node
  # and every node would use a waiters file named after this node, while
  # the parsing below takes the node name from the file name.
  $mmdsh -F $nodefile2 "K5MUTE=1 mkdir ${BASELOGDIR} 2>/dev/null; K5MUTE=1 $mmfsadm dump waiters > ${BASELOGDIR}/\$(hostname -s)_waiters" &
  waitforit NULL 60
  $mmdsh -F $nodefile2 "K5MUTE=1 $mmfsadm dump waiters | grep -v '===== dump waiters ====='" > longwaiters 2>/dev/null &
  waitforit NULL 60

  # The extra (non-existent) file name "yamo" makes grep prefix each
  # match with the name of the file it was found in.
  if [[ -s $brcFile ]]
  then
    $mmdsh -F $brcFile "K5MUTE=1 grep tmMsgRevoke ${BASELOGDIR}/\$(hostname | cut -d. -f1)_waiters yamo" >> ${BASELOGDIR}/grepped-waiters 2>/dev/null &
    waitforit NULL 60
  fi
  if [[ -n $bglist ]]
  then
    $mmdsh -L $my_hostname "K5MUTE=1 grep tmMsgRevoke ${BASELOGDIR}/\*_waiters yamo" >> ${BASELOGDIR}/grepped-waiters 2>/dev/null &
    waitforit NULL 60
  fi

  if [[ -s ${BASELOGDIR}/grepped-waiters ]]
  then
    print "\nThere are waiters for tmMsgRevokes:" |
      $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    if [[ $yflag != 1 ]]
    then
      print "Data will be collected from these nodes:" |
        $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    fi

    newlist=""
    {
      while read line
      do
        # Field 2 is the waiters file name; the node name is the part
        # of its base name in front of the underscore.
        tnode=$(echo $line | $cut -f2 -d :)
        thisnode=$(basename $tnode | $cut -f1 -d "_")
        echo $line | $grep "tmMsgRevoke on node" >/dev/null 2>&1
        if [[ $? = 0 ]]
        then
          addr=${line##*tmMsgRevoke on node}
          print "waiter on $thisnode, tmMsgRevoke from $addr" |
            $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
          echo $newlist | $grep -w $addr >/dev/null 2>&1
          if [[ $? != 0 ]]
          then
            newlist="$newlist $addr"
          fi
        else
          print "waiter on $thisnode, tmMsgRevoke" |
            $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
        fi
        echo $newlist | $grep -w $thisnode >/dev/null 2>&1
        if [[ $? != 0 ]]
        then
          newlist="$newlist $thisnode"
        fi
      done
      echo "" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    } < ${BASELOGDIR}/grepped-waiters

    if [[ $yflag != 1 && -z $aflag ]]
    then
      addtolist "$newlist"
    fi
  fi  # end of if [[ -s ${BASELOGDIR}/grepped-waiters ]]
  $rm ${BASELOGDIR}/grepped-waiters 2>/dev/null

  if [[ -s longwaiters ]]
  then
    $sort -nrk 4,4 longwaiters > ${LOGDIR}/long_waiters.sorted
    list=$($cat ${LOGDIR}/long_waiters.sorted | $head -5 | $cut -f1 -d ":")
    if [[ $yflag = 1 ]]
    then
      print "There are long waiters. The 5 longest waiters are on the following nodes:\n$list\n" |
        $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    else
      print "There are long waiters. The 5 longest waiters are on the following nodes, which will be added to the list to collect data from\n$list\n" |
        $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      addtolist "$list"
    fi
  fi  # end of if [[ -s longwaiters ]]
  $rm longwaiters 2>/dev/null

}  #----- end of function check_waiters ------------------------


############################################################################
#
# Function:  Select the dump files in a directory that are worth
#            collecting.
#
# Input:     $1 - file name prefix (files matching $1.*.* are considered,
#                 except those containing "shutdown")
#            $2 - maximum number of files to collect (the latest ones);
#                 a negative value means "collect them all"
#            $3 - directory to look in
#
# Output:    Global variable internal_list holds the names of the
#            selected files.  Files judged to be more than 14 days old
#            are reported and left out.
#
# Notes:     The age is computed as a day-of-year difference using the
#            global months[] and days[] tables.  The scalar "days" used
#            as the accumulator is element 0 of the days[] array.
#
############################################################################
function check_dumps
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_dumps ]] && set -x
  $mmTRACE_ENTER "$*"

  filePrefix=$1
  maxFiles=$2
  dumpDir=$3
  internal_list=""

  if [[ ! -a $dumpDir ]]
  then
    return
  fi

  savedir=$(pwd)
  cd $dumpDir
  numfiles=$($ls -ltr $filePrefix.*.* 2>/dev/null | $grep -v shutdown | $wc -l)
  if [[ $numfiles -eq 0 ]]
  then
    cd $savedir
    return
  fi

  if [[ $maxFiles -gt 0 && $numfiles -gt $maxFiles ]]
  then
    print "There are $numfiles $filePrefix files in $dumpDir.\nBecause these files are large I am only grabbing the latest $maxFiles.\n" |
      $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  fi

  today=$(date +%j)
  today=${today##+(0)}
  if [[ $maxFiles -lt 0 ]]
  then
    # A negative value indicates "collect them all".
    $ls -ltr $filePrefix.*.* 2>/dev/null | $grep -v shutdown > $tmpfile
  else
    # Collect up to the specified number of files.
    $ls -ltr $filePrefix.*.* 2>/dev/null | $grep -v shutdown | $tail -n -$maxFiles > $tmpfile
  fi

  # Create a list of files in global variable $internal_list
  # that will later be collected after we return to the caller.
  exec 3<&-
  exec 3< $tmpfile
  while read -u3 fileLine
  do
    month=$(echo $fileLine | $awk '{print $6}')
    day=$(echo $fileLine | $awk '{print $7}')
    name=$(echo $fileLine | $awk '{print $9}')

    # Convert the month and day of the file into a day of the year.
    days=0
    i=1
    while [[ -n ${months[$i]} ]]
    do
      if [[ $month = ${months[$i]} ]]
      then
        (( days = days + day ))
        break
      else
        (( days = days + ${days[$i]} ))
        (( i = i + 1 ))
      fi
    done  # end of while [[ -n ${months[$i]} ]] do

    if [[ $(( today - days )) -gt 14 ]]
    then
      print "File $name is over 2 weeks old so I am not going to collect it." |
        $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    else
      if [[ -z $internal_list ]]
      then
        internal_list=$name
      else
        internal_list="${internal_list} $name"
      fi
    fi  # end of if [[ $(( today - days )) -gt 14 ]]
  done  # end of while read -u3 fileLine do

  $rm -f $tmpfile
  cd $savedir

}  #----- end of function check_dumps --------------------------


############################################################################
#
# Function:  Remove a node from the glist and rlist node lists.
#
# Input:     $1 - name of the node to be removed
#
# Output:    glist and rlist no longer contain $1.  The remaining rlist
#            entries are written to $newrcFile, one per line, and rcFile
#            is set to $newrcFile.
#
############################################################################
function removefromrlist
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGremovefromrlist ]] && set -x
  $mmTRACE_ENTER "$*"

  # Start from empty work lists.  They are global, so without the reset a
  # second call would append to the result of the previous call.
  newglist=""
  newrlist=""

  for i in $glist
  do
    if [[ $i != $1 ]]
    then
      newglist="$newglist $i"
    fi
  done
  glist=$newglist

  firstone=1
  $rm -f $newrcFile
  for i in $rlist
  do
    if [[ $i != $1 ]]
    then
      if [[ $firstone = 1 ]]
      then
        print -- $i > $newrcFile
        newrlist="$i"
        firstone=0
      else
        print -- $i >> $newrcFile
        newrlist="$newrlist $i"
      fi
    fi
  done
  rlist=$newrlist
  rcFile=$newrcFile

}  #----- end of function removefromrlist ----------------------


############################################################################
#
# Function:  Determine which nodes see the same ${BASELOGDIR} as this
#            node by writing a marker file there and comparing its
#            checksum as seen from each node.
#
# Input:     $1 - file with the names of the nodes to check
#            $2 - list of the same node names
#            $3 - (optional) if not null, also build ghostarray from
#                 the corresponding hostarray entries
#
# Output:    tglist  - nodes that reported the same checksum
#            trlist and $trcFile - the other nodes
#            ghostarray - see $3
#
############################################################################
function check_fs
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_fs ]] && set -x
  $mmTRACE_ENTER "$*"

  echo "yamo" > ${BASELOGDIR}/yamo
  # Checksum the marker through its full path; a bare "yamo" is only
  # found when the current directory happens to be ${BASELOGDIR}.
  yamosum=$($sum ${BASELOGDIR}/yamo | $cut -f1 -d " ")
  $mmdsh -F $1 "K5MUTE=1 sum ${BASELOGDIR}/yamo 2>/dev/null | awk '{print \$1}'" > rsumfile &
  waitforit

  firstone=1
  trlist=""
  tglist=""
  $rm -f $trcFile
  g=0
  h=0
  for i in $2
  do
    thissum=$($grep -E "^$i:" rsumfile | $awk '{print $2}')
    if [[ $thissum = $yamosum ]]
    then
      tglist="$tglist $i"
      if [[ -n $3 ]]
      then
        ghostarray[$g]=${hostarray[$h]}
        (( g = g + 1 ))
      fi
    else
      if [[ $firstone = 1 ]]
      then
        print -- $i > $trcFile
        trlist=$i
        firstone=0
      else
        print -- $i >> $trcFile
        trlist="$trlist $i"
      fi
    fi
    (( h = h + 1 ))
  done

  $rm ${BASELOGDIR}/yamo 2>/dev/null
  $rm rsumfile 2>/dev/null

}  #----- end of function check_fs -----------------------------


############################################################################
#
# Function:  Collect the GPFS log files.
#
# Input:     none (reads pass, x_arg, master, aflag, nodefile, nodefile2,
#            rasDir and the global months[] and days[] tables)
#
# Output:    mmfslogs.sorted - on the master, the mmfs.log lines of the
#                last 14 days from the nodes, selected with a grep
#                pattern built from the start and end dates
#            mmfs.logs.${my_hostname} - all local mmfs.log.[0-9]* files
#            Pass 1: Calculate space for these files.
#            Pass 2: Add these files to the tar file.
#
# Notes:     The scalar "months" (number of calendar months spanned) is
#            element 0 of the months[] array.  greplist, greplistb and
#            greplist2 are global and are only extended here.
#            Leaves the current directory set to $BASELOGDIR.
#
############################################################################
function get_files
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_files ]] && set -x
  $mmTRACE_ENTER "$*"

  cd $LOGDIR
  if [[ $pass = 1 || $x_arg = 2 ]]
  then
    if [[ $master = 1 ]]
    then
      # Work out the date 14 days before today.
      ddd=$(date +"%m %e %H %M %y")
      emon=$(echo $ddd | $awk '{print $1}')
      eday=$(echo $ddd | $awk '{print $2}')
      eyr=$(echo $ddd | $awk '{print $5}')
      emon=${emon##0}
      syr=$eyr; smon=$emon;
      if [[ $eday -gt 14 ]]
      then
        (( sday = eday - 14 ))
      else
        (( diff = 14 - eday ))
        if [[ $emon -eq 1 ]]
        then
          smon=12
          (( syr = eyr - 1 ))
          (( sday = 31 - diff ))
        else
          (( smon = smon - 1 ))
          (( sday = ${days[$smon]} - diff ))
        fi
      fi
    fi

    if [[ $smon = $emon ]]
    then
      months=1
      endday=$eday
    else
      months=2
      endday=${days[$smon]}
    fi

    # Build the grep patterns, one month at a time.  Within a month the
    # days are covered by one bracket expression per tens digit.
    curmonth=1
    cmon=$smon
    i=${sday##0}
    while [[ $curmonth -le $months ]]
    do
      if [[ $curmonth -eq 2 ]]
      then
        i=1
        endday=$eday
        cmon=$emon
      fi

      # Days 1 through 9.
      if [[ $i -lt 10 ]]
      then
        e=${endday##0}
        if [[ $e -lt 10 ]]
        then
          ee=$e
        else
          ee=9
        fi
        if [[ -z $greplist ]]
        then
          greplist="${months[$cmon]} [$i-$ee]"
          greplistb="^${months[$cmon]} *[$i-$ee] "
          greplist2="^${cmon}/0[$i-$ee]"
        else
          greplist="${greplist}|${months[$cmon]} [$i-$ee]"
          greplistb="${greplistb}|^${months[$cmon]} *[$i-$ee] "
          greplist2="${greplist2}|^${cmon}/0[$i-$ee]"
        fi
      fi

      # Days 10-19, 20-29 and 30-39 (ii = 2, 3, 4).
      ii=2
      while [[ $ii -lt 5 ]]
      do
        (( jj = ii * 10 ))
        (( kk = jj - 11 ))
        (( ll = jj - 10 ))
        (( mm = ii - 1 ))
        if [[ $i -lt $jj && $endday -gt $kk ]]
        then
          if [[ $i -gt $kk ]]
          then
            (( s = i - ll ))
          else
            s=0
          fi
          (( e = endday - ll ))
          if [[ $endday -ge $jj ]]
          then
            e=9
          fi
          if [[ -z $greplist ]]
          then
            greplist="${months[$cmon]} ${mm}[$s-$e]"
            greplistb="^${months[$cmon]} ${mm}[$s-$e]"
            greplist2="^${cmon}/${mm}[$s-$e]"
          else
            greplist="${greplist}|${months[$cmon]} ${mm}[$s-$e]"
            greplistb="${greplistb}|^${months[$cmon]} ${mm}[$s-$e]"
            greplist2="${greplist2}|^${cmon}/${mm}[$s-$e]"
          fi
        fi
        (( ii = ii + 1 ))
      done
      (( curmonth = curmonth + 1 ))
    done

    if [[ $master = 1 ]]
    then
      if [[ -n $aflag ]]
      then
        $mmdsh -L $myhname -F $nodefile K5MUTE=1 "grep -E '$greplist' $rasDir/mmfs.log.[0-9]*" > mmfslogs.unsorted 2>/dev/null
      else
        if [[ -s $nodefile2 ]]
        then
          $mmdsh -F $nodefile2 K5MUTE=1 "grep -E '$greplist' $rasDir/mmfs.log.[0-9]*" > mmfslogs.unsorted 2>/dev/null
        fi
      fi
      $sort -k3,5 mmfslogs.unsorted > mmfslogs.sorted 2>/dev/null
      $rm mmfslogs.unsorted 2>/dev/null
      if [[ $pass = 1 ]]
      then
        size=$($ls -l mmfslogs.sorted | $awk '{print $5}')
        if [[ $size != 0 ]]
        then
          addit $size mmfslogs.sorted
        fi
      fi
    fi
  fi  # end of if [[ $pass = 1 || $x_arg = 2 ]]

  if [[ $pass = 2 ]]
  then
    cd $BASELOGDIR
    tarit mmfslogs.sorted
  fi

  cd $LOGDIR
  $cat $rasDir/mmfs.log.[0-9]* > mmfs.logs.${my_hostname}
  size=$($ls -l mmfs.logs.${my_hostname} | $awk '{print $5}')
  if [[ $pass = 1 ]]
  then
    addit $size mmfs.logs.${my_hostname}
    cd $BASELOGDIR
  else
    cd $BASELOGDIR
    tarit mmfs.logs.${my_hostname}
  fi

}  #----- end of function get_files ----------------------------


############################################################################
#
# Function:  Get all of the files in a specified directory.
#
# Input:     $1 - directory whose files are to be gotten
#
# Output:    Pass 1: Calculate space for the files in the directory.
#            Pass 2: Add the files in the directory to the tar file.
#
# Returns:   0
#
############################################################################
function get_files_dir  # dirName
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_files_dir ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset dirName=$1
  typeset saveDir fileList

  # Generate the list of files to get, but leave out complete.map files.
  # Then invoke get_files_list() to get the files.
  saveDir=$(pwd)
  # If the directory cannot be entered there is nothing to get; without
  # this check the listing below would be taken from the wrong directory.
  cd $dirName 2>/dev/null || return 0
  fileList=$($ls -A 2>/dev/null | $grep -v "complete.map")
  cd $saveDir

  if [[ -n $fileList ]]
  then
    get_files_list "$dirName" "$fileList"
  fi

  return 0

}  #----- end of function get_files_dir ------------------------


############################################################################
#
# Function:  Get the files specified by means of a directory and a list.
#
# Input:     $1 - directory from which to get files
#            $2 - list of files to be gotten from the directory
#            $3 - (optional) name of subdir to use in the tar file
#
# Output:    Pass 1: Calculate space for the specified files.
#            Pass 2: Add the specified files to the tar file.
#
# Returns:   0
#
############################################################################
function get_files_list  # dirName fileList [subdirName]
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_files_list ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset dirName=$1
  typeset fileList=$2
  typeset subdirName=$3
  typeset relDir tmpdir e

  # Based on the value of the subdir and the input directory parameters,
  # calculate $relDir and $dir and optionally create directory $LOGDIR/$dir.
  if [[ -n $subdirName ]]
  then
    relDir=$subdirName      # use the name specified as an input
  else
    relDir=${dirName#/}     # remove the leading / character
  fi
  tmpdir=$(echo $relDir | $grep "/")
  if [[ -n $tmpdir ]]
  then
    dir=${relDir%/*}
    $mkdir -p ${LOGDIR}/$dir 2>/dev/null
  else
    dir=$relDir
  fi

  # Create a symlink to the passed directory.
  ln -s $dirName ${LOGDIR}/$dir

  # Loop through the list passed as the 2nd parameter
  # and calculate the space required if this is pass 1
  # or add the parts to the tar file if this is pass 2.
  for e in $fileList
  do
    if [[ -f $dirName/$e ]]
    then
      if [[ $pass = 1 ]]
      then
        temp_bytes=$($ls -l $dirName/$e | $awk '{ print $5 }')
        addit $temp_bytes "$dirName/$e file"
      else
        # The second argument keeps tarit from deleting the file.
        tarit "$relDir/$e" 1
      fi
    fi  # end of if [[ -f $dirName/$e ]]
  done  # end of for e in $fileList do

  # Remove the symlink created earlier.
  $rm ${LOGDIR}/$relDir 2>/dev/null

}  #----- end of function get_files_list -----------------------


############################################################################
#
# Function:  Run, through doit, the commands whose output is always
#            collected: an operating-system specific set followed by a
#            common set.
#
# Input:     none (reads os, gotvmstat, mmScriptTrace)
#
# Returns:   0
#
############################################################################
function get_always
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_always ]] && set -x
  $mmTRACE_ENTER "$*"

  if [[ $os = "AIX" ]]
  then
    doit "errpt_a" "/usr/bin/errpt -a"
    doit "lscfg_vp" "lscfg -vp"
    doit "lslpp_hac" "/usr/bin/lslpp -hac"
    doit "lssrc_a" "lssrc -a"
    doit "no_a" "no -a"
    if [[ $gotvmstat = 1 ]]
    then
      doit "vmstat_s" "vmstat -s"
    fi
    # Collect the console log only if it is a non-empty file
    # whose path does not start with /dev/.
    conslog=$(lscons)
    if [[ $conslog != +(/)dev+(/)* && -s $conslog ]]
    then
      doit "lscons" "cat $conslog"
    fi
  else
    doit "dmesg" "dmesg"
    doit "fdisk_l" "fdisk -l"
    doit "lsmod" "lsmod"
    doit "lspci" "lspci"
    doit "rpm_qa" "rpm -qa"
    # The first three verifications accumulate in the rpm_verify log;
    # the last call (no output control flag) adds it to the tar file.
    doit "rpm_verify" "rpm --verify gpfs.base" 1
    doit "rpm_verify" "rpm --verify gpfs.docs" 1
    doit "rpm_verify" "rpm --verify gpfs.gpl" 1
    doit "rpm_verify" "rpm --verify gpfs.msg.en_US"
    doit "uname_a" "uname -a"
    doit "proc_cpuinfo" "cat /proc/cpuinfo"
    doit "proc_version" "cat /proc/version"
    doit "site_mcr" "cat /usr/lpp/mmfs/src/config/site.mcr"
    doit "etc_release" "$grep '[a-zA-Z]' /etc/*release"
  fi  # end of if [[ $os = "AIX" ]]

  doit "date" "date"
  doit "df_k" "df -k"
  doit "exportfs" "exportfs"
  doit "gpfs_executables" "$ls -l /usr/lpp/mmfs/bin"
  doit "ipcs_a" "ipcs -a"
  doit "ls_dev" "$ls -l /dev"
  doit "ps_edf" "ps -edf"
  doit "uptime" "uptime"
  doit "mmdevdiscover" "/usr/lpp/mmfs/bin/mmdevdiscover"
  doit "tspreparedisk_S" "/usr/lpp/mmfs/bin/tspreparedisk -S"

  if [[ $mmScriptTrace != /dev/null && -s $mmScriptTrace ]]
  then
    doit "mmScriptTrace" "cat $mmScriptTrace"
  fi

  return 0

}  #----- end of function get_always ---------------------------


############################################################################
#
# Function:  Run, through doit, the network related commands.
#
# Input:     none (reads os, pass)
#
# Output:    Log files netstat, ifconfig and, on AIX, odmget_CuAt and
#            lsattr.  The ones written with output control flag 1 are
#            added to the tar file explicitly in pass 2.
#
############################################################################
function get_net_stuff
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_net_stuff ]] && set -x
  $mmTRACE_ENTER "$*"

  if [[ $os = "AIX" ]]
  then
    doit "netstat" "netstat -i -n" "1"
    doit "netstat" "netstat -m" "1"
    doit "netstat" "netstat -D" "1"
    INTERFACES=$($lsdev -Cc if | $grep -v Defined | $cut -d " " -f1)
    for i in $INTERFACES
    do
      doit "ifconfig" "ifconfig $i" 1
      doit "odmget_CuAt" "odmget -q name=$i CuAt" 1
      doit "lsattr" "lsattr -El $i" 1
    done
  else
    INTERFACES=$(netstat -i -n | $cut -f1 -d " " | $grep -v "Kernel" | $grep -v "Iface")
    for i in $INTERFACES
    do
      doit "ifconfig" "ifconfig $i" 1
    done
  fi  # end of if [[ $os = "AIX" ]]

  doit "netstat" "netstat -i" "1"
  doit "netstat" "netstat -r" "1"
  doit "netstat" "netstat -rn" "1"
  doit "netstat" "netstat -v 2>/dev/null" "1"
  doit "netstat" "netstat -s"

  if [[ $pass = 2 ]]
  then
    tarit "ifconfig"
    if [[ $os = "AIX" ]]
    then
      tarit "lsattr"
      tarit "odmget_CuAt"
    fi
  fi

}  #----- end of function get_net_stuff ------------------------


############################################################################
#
# Function:  Run, through doit, the AIX logical volume manager commands.
#            Does nothing on other operating systems.
#
# Input:     none (reads os, pass, lspvOutputFile)
#
# Output:    Log files lsfs, lspv, lsvg and getlvodm_u; the last three
#            are added to the tar file explicitly in pass 2.
#
############################################################################
function get_lvm_stuff
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_lvm_stuff ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset lspvLine pdisk i VGs

  if [[ $os = "AIX" ]]
  then
    doit "lsfs" "lsfs"
    doit "lspv" "lspv" 1

    # Show the attributes of every physical disk reported by lspv.
    LC_ALL=C $lspv > $lspvOutputFile
    exec 3<&-
    exec 3< $lspvOutputFile
    while read -u3 lspvLine
    do
      set -f ; set -- $lspvLine ; set +f
      pdisk=$1
      doit "lspv" "lsattr -El $pdisk" "1"
    done
    $rm -f $lspvOutputFile

    doit "lsvg" "lsvg" "1"
    doit "lsvg" "lsvg -o" "1"
    VGs=$($lsvg -o)
    for i in $VGs
    do
      doit "lsvg" "lsvg -l $i" "1"
      doit "getlvodm_u" "getlvodm -u $i" "1"
    done
    VGs=$($lsvg)
    for i in $VGs
    do
      doit "lsvg" "$ls -l /dev/$i" "1"
    done

    if [[ $pass = 2 ]]
    then
      tarit lspv
      tarit lsvg
      tarit getlvodm_u
    fi
  fi  # end of if [[ $os = "AIX" ]]

}  #----- end of function get_lvm_stuff ------------------------


############################################################################
#
# Function:  Add the size of one item to the running space estimate.
#
# Input:     $1 - size of the item in bytes
#            $2 - description of the item (for the sizes log)
#
# Output:    total_bytes and max_bytes are updated and a line is written
#            to ${LOGDIR}/sizes (the file is created by the first call
#            and appended to afterwards).
#
############################################################################
function addit
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGaddit ]] && set -x
  $mmTRACE_ENTER "$*"

  # total_bytes is total_bytes at the end
  # max_tmp is max bytes while processing this file at one time
  #   (size of tarfile + 2 * sizeof_file)
  # max_bytes is max bytes while processing ANY file
  (( total_bytes = total_bytes + $1 ))
  (( max_tmp = total_bytes + $1 ))
  # Variable names without $ are used so that an unset max_bytes
  # counts as 0 instead of producing an arithmetic syntax error.
  if (( max_tmp > max_bytes ))
  then
    max_bytes=$max_tmp
  fi

  if [[ $first = 1 ]]
  then
    echo "estimate $2 will take $1 bytes" >> ${LOGDIR}/sizes
  else
    echo "estimate $2 will take $1 bytes" > ${LOGDIR}/sizes
  fi
  first=1

}  #----- end of function addit --------------------------------


###############################################################################
#
# Function waitforit (wait for the most recently-started background process)
#
# Arguments:
#  $1  (optional) The command string of the process to wait for.  If given
#      and not "NULL", the process is looked up by this string in the ps
#      output of process group $mypgid instead of relying on $!.
#  $2  (optional) The number of one-second polls before giving up.
#      The default is 20.
#
# Returns:
#  0   the process ended (or there was nothing to wait for)
#  1   the process timed out; it and its children have been killed
#
###############################################################################
function waitforit
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGwaitforit ]] && set -x
  $mmTRACE_ENTER "$*"

  mpid=$!
  if [[ -n $1 && $1 != NULL ]]
  then
    tmpid=$mpid
    comm=$(echo $1 | $head -c 70)
    $sleep 1
    mpid=$($ps -g $mypgid -o pid=PID,args=COMM | $grep "$comm" | $grep -v grep | $tail -n -1 | $awk '{print $1}')

    # In some cases commands fork themselves.  We need to make sure
    # we have the parent process; try to find the right one.
    words=$(echo $mpid | $wc -w)
    words=${words##*( )}
    if [[ $words != 1 && $words != 0 ]]
    then
      echo "got a multiple: $mpid comm is $comm" |
        $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      # if there is more than one try to get the direct descendant of
      # gpfs.snap or the pid
      mmpid=$($ps -f | $grep "$comm" | $grep -v grep | $grep -E "$$|$tmpid" | $awk '{print $2}')
      if [[ -z $mmpid ]]
      then
        mpid=$(echo $mpid | $head -n 1)
        mpid=$(echo $mpid | $awk '{print $1}')
      else
        mpid=$mmpid
      fi
    fi
  fi  # end of if [[ -n $1 && $1 != NULL ]]

  counter=1
  if [[ -n $2 ]]
  then
    count=$2
  else
    count=20
  fi

  # Poll once a second until the process is gone or the limit is reached.
  while [[ $counter -le $count && -n $mpid ]]
  do
    if [[ $counter -eq $count ]]
    then
      if [[ -n $1 && $1 != NULL ]]
      then
        comm=$1
      else
        comm=$($ps -fp $mpid -o args=ARGS | $tail -n -1)
      fi
      print "\nThe following command timed out!:\n$comm\n" |
        $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out

      # kill any children of the process we are about to kill
      # (for the case where we were called with pipes in the command line)
      $ps -o pid=PID,ppid=PPID | $grep $mpid | $grep -v "grep $mpid" > tmpout
      {
        while read line
        do
          pid=$(echo $line | $awk '{print $1}')
          ppid=$(echo $line | $awk '{print $2}')
          # echo $line | read pid ppid
          if [[ $ppid = $mpid ]]
          then
            $kill -9 $pid
          fi
        done
      } < tmpout
      $rm tmpout

      echo "killing $mpid"
      $kill -9 $mpid 2>/dev/null
      $sleep 1

      # special check to catch defunct children of wc -c (pass 1)
      parent=$($ps -fp $mpid | $grep defunct | $awk '{print $3}')
      if [[ -n $parent ]]
      then
        parcomm=$($ps -p $parent | $tail -n -1 | $awk '{print $4}')
        if [[ $parcomm = wc ]]
        then
          $kill -9 $parent 2>/dev/null
        fi
      fi
      set +x
      return 1
    fi  # end of if [[ $counter -eq $count ]]

    $sleep 1
    (( counter = counter + 1 ))
    # mpid becomes empty once ps no longer lists the process.
    mpid=$($ps -p $mpid | $awk '{print $1}' | $grep -v "PID")
  done  # end of while [[ $counter -le $count && -n $mpid ]] do

  return 0

}  #----- end of function waitforit ----------------------------


###############################################################################
#
# Function doit
#
# All arguments are optional except $2.  If you do not desire an action,
# pass "" for the parameter, or just leave out trailing args completely.
#
# Arguments:
#  $1  The unique part of the name of the log file as in
#      $LOGDIR/$1.${my_hostname}.${logdate}
#  $2  The command to be run.  stdout is redirected to the log file.
#      stderr is redirected to both the screen and the file
#      gpfs.snap_err.${logdate}.out in the $LOGDIR.
#      On a non-zero return code, an error message is printed to the
#      screen and the gpfs.snap_err.${logdate}.out file.
#  $3  "Output control" flag.
#      If null, the output is appended to the global tar file;
#      If 1, the output is not appended to the global tar file
#          (the log file will be left for further data to be added to it,
#          and a subsequent invocation with a null output control flag
#          will cause the log file to be added to the tar file then)
#      If 2, the output is prepended with the hostname of the node
#          to make the output look like that produced by mmdsh,
#          and then copied to the log file's parent directory
#          (this option is used by the -c option for collecting
#          output data from executing a command string).
#
###############################################################################
function doit
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGdoit ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset logFile=$1
  typeset cmdToRun=$2
  typeset outputControl=$3

  # Paths that are used repeatedly below.
  typeset logPath=$LOGDIR/$logFile
  typeset errTemp=${BASELOGDIR}/gpfs.snap_err.out.temp
  typeset errLog=${BASELOGDIR}/gpfs.snap_err.${logdate}.out

  #----------------------------------------------------------------
  # Pass 1: do not keep the output; only count how many bytes the
  # command produces and hand the estimate to addit.
  #----------------------------------------------------------------
  if [[ $pass = 1 ]]
  then
    if [[ -n $cmdToRun ]]
    then
      ksh -c "PATH=$PATH $cmdToRun" 2>/dev/null | $wc -c >$YAMO &
      # "netstat -D" gets a longer time limit than the waitforit default.
      if [[ $cmdToRun = "netstat -D" ]]
      then
        waitforit "$cmdToRun" "60"
      else
        waitforit "$cmdToRun"
      fi
      # $? here is the return code of waitforit (0 = did not time out).
      if [[ $? = 0 && -s $YAMO ]]
      then
        temp_bytes=$($awk '{print $1}' $YAMO)
        temp_bytes2=$(echo $cmdToRun | $wc -c)
        # Output size + length of the command string + 153 bytes
        # (presumably the banner written in pass 2 -- TODO confirm).
        tmp2_bytes=$(( temp_bytes + temp_bytes2 + 153 ))
        addit $tmp2_bytes $cmdToRun
      fi
    fi
    $rm $YAMO 2>/dev/null
    return
  fi

  #----------------------------------------------------------------
  # Any other pass: run the command for real and log its output.
  #----------------------------------------------------------------
  [[ -z $cmdToRun ]] && return

  # Banner that precedes the output of every command in the log file.
  {
    print ""
    print "$outputDelimiter"
    print "Output for $cmdToRun on $($hostname | $cut -d. -f1)"
    print "$outputDelimiter"
  } >> $logPath

  ksh -c "PATH=$PATH $cmdToRun" >>$logPath 2>$errTemp &
  if [[ $cmdToRun = "netstat -D" ]]
  then
    waitforit "$cmdToRun" "60"
  else
    waitforit
  fi

  # Anything the command wrote to stderr goes to the screen and the
  # error log.
  if [[ -s $errTemp ]]
  then
    print "\nErrata from $cmdToRun:" | $tee -a $errLog
    $tee -a $errLog < $errTemp
    $rm $errTemp 2>/dev/null
  fi

  # A null output control flag means the log file is complete:
  # move it into the tar file.
  if [[ -z $outputControl ]]
  then
    tarit $logFile
  fi

}   #----- end of function doit ---------------------------------


############################################################################
#
# Function:  Append a file to the global tarfile ($tarfile)
#            If $tarfile does not exist yet, create it.
#
# Input:     $1 - file to be added to (or serve as the start of) $tarfile
#            $2 - "keep" flag (if null, the input file is removed from
#                 $LOGDIR after it has been added; otherwise it is kept)
#
# Output:    $tarfile has been created or augmented
#
# Returns:   0
#
# Notes:     The caller must be in $BASELOGDIR when calling tarit.
#            The file to be added to $tarfile must be in $LOGDIR.
#
############################################################################
function tarit
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGtarit ]] && set -x
  $mmTRACE_ENTER "$*"

  if [[ -a ${SUBDIR}/$1 ]]
  then
    # Append to an existing tar file, or create it on first use.
    if [[ -a $tarfile ]]
    then
      tar -rf $tarfile ${SUBDIR}/$1
    else
      tar -cf $tarfile ${SUBDIR}/$1
    fi
    if [[ -z $2 ]]
    then
      $rm -r ${LOGDIR}/$1 2>/dev/null
    fi
  fi  # end of if [[ -a ${SUBDIR}/$1 ]]

}   #----- end of function tarit --------------------------------


############################################################################
#
# Function:  do_master_stuff
#
#   1. On Linux, reads the primary/backup server names from the first
#      line of $mmsdrfsfile and starts $addlist with them.
#   2. Reports the file system managers according to mmlsmgr.
#   3. Scans the mmfs.log.* files in $rasDir for manager appointments,
#      resignations and failures of each used device.
#   4. Passes the collected node names to addtolist (unless -y or -p).
#   5. If $nodefile is not empty: drops nodes whose output file system
#      is 100% full, copies gpfs.snap to nodes whose checksum differs,
#      and (unless -x 2) starts the space-check pass (-x 1 -z) on the
#      remote nodes in the background.
#
# Notes:     Uses scratch files lsout, tmplist, dfout, sumout, sumerr and
#            tmperr in the current directory.  All variables are global.
#
############################################################################
function do_master_stuff
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGdo_master_stuff ]] && set -x
  $mmTRACE_ENTER "$*"

  if [[ $os = "Linux" ]]
  then
    # Split the first mmsdrfs line on ":" into array v.  The extra "-"
    # shifts the fields so that they can be indexed by the *_Field values.
    line=$($head -n1 $mmsdrfsfile)
    IFS_sv="$IFS"
    IFS=":"
    set -f ; set -A v -- - $line ; set +f
    IFS="$IFS_sv"
    addlist=${v[$PRIMARY_SERVER_Field]}
    echo "Primary server is: ${v[$PRIMARY_SERVER_Field]}" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    if [[ -n ${v[$BACKUP_SERVER_Field]} ]]
    then
      echo "Backup server is: ${v[$BACKUP_SERVER_Field]}" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      addlist="$addlist ${v[$BACKUP_SERVER_Field]}"
    fi
  fi  # end of if [[ $os = "Linux" ]]

  echo "\nGetting file system manager information . . .\n"
  dev2list=$(getUsedDevices)
  if [[ -n $aflag ]]
  then
    devlist=$dev2list
  fi
  # Drop the two header lines of the mmlsmgr output.
  $mmlsmgr $devlist 2>/dev/null | $grep -v "^file system" | $grep -v "^\-\-\-\-\-\-" > lsout
  if [[ -s lsout ]]
  then
    echo "According to mmlsmgr . . ." | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    {
      while read line
      do
        if [[ -z $line ]]
        then
          continue
        fi
        fs=$(echo $line | $cut -f1 -d " ")
        manager=$(echo $line | $cut -f2 -d "(" | $cut -f1 -d ")")
        mannum=$(echo $line | $awk '{print $2}')
        # A value containing a "." is looked up in the MEMBER_NODE lines
        # of mmsdrfs and replaced by field 5 of the matching line.
        echo $mannum | $grep "\." >/dev/null
        if [[ $? = 0 ]]
        then
          mannum=$($grep $mannum $mmsdrfsfile | $grep MEMBER_NODE | $cut -f5 -d ":")
        fi
        if [[ $mannum != "(none" ]]
        then
          nodesetID=$(findNodesetId $mmsdrfsfile $mannum)
          name=$(getNodeInfo $REL_HOSTNAME_Field $NODE_NUMBER_Field $mannum $nodesetID $mmsdrfsfile)
          addlist="$addlist $name"
        else
          name=""
        fi
        print "The manager of $fs is $manager ($name)" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      done
    } < lsout
    skipone=1
  else
    print "Couldn't get filesystem manager info from daemon. Trying log files. . ." | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  fi  # end of if [[ -s lsout ]]
  $rm lsout 2>/dev/null

  echo "\nAssessing file system manager data from logs . . .\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  list=$($ls -t $rasDir/mmfs.log.[0-9]* 2>/dev/null)
  for fs2 in $dev2list
  do
    gotlast=0;gotresigned=0;gotappointed=0;
    fs=${fs2#/dev/}
    $rm tmplist 2>/dev/null
    # Collect the interesting log lines for this file system,
    # in reverse line-number order within each log file.
    for file in $list
    do
      $grep -nE "Cannot mount file system|unmounted because it|as manager|last file system manager" $file | $grep -nE "$fs|last file system manager" | $sort -nrk 1,1 >> tmplist
    done
    if [[ -s tmplist ]]
    then
      {
        while read line
        do
          echo $line | $grep "last file system manager" >/dev/null
          if [[ $? = 0 ]]
          then
            continue
          fi
          # After an unmount / cannot-mount line, the line that follows
          # it in tmplist is the one that gets examined.
          echo $line | $grep -E "unmounted | Cannot mount" >/dev/null
          if [[ $? = 0 ]]
          then
            read line
          fi
          manager=$(echo $line | $cut -f2 -d "(" | $cut -f1 -d ")")
          mannum=$(echo $line | $cut -f1 -d "(")
          echo $line | $grep "last file system manager" >/dev/null
          if [[ $? = 0 ]]
          then
            if [[ $gotlast = 1 ]]
            then
              continue
            fi
            mannum=${mannum#*was node }
            gotlast=1
            message="failed as"
            read message2
          else
            echo $line | $grep "resigned as" >/dev/null
            if [[ $? = 0 ]]
            then
              if [[ $gotresigned = 1 ]]
              then
                continue
              fi
              mannum=${mannum#* Node }
              gotresigned=1
              message="resigned as"
            else
              # Anything else is treated as an appointment; only the
              # two most recent ones are reported.
              if [[ $gotappointed = 2 ]]
              then
                continue
              else
                mannum=${mannum#* Node }
                if [[ $gotappointed = 0 ]]
                then
                  gotappointed=1
                  message="last appointed"
                else
                  gotappointed=2
                  message="2nd last appointed"
                fi
              fi
            fi
          fi  # end of if [[ $? = 0 ]]
          echo $mannum | $grep "\." >/dev/null
          if [[ $? = 0 ]]
          then
            mannum=$($grep $mannum $mmsdrfsfile | $grep MEMBER_NODE | $cut -f5 -d ":")
          fi
          nodesetID=$(findNodesetId $mmsdrfsfile $mannum)
          # NOTE(review): this call passes $mmsdrfsFile while the rest of
          # the function uses $mmsdrfsfile -- confirm both are set.
          name=$(getNodeInfo $REL_HOSTNAME_Field $NODE_NUMBER_Field $mannum $nodesetID $mmsdrfsFile)
          # Manager failures go to the problem file, everything else
          # to the info file.
          if [[ $message = "failed as" ]]
          then
            outfile=${BASELOGDIR}/problem.${my_hostname}
          else
            outfile=${BASELOGDIR}/gpfs.snap_info.${logdate}.out
          fi
          print "$manager ($name) $message manager of $fs" | $tee -a $outfile
          # NOTE(review): this repeats the line printed just above; the
          # text read into message2 is never printed and message2 is
          # never cleared -- confirm the intent.
          if [[ -n $message2 ]]
          then
            print "$manager ($name) $message manager of $fs" | $tee -a $outfile
            manager=""
          fi
          addlist="$addlist $name"
        done
      } < tmplist
      # NOTE(review): "gotresigned" and "gotappointed" are compared as
      # literal strings here (no "$"), so this test is never true and the
      # break is never taken.  Adding the "$" would make the loop over
      # file systems stop early -- confirm the intent before changing.
      if [[ $gotlast = 1 && gotresigned = 1 && gotappointed = 2 ]]
      then
        break
      fi
    fi  # end of if [[ -s tmplist ]]
    $rm tmplist 2>/dev/null
  done

  if [[ $yflag != 1 && $pflag != 1 ]]
  then
    addtolist "$addlist"
  fi

  bigtarfile=${BASELOGDIR}/all.${logdate}.tar
  if [[ -s $nodefile ]]
  then
    # Checksum of the local copy of this script, compared below
    # against the copies on the remote nodes.
    mysum=$($sum $spath | $cut -f1 -d " ")
    NODESDIR=${BASELOGDIR}/${logdate}
    $mkdir $NODESDIR
    node_list=$($cat $nodefile)
    if [[ -n $node_list ]]
    then
      if [[ $x_arg = 1 ]]
      then
        print "\nWould fork gpfs.snap on nodes $node_list\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      else
        print "\nForking gpfs.snap on nodes:\n$node_list\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      fi

      # Top-level directory of the output path, e.g. /tmp.
      dir=/$(echo $BASELOGDIR | $cut -f2 -d "/")
      $mmdsh -F $nodefile "K5MUTE=1 df $dir | tail -n -1 | grep 100%" >dfout &
      waitforit NULL 60
      list=$($cat dfout | $cut -f1 -d :) 2>/dev/null
      $rm dfout 2>/dev/null
      if [[ -n $list ]]
      then
        print "$dir is 100% full on the following nodes:\n$list\nRemoving from list." | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
        removefromlist "$list"
      fi

      if [[ -s $nodefile ]]
      then
        $mmdsh -F $nodefile K5MUTE=1 mkdir -p ${BASELOGDIR}/${logdate} >/dev/null 2>/dev/null
        $mmdsh -F $nodefile K5MUTE=1 sum $SNAP/gpfs.snap >sumout 2>sumerr
        # Nodes that reported an error plus nodes whose checksum
        # differs from the local one.
        nnewlist=$($cat sumerr | $cut -f1 -d :)
        newlist2=$($cat sumout | $grep -v "$mysum" | $cut -f1 -d :)
        nnewlist="$nnewlist $newlist2"
        firstone=1
        $rm -f sumerr sumout $commaFile 2>/dev/null
        for i in $nnewlist
        do
          if [[ $firstone = 1 ]]
          then
            commalist="$i"
            print -- $i > $commaFile
            firstone=0
          else
            commalist="$commalist,$i"
            print -- $i >> $commaFile
          fi
        done
        if [[ -s $commaFile ]]
        then
          print "There is an outdated or no gpfs.snap in $SNAP on the following nodes:"
          print $commalist
          print "\nAttempting to copy . . .\n"
          if [[ $SNAP != "/usr/lpp/mmfs/bin" ]]
          then
            $mmdsh -F $commaFile K5MUTE=1 mkdir -p $SNAP 2>/dev/null
          fi
          $mmdsh -F $commaFile K5MUTE=1 $rcp $myhname:$spath $spath 2>tmperr
          if [[ -s tmperr ]]
          then
            list=$($cat tmperr | $cut -f1 -d ":")
            $cat tmperr
            removefromlist "$list"
            print "copy failed for the following nodes:\n$list\nRemoving them from list." | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
            $cat $nodefile
          fi
          $rm tmperr 2>/dev/null
        fi
        if [[ $x_arg != 2 ]]
        then
          # Question: What is the "node_args" variable for?
          $mmdsh -F $nodefile K5MUTE=1 "$SNAP/gpfs.snap $node_args -d ${BASELOGDIR}/$(hostname | cut -d. -f1)_${logdate} -x 1 -z" >$BASELOGDIR/pass1outfile &
        fi
      fi
    fi  # end of if [[ -n $node_list ]]
  fi  # end of if [[ -s $nodefile ]]

}   #----- end of function do_master_stuff ----------------------


############################################################################
#
# Function:  printAndExit
#
# Input:     $1 - message number passed to printErrorMsg
#            $2 - message insert passed to printErrorMsg
#
# Output:    The error message and the usage text are printed;
#            the script exits with return code 1.
#
############################################################################
function printAndExit
{
  [[ -n $DEBUGgpfssnap || -n $DEBUGprintAndExit ]] && set -x

  printErrorMsg $1 gpfs.snap $2
  print "$USAGE"
  exit 1

}   #----- end of function printAndExit -------------------------


############################################################################
#
# Function:  getCurrentStanzaList2
#
# Input:     $1 - name of the file that receives the lsfs -c output
#
# Returns:   0 on success, or when lsfs reports "unknown vfs type"
#            (the output file then holds a header-like line only);
#            otherwise the lsfs return code.
#
############################################################################
function getCurrentStanzaList2
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGgetCurrentStanzaList2 ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset outfile=$1
  typeset rc=0

  $rm -f $outfile

  #-------------------------------------------------------------------
  # Generate a list of the GPFS file systems in /etc/filesystems.
  # The output of the AIX lsfs -c command looks something like this:
  #
  #   #MountPoint:Device:Vfs:Nodename:Type:Size:Options:AutoMount:Acct
  #   /gpfs/gpfsA:/dev/gpfsA:mmfs:-:mmfs:0:rw:no:no
  #   /gpfs/gpfsB:/dev/gpfsB:mmfs:-:mmfs:0:rw:no:no
  #-------------------------------------------------------------------
  # Tracing is switched off around the lsfs call and restored afterwards.
  set +x
  LC_ALL=C $lsfs -c -v mmfs > $outfile 2>&1
  rc=$?
  [[ -n $DEBUGgpfssnap || -n $DEBUGgetCurrentStanzaList2 ]] && set -x
  if [[ $rc -ne 0 ]]
  then
    # Check whether this is a 'not found error'.
    $grep -q "unknown vfs type" $outfile
    if [[ $? = 0 ]]
    then
      # 'not found' is acceptable.  Reset the return code
      # and create a file with an lsfs header-like line only.
      rc=0
      print -- "#MountPoint:Device:Vfs:junk" > $outfile
    else
      # If some other error, show the error messages.
$cat $outfile
    fi
  fi

  return $rc

}   #----- end of function getCurrentStanzaList2 ----------------


#############################################
# Mainline processing MAIN main
#
# Flow:
#   - set up the environment and defaults
#   - parse the command line options
#   - build the file of affected nodes ($nodefile)
#   - check the output directory, identify this node and its GPFS level
#   - if -c was given: run the command string on the nodes and exit
#   - on the master: problem determination and do_master_stuff
#   - pass 1: estimate the space needed;  pass 2: collect the data
#   - build the final tar file(s) and remove temporary files
#############################################
args=$@

set -A months Yam Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
set -A days 0 31 29 31 30 31 30 31 31 30 31 30 31
export K5MUTE=1
GPFSDIR=/usr/lpp/mmfs/bin
export PATH=/bin:/usr/bin:/etc:/usr/sbin:/sbin:$GPFSDIR
export LANG=en_US
export LC_MESSAGES=C
export LC_TIME=C

# Full path name of this script.
pwd=$(pwd)
result=$(echo $0 | $grep "^"/)
if [[ -n $result ]]
then
  spath=$0
else
  spath=${pwd}/$0
fi
SNAP=$(dirname $spath)

pass=1
total_bytes=0
max_bytes=0
BASELOGDIR=/tmp/gpfs.snapOut
$mkdir ${BASELOGDIR} 2>/dev/null
logdate=$(date +\%m\%d\%H\%M)
my_hostname=$($hostname | $cut -d. -f1)
# Remove the status files of earlier runs.
$rm -rf ${BASELOGDIR}/gpfs.snap_err.*.out 2>/dev/null
$rm -rf ${BASELOGDIR}/gpfs.snap_info.*.out 2>/dev/null
$rm ${BASELOGDIR}/problem.${my_hostname} 2>/dev/null
YAMO=/tmp/yamo
ODMDIR=/etc/objrepos
SPENV=0
os=$($uname)

# The log directory may be overridden by a logDir line in the config file.
if [[ -f $mmfscfg ]]
then
  logDir=$($awk '$1 == "logDir" {value = $2} END {print value}' $mmfscfg)
  [[ -n $logDir ]] && rasDir="${logDir}/"
fi
[[ -z $rasDir ]] && rasDir=/var/adm/ras

if [[ $os = "AIX" ]]
then
  alevel=$($lslpp -L bos.rte | $grep bos.rte | $awk '{print $2}')
  sp_version=$($lslpp -Lc ssp.basic 2>/dev/null | $grep ssp.basic | $cut -f3 -d :)
  if [[ -n $sp_version ]]
  then
    sp_version=${sp_version%.#}
    SPENV=1
    my_node_number=$(/usr/lpp/ssp/install/bin/node_number)
    PATH=$PATH:/usr/lpp/ssp/bin:/usr/lpp/csd/bin
  fi
else
  alevel=$($uname -rv)
fi  # end of if [[ $os = "AIX" ]]

myhname=$($hostname)
if [[ -a /usr/bin/vmstat ]]
then
  gotvmstat=1
fi

# gpfs.snap [-c "CmdString"] [-d OutputDirectory] [-p] [-x {1 | 2}] [-y | -z]
#           [-a | -W NodeFilename | -w NodeName[,NodeName...] |
#            -n NodeNumber[,NodeNumber...]]
USAGE=\
"Usage:\n"\
" gpfs.snap [-c \"CmdString\"] [-d OutputDirectory] [-p] [-x {1 | 2}] [-y | -z]\n"\
" [ -a | -W NodeFilename | -w NodeName[,NodeName...] | -n NodeNumber[,NodeNumber...]]"

master=1

# NOTE(review): arg1 and ARG1 are not assigned in the visible part of
# this file -- confirm that they are set before this point.
if [[ $arg1 = '-?' || $ARG1 = '-H' || $ARG1 = '--HELP' || $arg1 = '--' ]]
then
  print $USAGE
  exit 1
fi

while getopts :ac:d:Dn:pw:W:x:yz OPT
do
  case $OPT in

    a) [[ -n $aflag ]] && printAndExit 36 "-$OPT"
       aflag="-$OPT"
       all="all"
       if [[ -n $nflag || -n $wflag || -n $Wflag ]]
       then
         [[ -n $nflag ]] && printErrorMsg 191 gpfs.snap "-a" "-n"
         [[ -n $wflag ]] && printErrorMsg 191 gpfs.snap "-a" "-w"
         [[ -n $Wflag ]] && printErrorMsg 191 gpfs.snap "-a" "-W"
         print $USAGE
         exit 1
       fi
       ;;

    c) [[ -n $cflag ]] && printAndExit 36 "-$OPT"
       cflag="-$OPT"
       cmdString="$OPTARG"
       ;;

    d) d_argument=$OPTARG
       # Create the directory in the background so that a hung
       # file system cannot hang the script.
       $mkdir -p $d_argument 2>/dev/null &
       waitforit
       if [[ $? = 1 ]]
       then
         print -u2 "Write to $d_argument timed out. Choose another directory or take the default (/tmp)"
         exit 1
       fi
       BASELOGDIR=$d_argument
       $rm -rf ${BASELOGDIR}/gpfs.snap_err.*.out 2>/dev/null
       $rm -rf ${BASELOGDIR}/gpfs.snap_info.*.out 2>/dev/null
       $rm ${BASELOGDIR}/problem.${my_hostname} 2>/dev/null
       $mv /tmp/gpfs.snapOut/${BASELOGDIR}/gpfs.snap_err.${logdate}.out ${BASELOGDIR} 2>/dev/null
       ;;

    D) DEBUGgpfssnap=1
       exec 2>/tmp/gpfs.snap.debug
       set -x
       echo "Writing debug data and redirecting stderr to /tmp/gpfs.snap.debug" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
       ;;

    n) [[ -n $nflag ]] && printAndExit 36 "-$OPT"
       nflag="-$OPT"
       nodenums="$OPTARG"
       if [[ -n $aflag || -n $Wflag ]]
       then
         [[ -n $aflag ]] && printErrorMsg 191 gpfs.snap "-n" "-a"
         [[ -n $Wflag ]] && printErrorMsg 191 gpfs.snap "-n" "-W"
         print $USAGE
         exit 1
       fi
       ;;

    p) pflag=1
       ;;

    w) [[ -n $wflag ]] && printAndExit 36 "-$OPT"
       wflag="-$OPT"
       nodenames="$OPTARG"
       # The second operand must be "$Wflag": the bare word "Wflag" is a
       # non-empty string, which made this test always true and caused
       # every use of -w to end with the usage message.
       if [[ -n $aflag || -n $Wflag ]]
       then
         [[ -n $aflag ]] && printErrorMsg 191 gpfs.snap "-w" "-a"
         [[ -n $Wflag ]] && printErrorMsg 191 gpfs.snap "-w" "-W"
         print $USAGE
         exit 1
       fi
       ;;

    W) [[ -n $Wflag ]] && printAndExit 36 "-$OPT"
       Wflag="-$OPT"
       wcoll="$OPTARG"
       if [[ -n $aflag || -n $nflag || -n $wflag ]]
       then
         [[ -n $aflag ]] && printErrorMsg 191 gpfs.snap "-W" "-a"
         [[ -n $nflag ]] && printErrorMsg 191 gpfs.snap "-W" "-n"
         [[ -n $wflag ]] && printErrorMsg 191 gpfs.snap "-W" "-w"
         print $USAGE
         exit 1
       fi
       ;;

    x) xflag=1
       x_arg=$OPTARG
       if [[ $x_arg != 1 ]] && [[ $x_arg != 2 ]]
       then
         print "Illegal argument to option x: $x_arg"
         print "$USAGE"
         exit 1
       fi
       ;;

    y) yflag=1
       if [[ $zflag = 1 ]]
       then
         printErrorMsg 191 gpfs.snap y z
         print "$USAGE"
         exit 1
       fi
       ;;

    z) zflag=1
       if [[ $yflag = 1 ]]
       then
         printErrorMsg 191 gpfs.snap y z
         print "$USAGE"
         exit 1
       fi
       master=0
       ;;

    :) printAndExit 204 $OPTARG
       ;;

    +[acdDnpwWxyz])
       printAndExit 13 "$OPT"
       ;;

    *) printAndExit 13 $OPTARG
       ;;

  esac
done  # end of while getopts do

shift OPTIND-1
[[ $# != 0 ]] && printAndExit 38 $1

# If no node selection option was specified, default to -a.
[[ -z $aflag && -z $nflag && -z $wflag && -z $Wflag ]] && \
  aflag="-a"

########################################################################
# Set up trap exception handling and call the gpfsInit function.
# It will ensure that the local copy of the mmsdrfs and the rest of the
# GPFS system files are up-to-date.  There is no need to lock the sdr.
########################################################################
trap pretrap2 HUP INT QUIT KILL
gpfsInitOutput=$(gpfsInit nolock)
setGlobalVar $? $gpfsInitOutput

######################################################
# Create a file with the names of all affected nodes.
######################################################
$rm -f $nodefile 2>/dev/null
$touch -f $nodefile
if [[ -n $aflag ]]
then
  # Get a list of the nodes that belong to the cluster.
  getNodeList $REL_HOSTNAME_Field $GLOBAL_ID $mmsdrfsFile > $nodefile

  # If there are no nodes, issue an appropriate message and return.
  if [[ ! -s $nodefile ]]
  then
    print -u2 "$mmcmd: There are no known GPFS nodes."
    exit 1
  fi

elif [[ -n $Wflag ]]
then
  # Verify input file is readable.
  if [[ ! -f $wcoll || ! -r $wcoll ]]
  then
    printErrorMsg 43 $mmcmd $wcoll
    exit 1
  fi

  # Filter out comment lines and localhost entries.
  $grep -v -e "localhost" -e "^#" "$wcoll" > $nodefile
  if [[ ! -s $nodefile ]]
  then
    # No node names specified
    printErrorMsg 328 $mmcmd $wcoll
    exit 1
  fi

else
  # Either no option was specified, or we have some combination of -w and -n.

  # Convert the node names list into a file.
  for i in $(print $nodenames | $tr "," " ")
  do
    print $i >> $nodefile
  done

  # Convert the node number list into node names
  # and append the names to the file.
  for i in $(print $nodenums | $tr "," " ")
  do
    nodeName=$(getNodeInfo \
      $REL_HOSTNAME_Field $NODE_NUMBER_Field $i $GLOBAL_ID $mmsdrfsFile)
    if [[ -n $nodeName ]]
    then
      print $nodeName >> $nodefile
    else
      # Node number is not in cluster
      printErrorMsg 352 $mmcmd $i
    fi
  done

  # If none of the node numbers resolved correctly, give up.
  [[ ! -s $nodefile && ( -n $nodenames || -n $nodenums ) ]] && exit 1

fi  # end of if [[ -n $aflag ]]

if [[ $master = 1 && -z $cflag ]]
then
  checklist
  SUBDIR=${my_hostname}.master.${logdate}
else
  SUBDIR=${my_hostname}.${logdate}
fi
tarfile=${BASELOGDIR}/gpfs.snap.${SUBDIR}.out.tar
LOGDIR=${BASELOGDIR}/${SUBDIR}

if [[ -z $cflag ]]
then
  echo "$SNAP/gpfs.snap version $VERSION started at $starttime with args:\n$args\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  echo "My process id is $$"
fi
# Process group id of this script; used by waitforit to find children.
mypgid=$($ps -p $$ -o pgid=PGID | $tail -n -1)

# Check the output directory for space.
basedir=/$(echo $BASELOGDIR | $cut -f2 -d "/")
if [[ $os = "AIX" ]]
then
  $df $basedir | $tail -n -1 | $awk '{print $4}' >/tmp/yamo &
else
  $df $basedir | $tail -n -1 | $awk '{print $5}' >/tmp/yamo &
fi
waitforit "$df $basedir" 60
if [[ $? = 1 ]]
then
  print -u2 "df on $basedir timed out. Solve the problem with $basedir or specify a different directory with -d."
  exit 1
fi
per=$($cat /tmp/yamo)
if [[ $per = 100% ]]
then
  print -u2 "$basedir is 100% full. Specify a different directory with -d or clear space."
  exit 1
fi

$mkdir -p ${LOGDIR}
cd $BASELOGDIR

mmsdrfsfile=/var/mmfs/gen/mmsdrfs

# If the node does not belong to a GPFS cluster, go away quietly.
if [[ ! -f $mmsdrfsfile ]]
then
  print -u2 "The node does not belong to a GPFS cluster ($mmsdrfsfile does not exist). Exiting."
  return 0
fi

determineMode
getLocalNodeData
mygnum=$ourNodeNumber
mygname=$ourNodeName
mynodeset=$(findNodesetId $mmsdrfsfile $mygnum)
if [[ $mynodeset = "%%home%%" ]]
then
  mynodeset2=$($grep clusterName $mmsdrfsfile | $grep %%home%% | $cut -f2 -d " " | $cut -f1 -d ":")
else
  mynodeset2=$mynodeset
fi

# If the local node name is still unknown, look for the MEMBER_NODE line
# whose address is configured on one of the local interfaces.
if [[ -z $mygname ]]
then
  $grep MEMBER_NODE $mmsdrfsfile >/tmp/mmsdrfs2.tmp
  {
    while read line
    do
      rhname=$(echo $line | $cut -f8 -d ":")
      addr=$($ping -c1 -w5 $rhname | $head -n 1 | $cut -f2 -d "(" | $cut -f1 -d ")")
      ilist=$(netstat -i | $awk '{print $1}' | $grep -v -E "Iface|Kernel|Name")
      for i in $ilist
      do
        $ifconfig $i | $grep $addr >/dev/null
        if [[ $? = 0 ]]
        then
          mynodeset=$(echo $line | $cut -f1 -d :)
          mygname=$(echo $line | $cut -f6 -d :)
          mygnum=$(echo $line | $cut -f5 -d :)
          break
        fi
      done
      if [[ -n $mygname ]]
      then
        break
      fi
    done
  } < /tmp/mmsdrfs2.tmp
  $rm /tmp/mmsdrfs2.tmp 2>/dev/null
fi  # end of if [[ -z $mygname ]]

if [[ $os = "AIX" ]]
then
  [[ -z $cflag ]] && \
    echo "I am hostname $myhname running AIX level $alevel" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  gpfs_version=$($lslpp -Lc gpfs.base 2>/dev/null | $grep gpfs.base | $cut -f3 -d :)
  gpfs_version=${gpfs_version%.#}
  if [[ -z $gpfs_version ]]
  then
    gpfs_version=$($lslpp -Lc mmfs.base.rte 2>/dev/null | $grep mmfs.base.rte | $cut -f3 -d :)
    gpfs_version=${gpfs_version%.#}
  fi
fi  # end of if [[ $os = "AIX" ]]

if [[ $SPENV = 1 ]]
then
  [[ -z $cflag ]] && \
    echo "I am SP node $my_node_number running $sp_version" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
fi

if [[ $os = "Linux" ]]
then
  [[ -z $cflag ]] && \
    echo "I am $myhname running Linux level $alevel" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  gpfs_version=$($rpm -q gpfs.base | $awk 'BEGIN{FS="-"} {print $2"-"$3}')
fi

if [[ -z $gpfs_version ]]
then
  print -u2 "\nGPFS does not appear to be installed on this machine."
  $rm -r ${LOGDIR} 2>/dev/null
  exit 1
fi

# From version 2.3 on the node group is called "cluster".
rel1=$(echo $gpfs_version | $cut -f1 -d ".")
rel2=$(echo $gpfs_version | $cut -f2 -d ".")
if [[ $rel1 -gt 2 ]] || [[ $rel1 -eq 2 && $rel2 -ge 3 ]]
then
  groupname="cluster"
else
  groupname="nodeset"
fi
[[ -z $cflag ]] && \
  echo "I am gpfs node $mygname number $mygnum in $groupname $mynodeset2 running GPFS version $gpfs_version" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out

dumpdir=$($cat /var/mmfs/etc/mmfs.cfg | $grep "^dataStructureDump " | $head -1 | $awk '{print $2}')
if [[ -z $dumpdir ]]
then
  dumpdir="/tmp/mmfs"
fi

# If the -c "run this command string on the nodes" option was specified,
# just collect the data, cleanup temporary files, and exit early.
if [[ -n $cflag ]]
then
  $mmdsh -F $nodefile K5MUTE=1 ksh -c \"PATH=$PATH $cmdString\" 2>/dev/null
  $rm -r ${LOGDIR} 2>/dev/null
  $rm -f $nodefile 2>/dev/null
  $rm -f $LOCAL_FILES 2>/dev/null
  $rm /tmp/hostfile 2>/dev/null
  exit 0
fi  # end of if [[ -n $cflag ]]

# Collect data on the master node if so desired.
if [[ $master = 1 ]]
then
  if [[ $pflag != 1 ]]
  then
    print "Checking configuration files . . ."
    nlist2=$($cat $nodefile2)
    check_fs $nodefile2 "$nlist2"
    brcFile=$trcFile
    bglist=$tglist
    check_files /var/mmfs/gen/mmsdrfs $nodefile2
    print "Checking for waiters . . ."
    check_waiters
    if [[ $os = "Linux" ]]
    then
      set +x
      getCurrentStanzaList stanzafile
      [[ $DEBUGgpfssnap = 1 ]] && set -x
    else
      getCurrentStanzaList2 stanzafile
    fi
    # Second field of each stanza line is the device; the leading
    # "Device" header word is stripped.
    fslist=$($cat stanzafile | $cut -f2 -d :)
    fslist=${fslist#Device}
    $rm stanzafile 2>/dev/null
  fi  # end of if [[ $pflag != 1 ]]

  do_master_stuff

  # Rebuild the node list (do_master_stuff may have changed $nodefile).
  firstone=1
  $rm -f $nodefilecFile
  list=$($cat $nodefile)
  for i in $list
  do
    if [[ $firstone = 1 ]]
    then
      nodefilelist=$i
      print -- $i > $nodefilecFile
      firstone=0
    else
      nodefilelist="$nodefilelist $i"
      print -- $i >> $nodefilecFile
    fi
  done
  check_fs $nodefilecFile "$nodefilelist" "$hostarray"
  rlist=$trlist
  rcFile=$trcFile
  glist=$tglist
fi  # end of if [[ $master = 1 ]]

all="-1"
check_dumps internaldump $all $dumpdir
check_dumps trcrpt $all "/tmp/mmfs"

if [[ $x_arg = 2 ]]
then
  pass=2
else
  print "\nDetermining whether there is enough space in ${BASELOGDIR} . . .\n"
fi

#export mmdshCommandsFile=${BASELOGDIR}/commandfile
while [[ $pass -le 2 ]]
do
  print "Processing log files . . ."
  get_files
  if [[ $master = 1 && $pass = 2 ]]
  then
    if [[ -s $rcFile ]]
    then
      $mmdsh -F $rcFile K5MUTE=1 cat ${BASELOGDIR}/${logdate}/problem.\* 2>/dev/null | tee -a ${BASELOGDIR}/problem.${my_hostname}
    fi
    for i in $glist
    do
      $cat ${BASELOGDIR}/${logdate}/problem.$i 2>/dev/null | $tee -a ${BASELOGDIR}/problem.${my_hostname}
    done
    # Question: What is the "node_args" variable for?
    $mmdsh -F $nodefile K5MUTE=1 $SNAP/gpfs.snap $node_args -d ${BASELOGDIR}/$(hostname | cut -d. -f1)_${logdate} -x 2 -z >/dev/null 2>/dev/null &
  fi

  if [[ $pass = 1 && $gotvmstat = 1 ]]
  then
    ksh -c "vmstat 5 5" > ${LOGDIR}/vmstat_5_5 &
  fi

  if [[ -s ${LOGDIR}/long_waiters.sorted ]]
  then
    if [[ $pass = 1 ]]
    then
      size=$($ls -l ${LOGDIR}/long_waiters.sorted | $awk '{print $5}')
      addit $size "${LOGDIR}/long_waiters.sorted"
    else
      tarit long_waiters.sorted
    fi
  fi

  if [[ $master = 1 && $pflag != 1 ]]
  then
    doit "dump_list" "$mmdsh -F $nodefile2 ls -l $dumpdir/internaldump\* 2>/dev/null"
    print "Processing waiters . . ."
    check_waiters2
    print "Processing configuration files . . ."
    check_files2
  fi  # end of if [[ $master = 1 && $pflag != 1 ]]

  print "Running mm commands . . ."
  if [[ $master = 1 || $yflag = 1 ]]
  then
    doit "mmlsconfig" "/usr/lpp/mmfs/bin/mmlsconfig"
    doit "mmlsmgr" "/usr/lpp/mmfs/bin/mmlsmgr"
    doit "mmlsnode_a" "/usr/lpp/mmfs/bin/mmlsnode -a"
    doit "mmgetstate_a" "/usr/lpp/mmfs/bin/mmgetstate -a"
    doit "tsstatus" "tsstatus"
    # need full pathname for some of these for waitforit to handle properly
    for i in $fslist
    do
      doit "mmdf" "/usr/lpp/mmfs/bin/mmdf $i -q" 1
      doit "mmlsfs" "/usr/lpp/mmfs/bin/mmlsfs $i" 1
      doit "mmlsdisk" "/usr/lpp/mmfs/bin/mmlsdisk $i -L" 1
      doit "mmlspolicy" "/usr/lpp/mmfs/bin/mmlspolicy $i" 1
      doit "mmlspolicy" "/usr/lpp/mmfs/bin/mmlspolicy $i -L" 1
      doit "mmlsfileset" "/usr/lpp/mmfs/bin/mmlsfileset $i" 1
      doit "mmlsfileset" "/usr/lpp/mmfs/bin/mmlsfileset $i -L" 1
      doit "mmlssnapshot" "/usr/lpp/mmfs/bin/mmlssnapshot $i -d -Q" 1
    done
    doit "mmlscluster" "mmlscluster"
    doit "mmlsnsd" "mmlsnsd -L" 1
    doit "mmlsnsd" "mmlsnsd -X"
    doit "mmremotecluster" "mmremotecluster show all"
    doit "mmremotefs" "mmremotefs show all"
    doit "mmauth" "mmauth show"
  fi  # end of if [[ $master = 1 || $yflag = 1 ]]

  # Be careful not to dump live data that may assert or segfault.
  # We can always ask for additional data later.
  print "Processing dumps . . ."
  doit "mmfsadm_dump_some" "mmfsadm dump version" 1
  doit "mmfsadm_dump_some" "mmfsadm dump waiters" 1
  doit "mmfsadm_dump_some" "mmfsadm dump cfgmgr" 1
  doit "mmfsadm_dump_some" "mmfsadm dump tscomm" 1
  doit "mmfsadm_dump_some" "mmfsadm dump config" 1
  doit "mmfsadm_dump_some" "mmfsadm dump mutex" 1
  doit "mmfsadm_dump_some" "mmfsadm dump sgmgr" 1
  doit "mmfsadm_dump_some" "mmfsadm dump stripe" 1
  doit "mmfsadm_dump_some" "mmfsadm dump malloc" 1
  doit "mmfsadm_dump_some" "mmfsadm dump fs" 1
  doit "mmfsadm_dump_some" "mmfsadm dump mmap" 1
  doit "mmfsadm_dump_some" "mmfsadm dump nsd" 1
  doit "mmfsadm_dump_some" "mmfsadm dump disk" 1
  doit "mmfsadm_dump_some" "mmfsadm dump alloc stats" 1
  doit "mmfsadm_dump_some" "mmfsadm dump alloc hist" 1
  doit "mmfsadm_dump_some" "mmfsadm dump dealloc stats" 1
  doit "mmfsadm_dump_some" "mmfsadm dump allocmgr" 1
  doit "mmfsadm_dump_some" "mmfsadm dump allocmgr stats" 1
  # Last call has a null output control flag: the log goes into the tar file.
  doit "mmfsadm_dump_some" "mmfsadm dump allocmgr hist"

  print "Processing common files . . ."
  get_always
  print "Processing network info . . ."
  get_net_stuff
  print "Processing lvm info . . ."
  get_lvm_stuff

  if [[ $os = "AIX" ]]
  then
    console=$(/usr/sbin/lscons)
    if [[ -f $console ]]
    then
      if [[ $pass = 1 ]]
      then
        temp_bytes=$($ls -l $console | $awk '{ print $5 }')
        addit $temp_bytes "$console"
      else
        $cp $console ${LOGDIR}/console
        tarit "console"
      fi
    fi
  fi  # end of if [[ $os = "AIX" ]]

  print "Processing miscellaneous files . . ."
  get_files_list "/etc" "fstab filesystems trcfmt syslog.conf"
  get_files_dir "/var/mmfs/etc"
  get_files_dir "/var/mmfs/gen"
  get_files_dir "/var/mmfs/ssl"
  get_files_dir "/var/mmfs/ssl/stage"
  get_files_dir "/var/mmfs/tmp"
  get_files_list "$dumpdir" "$internal_list" internaldumps

  savedir=$(pwd)
  cd /var/log 2>/dev/null
  mlist=$($ls messages* 2>/dev/null)
  cd $savedir
  if [[ -n $mlist ]]
  then
    get_files_list "/var/log" "$mlist"
  fi

  # Whichever daemon / kernel extension binaries exist on this node.
  mlist=""
  [[ -s /usr/lpp/mmfs/bin/mmfslinux ]] && mlist="$mlist mmfslinux"
  [[ -s /usr/lpp/mmfs/bin/mmfs26 ]] && mlist="$mlist mmfs26"
  [[ -s /usr/lpp/mmfs/bin/mmfs24 ]] && mlist="$mlist mmfs24"
  [[ -s /usr/lpp/mmfs/bin/mmfs ]] && mlist="$mlist mmfs"
  if [[ -n $mlist ]]
  then
    get_files_list "/usr/lpp/mmfs/bin" "$mlist"
  fi

  # Get info for whatever group services/topopology services pairs are running.
  if [[ $SPENV = 1 && $my_node_number = 0 ]]
  then
    syspar=$(/usr/lpp/ssp/bin/spget_syspar -n)
    syspar=".$syspar"
  fi

  if [[ $SPENV = 1 ]]
  then
    doit "lssrc_rvsd" "lssrc -g rvsd" 1
    doit "lsvsd_l" "lsvsd -l"
    if [[ -s ./${SUBDIR}/mmsdrfs2 ]]
    then
      if [[ $pass = 1 ]]
      then
        temp_bytes=$($ls -l ./${SUBDIR}/mmsdrfs2 | $awk '{ print $5 }')
        addit $temp_bytes "mmsdrfs2"
      else
        tarit "mmsdrfs2"
      fi
    fi
  fi  # end of if [[ $SPENV = 1 ]]

  if [[ $master = 1 ]]
  then
    if [[ $pass = 1 ]]
    then
      print "Waiting for remote nodes to report space requirements . . ."
    else
      print "Waiting for remote nodes to collect data . . ."
    fi
    wait
  fi

  if [[ $pass = 1 ]]
  then
    if [[ $gotvmstat = 1 ]]
    then
      # Estimate for "vmstat 5 5": five times the size of one report.
      (( tmpval = $( vmstat | $wc -c ) * 5 ))
      addit $tmpval
    fi
    check_space
    print "It appears we have enough space.\n"
    if [[ $x_arg = 1 ]]
    then
      $rm -r ${LOGDIR} 2>/dev/null
      if [[ $master = 1 ]]
      then
        $rm -r $NODESDIR 2>/dev/null
      fi
      exit 0
    fi
  fi  # end of if [[ $pass = 1 ]]

  if [[ $pass = 2 ]]
  then
    # Add the log files that were left open (output control flag 1).
    [[ $gotvmstat = 1 ]] && tarit "vmstat_5_5"
    tarit "mmdf"
    tarit "mmlsdisk"
    tarit "mmlsfs"
    tarit "mmlspolicy"
    tarit "mmlsfileset"
    tarit "mmlssnapshot"
  fi  # end of if [[ $pass = 2 ]]

  pass=$(expr $pass + 1)
done  # end of while [[ $pass -le 2 ]] do

if [[ -s gpfs.snap_err.${logdate}.out ]]
then
  $cp gpfs.snap_err.${logdate}.out ${SUBDIR}/gpfs.snap_err.${logdate}.out
  tar -rf $tarfile ${SUBDIR}/gpfs.snap_err.${logdate}.out
fi
$rm gpfs.snap_err.out.temp 2>/dev/null

if [[ -a problem.${my_hostname} ]]
then
  if [[ -s problem.${my_hostname} ]]
  then
    $cp problem.${my_hostname} ${SUBDIR}/problem.${my_hostname}
    # Add the copy inside ${SUBDIR}, like the err and info files, so that
    # all members of the tar file live under the same directory.
    tar -rf $tarfile ${SUBDIR}/problem.${my_hostname}
  else
    $rm problem.${my_hostname} 2>/dev/null
  fi
fi  # end of if [[ -a problem.${my_hostname} ]]

endtime=$(date)
echo "gpfs.snap near completion at $endtime" >> ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
if [[ -a gpfs.snap_info.${logdate}.out ]]
then
  $cp gpfs.snap_info.${logdate}.out ${SUBDIR}/gpfs.snap_info.${logdate}.out
  tar -rf $tarfile ${SUBDIR}/gpfs.snap_info.${logdate}.out
fi

# Compress with compress when available, otherwise with gzip.
if [[ -a /bin/compress ]]
then
  gotcompress=1
  compress $tarfile
  suff="Z"
else
  gzip $tarfile
  suff="gz"
fi

if [[ $master = 1 ]]
then
  # Start the combined tar file with the local snap.
  basetar=$(basename ${tarfile})
  tar -cf $bigtarfile ${basetar}.${suff}
  $rm ${tarfile}.${suff}
  wait
  if [[ -s $nodefile ]]
  then
    print "Getting snaps from remote nodes . . ."
    if [[ -s $rcFile ]]
    then
      $mmdsh -F $rcFile K5MUTE=1 $rcp ${BASELOGDIR}/$(hostname | cut -d. -f1)_${logdate}/gpfs.snap.\*.out.tar.\* $myhname:${BASELOGDIR}
    fi
    # NOTE(review): nothing in the visible code writes the file rcperr --
    # confirm whether the command above was meant to redirect stderr to it.
    if [[ -s rcperr ]]
    then
      print "The following nodes had trouble sending the snap file:" | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      $cat rcperr | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
    fi
    $rm rcperr 2>/dev/null
    g=0
    for i in $glist
    do
      short=$(echo ${ghostarray[$g]} | $cut -f1 -d .)
      $mv ${BASELOGDIR}/${short}_${logdate}/gpfs.snap.${short}.*.out.tar.* ${BASELOGDIR}
      $rm -r ${BASELOGDIR}/${short}_${logdate}
      (( g = g + 1 ))
    done
    tarlist=$($ls gpfs.snap.*.out.tar.*)
    if [[ -n $tarlist ]]
    then
      tar -rf $bigtarfile $tarlist
    fi
    $rm $tarlist 2>/dev/null
  fi
  print "###############################################################################"
  print "Send file ${bigtarfile} to IBM Service"
else
  print "###############################################################################"
  print "Send file ${tarfile}.${suff} to IBM Service"
fi  # end of if [[ $master = 1 ]]

# Remove temporary files.
$rm -r ${LOGDIR} 2>/dev/null
$rm -r ${NODESDIR} 2>/dev/null
$rm ${BASELOGDIR}/*waiters 2>/dev/null
$rm -f $nodefile 2>/dev/null
$rm -f $LOCAL_FILES 2>/dev/null
$rm /tmp/hostfile 2>/dev/null

endtime=$(date)
echo "gpfs.snap completed at $endtime"
exit 0