#!/bin/ksh
# IBM_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
#
#
# Licensed Materials - Property of IBM
#
# (C) COPYRIGHT International Business Machines Corp. 2003,2005
# All Rights Reserved
#
# US Government Users Restricted Rights - Use, duplication or
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
#
# IBM_PROLOG_END_TAG
# @(#)55 1.18 src/avs/fs/mmfs/ts/admin/mmexectsmcmd.sh, mmfs, avs_rgpfs24, rgpfs240610b 2/11/05 11:32:21
##############################################################################
#
# This script issues TSM commands from a GPFS node that is a TSM client.
#
# It accepts the following arguments:
#   1) The mount point of the pertinent file system
#   2) The TSM operation to be performed; supported operations are:
#        - selective
#        - incremental
#        - expire
#        - restore  (no longer used since tsrestorefile was dropped)
#   3) A qualifier for the fourth parameter:
#        - filelist    This value indicates that the fourth parameter
#                      is the value for the Tivoli filelist=<> option.
#        - nofilelist  This value indicates that the fourth parameter
#                      is not to be used as a Tivoli filelist parameter.
#                      In this case, the Tivoli filelist option is not used.
#   4) If the 3rd parameter was
#        - filelist    then the 4th parameter is the full path name of the
#                      file containing the filenames to be operated on.
#        - nofilelist  then the 4th parameter is another parameter to be
#                      passed to the Tivoli command.
#   5) The nodename of the master tsbackup process (used for locking)
#   6) The pid number of the master tsbackup process (used for locking)
#   7) The backup client process index (used for file naming)
#   8) The name of the TSM server (checked against dsm.opt file)
#   9) The I/O rate level (used to allow other non-backup processes to run)
#
# Alternatively the script accepts exactly two arguments:
#   1) The literal keyword "givestatus"
#   2) The number of seconds to sleep between status messages
#
# The backup client process index is used for constructing the name of
# the file for pending (not successfully completed) transactions.
#
# The program returns:
#   0 on success (i.e., all files were successfully backed up)
#   1 on partial success (i.e., some but not all files were backed up)
#   2 on failure (i.e., no success at all)
#
##############################################################################

# Include global declarations and service routines
. /usr/lpp/mmfs/bin/mmglobfuncs
. /usr/lpp/mmfs/bin/mmsdrfsdef
. /usr/lpp/mmfs/bin/mmfsfuncs

sourceFile="mmexectsmcmd.sh"
[[ -n $DEBUG || -n $DEBUGmmexectsmcmd ]] && set -x
$mmTRACE_ENTER "$*"

#-------------------------------------------------
# Local work files.  Names should be of the form:
#   fn=${tmpDir}fn.${mmcmd}.$$
#-------------------------------------------------
#tmpCtrlFile=${tmpDir}tmpCtrlFile.mmbackup.$$

#LOCAL_FILES=" $tmpCtrlFile "

#------------------
# Global variables
#------------------

# Exit codes (see "The program returns" above).  They are defined before
# anything that can fail so that every exit path can use them.
rc_success=0
rc_psuccess=1
rc_fail=2

# Timestamp used later to build a unique TSM error log directory name.
dateTime=$($date +"%y%m%d_%H:%M:%S")
#tsmDate=$(date '+%m/%d/%y')
#tsmTime=$(date '+%H:%M:%S')

dsmc=/usr/bin/dsmc

# Locate the TSM client options file.  An explicit DSM_CONFIG setting
# wins; otherwise fall back to the per-platform default location.
if [[ -n $DSM_CONFIG ]]
then
  dsmoptfile=$DSM_CONFIG
elif [[ $osName = AIX ]]
then
  dsmoptfile=/usr/tivoli/tsm/client/ba/bin/dsm.opt
elif [[ $osName = Linux ]]
then
  dsmoptfile=/opt/tivoli/tsm/client/ba/bin/dsm.opt
else
  print -u2 " Unknown operating system $osName "
  # Exit with the failure code.  A return code of 1 is documented as
  # "partial success" and must not be used for a hard failure.
  exit $rc_fail
fi

# Strings looked for in the TSM error log (dsmerror.log).
searchErrorString="No files matching search criteria"
objectErrorString="No objects on server match"
errorProcessingString="Error processing"

# Result keywords that prefix every message printed for the caller.
opf=Operation_Failure
opps=Operation_Partial_Success
ops=Operation_Success

# Base name of the pending transactions file, the lock directory,
# and the program name used in messages.
tpl=".mmbuTSMPendingTransactions"
lockdir="/var/mmfs/etc/mmbackuplock"
pgm="mmexectsmcmd"

#-----------------
# local routines
#-----------------

#--------------------------------------------------------------
# This function is called if there is an interrupt before
# we have obtained a backup lock.
#--------------------------------------------------------------
function localTrap
{
  # No lock is held at this point, so the ordinary cleanup suffices.
  doCleanupAndExit "$rc_fail"
}


#--------------------------------------------------------------
# This function is called if there is an interrupt after
# we have obtained a backup lock.
#--------------------------------------------------------------
function localTrap2
{
  # The lock directories must be removed before leaving.
  freeLockAndExit "$rc_fail"
}


#--------------------------------------------------------------
# getBackupLock
#
# Obtain a lock before proceeding.
#
# Here is the mechanism by which locks are managed and used:
#
#  1) Begin to obtain a lock by issuing:  mkdir lockdir
#  2) If (1) succeeds, no one else has a lock.  Finish obtaining your
#     lock by issuing mkdirs for lockdir/masternode_masterpid and
#     lockdir/masternode_masterpid/mylocalpid.  You now have a valid lock.
#  3) If (1) fails, perhaps one of your local sibling processes already
#     has created lockdir and lockdir/masternode_masterpid (as well as
#     lockdir/masternode_masterpid/hislocalpid for his own use).  Try to
#     obtain a lock by doing mkdir lockdir/masternode_masterpid/mylocalpid.
#     This will fail if lockdir/masternode_masterpid does not exist and
#     succeed if it does.  There is a small chance that there is a sibling
#     process that is obtaining a lock and has created lockdir but has
#     not yet created lockdir/masternode_masterpid.  This could be solved
#     by trying the mkdir for lockdir/masternode_masterpid/mylocalpid a
#     second time.  If a mkdir for lockdir/masternode_masterpid/mylocalpid
#     succeeds, you have a valid lock.
#  4) If a valid lock is not obtained, exit with a msg to the user stating
#     who has the lock, try again later, and how to free the lock if the
#     process that had the lock no longer exists.
#  5) If a valid lock is obtained, proceed to do the requested work.
#  6) When the requested work is done, free your lock by doing
#       rmdir lockdir/masternode_masterpid/mylocalpid
#       rmdir lockdir/masternode_masterpid
#       rmdir lockdir
#     The latter two rmdirs will fail if any of sibling processes are still
#     holding locks, since these directories will then not be empty.
#     Care must be taken to always free locks before exiting.
#
# Globals read:  lockdir, masterNode, masterPid, ourNodeName, sn
# Returns:       0 when the lock was obtained; otherwise the function
#                does not return (it exits through doCleanupAndExit or
#                freeLockAndExit with $rc_fail).
#--------------------------------------------------------------
function getBackupLock
{
  # Enable debug output.
  typeset sourceFile="mmexectsmcmd.sh"
  [[ -n $DEBUG || -n $DEBUGgetBackupLock ]] && set -x
  $mmTRACE_ENTER "$*"

  # The two directories that, together with $lockdir, make up our lock.
  typeset masterLockDir="$lockdir/${masterNode}_${masterPid}"
  typeset myLockDir="$masterLockDir/$$"

  # Work variables; kept local so they do not leak into the main script.
  typeset lsOutput IFS_sv lockhldrNode lockhldrPid pidCheckAll pidCheck

  # Try to create the base lock directory.
  $mkdir "$lockdir" > /dev/null 2>&1
  if [[ $? = 0 ]]
  then
    # The lock was not in use by anyone.  Finish creating our lock.
    $mkdir "$masterLockDir" > /dev/null 2>&1
    $mkdir "$myLockDir" > /dev/null 2>&1
    if [[ $? != 0 ]]
    then
      print "$opf: $sn: Unexpected error creating a lock. Try again. If the problem persists, contact IBM service."
      freeLockAndExit $rc_fail
    fi
    return 0
  fi

  # At least one other process has a lock.  Try to obtain a lock for
  # our own use by issuing a mkdir.  If the other processes with locks
  # were created by the same master process as our process was,
  # the mkdir will succeed.  The second attempt covers the window in
  # which a sibling has created lockdir but not yet the master directory.
  $mkdir "$myLockDir" > /dev/null 2>&1 && return 0
  $mkdir "$myLockDir" > /dev/null 2>&1 && return 0

  # Somebody else owns the lock.  The entry under lockdir is named
  # <node>_<pid>; split it on "_" to find out who the holder is.
  lsOutput=$($ls "$lockdir")
  IFS_sv=$IFS
  IFS="_"
  set -f ; set -- $lsOutput ; set +f
  lockhldrNode=$1
  lockhldrPid=$2
  IFS="$IFS_sv"

  # We have the pid and hostname of the process holding the lock.
  # Determine whether the process is still running.
  if [[ $lockhldrNode = "$ourNodeName" ]]
  then
    pidCheckAll=$($mmremote pid $lockhldrPid)
  else
    pidCheckAll=$($mmcommon on1 $lockhldrNode pid $lockhldrPid)
  fi
  pidCheck=$(print "$pidCheckAll" | $egrep "^(died|alive)$")

  if [[ $pidCheck != died ]]
  then
    # $1 is the holder's node name here (set by the "set --" above).
    $mmTRACE "$1 lock held by $lockhldrNode $lockhldrPid"
    # The process that has the lock is still alive, or we could
    # not determine its status.  Either way, give up.
    if [[ $pidCheck = alive ]]
    then
      print "$opf: $sn:\nFailed to obtain GPFS backup lock. Process $lockhldrPid on node $lockhldrNode has it.\nIf process $lockhldrPid is no longer running on $lockhldrNode, clear the lock\nby issuing: \"rm -rf $lockdir\" on all GPFS nodes. Otherwise,\ntry the GPFS backup again when process $lockhldrPid on $lockhldrNode is done.\n"
    else
      print "$opf: $sn:\nFailed to obtain GPFS backup lock. Unable to reach the holder\nof the lock, which is process $lockhldrPid on node $lockhldrNode.\nIf process $lockhldrPid is no longer running on $lockhldrNode, clear the lock\nby issuing: \"rm -rf $lockdir\" on all GPFS nodes. Otherwise,\ntry the GPFS backup again when process $lockhldrPid on $lockhldrNode is done.\n"
    fi
    doCleanupAndExit $rc_fail
  else
    # The process that had the lock somehow went away without unlocking.
    # Remove the old lock and establish a new one for the caller.
    $rm -rf "$lockdir" > /dev/null 2>&1
    $mkdir "$lockdir" > /dev/null 2>&1
    if [[ $? = 0 ]]
    then
      # Success!  Complete the work of establishing the lock, and
      # verify it the same way the fresh-lock path above does.
      $mkdir "$masterLockDir" > /dev/null 2>&1
      $mkdir "$myLockDir" > /dev/null 2>&1
      if [[ $? != 0 ]]
      then
        print "$opf: $sn: Unexpected error creating a lock. Try again. If the problem persists, contact IBM service."
        freeLockAndExit $rc_fail
      fi
    else
      # Failed again!  Give up.
      print "$opf: $sn:\nFailed to obtain GPFS backup lock even after clearing the lock.\n"
      doCleanupAndExit $rc_fail
    fi
  fi  # end of if [[ $pidCheck != died ]]

  return 0

}  #------ end of function getBackupLock -----------------


#--------------------------------------------------------------
# freeBackupLock
#
# Function:  Release this process's backup lock by removing, innermost
#            first, the three directories that make it up.  rmdir
#            failures are ignored on purpose: the two outer directories
#            are expected to be non-empty while siblings hold locks.
#--------------------------------------------------------------
function freeBackupLock
{
  # Enable debug output.
  typeset sourceFile="mmexectsmcmd.sh"
  [[ -n $DEBUG || -n $DEBUGfreeBackupLock ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset masterLockDir="$lockdir/${masterNode}_${masterPid}"

  $rmdir "$masterLockDir/$$" > /dev/null 2>&1
  $rmdir "$masterLockDir" > /dev/null 2>&1
  $rmdir "$lockdir" > /dev/null 2>&1

}  #------ end of function freeBackupLock ----------------


#--------------------------------------------------------------
#
# freeLockAndExit
#
# Function:  Free the backup lock and then exit via the
#            doCleanupAndExit routine.
#
# Input:     $1 - return code with which to exit
#
#--------------------------------------------------------------
function freeLockAndExit
{
  # Enable debug output.
  typeset sourceFile="mmexectsmcmd.sh"
  [[ -n $DEBUG || -n $DEBUGfreeLockAndExit ]] && set -x
  $mmTRACE_ENTER "$*"

  rc=$1

  freeBackupLock
  doCleanupAndExit $rc

}  #------ end of function freeLockAndExit ---------------


#--------------------------------------------------------------
#
# doCleanupAndExit
#
# Function:  Perform cleanup unique to mmexectsmcmd, and then
#            exit via the standard cleanupAndExit routine.
#
# Input:     $1 - return code with which to exit
#
#--------------------------------------------------------------
function doCleanupAndExit
{
  # Enable debug output.
  typeset sourceFile="mmexectsmcmd.sh"
  [[ -n $DEBUG || -n $DEBUGdoCleanupAndExit ]] && set -x
  $mmTRACE_ENTER "$*"

  rc=$1

  # If the TSM log file does not exist and have a size greater than 0,
  # remove the log directory.  This routine can be reached before the
  # log directory has been named (e.g. on a parameter error), so only
  # attempt the removal once $logdir is actually set.
  [[ -n $logdir && ! -s $logfile ]] && $rm -rf "$logdir" > /dev/null 2>&1

  cleanupAndExit $rc

}  #------ end of function doCleanupAndExit ---------------


#------------------------
# Start main processing.
#------------------------

# Set local trap routine.
trap localTrap HUP INT QUIT KILL

typeset -l keyword_lc   # variable for storing keyword in lower case

# Verify that the correct number of parameters were passed.
# NOTE(review): $sn is only assigned further below, after
# getLocalNodeData, so it presumably expands to an empty string in the
# message here - confirm whether mmglobfuncs pre-sets it.
if [[ $argc -ne 9 && $argc -ne 2 ]]
then
  print "$opf: $sn: An incorrect number of parameters was passed."
  print "Usage:\n $pgm "
  doCleanupAndExit $rc_fail
fi

[[ -z $MMMODE || -z $primaryServer ]] &&  \
  determineMode

getLocalNodeData
sn=$ourNodeName

#
# Assign the input parameters as follows:
#
#   arg1 - "givestatus"
#   arg2 - number of seconds to sleep between messages
#
#     OR
#
#   arg1 - mountpoint of filesystem to be operated on
#   arg2 - TSM command (selective, incremental, expire, or restore)
#   arg3 - TSM command option (filelist or nofilelist)
#   arg4 - name of the filelist file (if arg3 was filelist)
#            or
#          some other TSM command parameter (if arg3 was nofilelist)
#   arg5 - name of the invoking node (used for obtaining a backup lock)
#   arg6 - pid of the invoking process (used for obtaining a backup lock)
#   arg7 - process index (used for creating directory in which, if necessary,
#          Tivoli will store a dsmerror.log file)
#   arg8 - name of the TSM server (used for checking against the dsm.opt file)
#   arg9 - I/O rate value
#
if [[ $arg1 = "givestatus" ]]
then
  # Status mode: print message 527 every nSeconds.  The loop has no
  # exit condition of its own; it runs until the process is interrupted.
  integer nSeconds=${arg2}
  while true
  do
    sleep $nSeconds
    printInfoMsg 527 mmbackup
  done  # end while true
fi

mountPoint=$arg1
tsmCommand=$arg2
tsmCommandOption=$arg3
if [[ $tsmCommandOption = filelist ]]
then
  filelistName=$arg4
  if [[ ! -f $filelistName || ! -r $filelistName ]]
  then
    # The filelist file does not exist or is not readable.
    # Issue an error message and fail the command.
    print "$opf: $sn: Cannot open $filelistName. Make sure filesystem is mounted on node."
    doCleanupAndExit $rc_fail
  fi
elif [[ $tsmCommandOption = nofilelist ]]
then
  tsmCommandParm=$arg4
else
  print "$opf: $sn: Invalid parameter: $tsmCommandOption"
  doCleanupAndExit $rc_fail
fi
masterNode=$arg5
masterPid=$arg6
processIndex=$arg7
tsmServer=$arg8
ioRateValue=$arg9

# Set and export the DSM_LOG environment variable to tell TSM
# where to put any error log it generates.
logdir=$mmbackupDir"$mountPoint"_"$dateTime"_"$processIndex"
logfile=$logdir/dsmerror.log
export DSM_LOG=$logdir

# Create the directory for the TSM error log.
$mkdir -p "$logdir" > /dev/null 2>&1
if [[ ! -d $logdir ]]
then
  # The outcome of every TSM command below is judged by the presence or
  # absence of dsmerror.log in this directory.  Without the directory,
  # a failed operation would be reported as a success, so stop here.
  # No lock has been obtained yet, hence doCleanupAndExit.
  print "$opf: $sn: Cannot create TSM error log directory $logdir."
  doCleanupAndExit $rc_fail
fi

# Add a slash to the end of the mount point variable for passing to TSM later.
mountPoint=$mountPoint/

# Call routine to obtain a lock before proceeding.
# If a lock cannot be obtained, getBackupLock will exit with a failure rc.
getBackupLock

# At this point we have obtained a lock.

# Now that we have obtained a backup lock, change the trap routine
# to one that frees the lock before exiting.
trap localTrap2 HUP INT QUIT KILL

# Check that the TSM server passed on the command matches the one
# specified in the /usr/tivoli/tsm/client/ba/bin/dsm.opt file.
# If it doesn't, issue an error message and exit with a failing rc.
# Lines containing '*' are dropped by the second grep; the first two
# words of what remains are taken as the keyword and the server name.
grepOutput=$($grep -i servername "$dsmoptfile" | $grep -v '*')
set -f ; set -- $grepOutput ; set +f
dsmoptTSMserverKeyword=$1
dsmoptTSMserver=$2
keyword_lc=$dsmoptTSMserverKeyword   # Convert keyword to all lower case.
if [[ $keyword_lc != servername ]]
then
  print "$opf: $sn: unable to find TSM server name in dsm.opt file"
  freeLockAndExit $rc_fail
fi
# The right-hand side is quoted so that it is compared as a literal
# string rather than being interpreted as a shell pattern.
if [[ $tsmServer != "$dsmoptTSMserver" ]]
then
  print "$opf: $sn: specified TSM server does not match TSM server in dsm.opt file"
  freeLockAndExit $rc_fail
fi

# Depending on the I/O rate value, take a breather
# to allow other non-backup processes to run.
# The scheme implemented here, wait a fixed number of
# seconds controlled by the passed value, is a very
# rough beginning.  A better scheme would be to calculate
# the time used doing backup work, and then sleep a
# fraction of that time based on the I/O rate value.
# (An I/O rate value of 100 would mean don't sleep at all,
# a value of 75 would mean sleep for 1/3 of the time
# spent doing backup, a value of 50 would mean sleep an
# amount of time equal to the time spent on backup,
# and so on.)
integer numberOfSeconds=100-${ioRateValue}
# An I/O rate value above 100 would give a negative interval, which
# sleep rejects; treat anything that is not positive as "do not sleep".
if (( numberOfSeconds > 0 ))
then
  sleep $numberOfSeconds
fi

# We now have obtained a backup lock, verified that the TSM server is
# correct, and idled away some time if warranted by the I/O rate value.
# Proceed to process Tivoli Storage Manager commands.

#--------------------------------------------------------------
# clearTsmErrorLog
#
# Function:  Remove a non-empty TSM error log so that the log examined
#            afterwards contains only the output of the next dsmc
#            command.  Exits the script (freeing the lock) with
#            $rc_fail if the log cannot be removed.
#
# Globals read:  logfile, opf, sn, rc_fail
#--------------------------------------------------------------
function clearTsmErrorLog
{
  [[ -s $logfile ]] || return 0

  $rm -rf "$logfile" > /dev/null 2>&1
  if [[ $? -gt 0 ]]
  then
    print "$opf: $sn: cannot rm $logfile"
    freeLockAndExit $rc_fail
  fi
  return 0
}

# For every command below, the dsmc return code only indicates whether
# the TSM executable ran successfully.  The TSM client returns a code
# greater than 0 (zero) only if TSM was unable to attempt the operations.
# Only those return codes output to the error report (dsmerror.log)
# indicate the success or failure of specific TSM commands.

case $tsmCommand in

  #-------------------------------------------------
  # TSM selective or incremental command processing
  #-------------------------------------------------
  selective | incremental )
    # Issue the TSM command to have the list of files backed up.
#   $dsmc $tsmCommand -filelist=$filelistName > /tmp/out1i 2> /tmp/out2i
    $dsmc $tsmCommand -filelist="$filelistName" > /dev/null 2>&1
    rc=$?

    if [[ $rc -eq 0 ]]
    then
      # Clear the error log.  It will be recreated
      # by the query command to be issued next.
      clearTsmErrorLog

      # Issue the query command to determine which files were not
      # backed up successfully.
      #
      # dsmc query backup -fromdate=$tsmDate -fromtime=$tsmTime
      #                   -filelist=$filelistName > /dev/null 2>&1
#     $dsmc query backup -filelist=$filelistName > /tmp/out1q 2> /tmp/out2q
      $dsmc query backup -filelist="$filelistName" > /dev/null 2>&1
      rc=$?

      if [[ $rc -ne 0 ]]
      then
        print "$opf: $sn: TSM dsmc query command failed to run."
        freeLockAndExit $rc_fail
      elif [[ -s $logfile ]]
      then
        # We have an error log file; examine it to determine
        # the files which were not successfully backed up, and
        # construct the pending transactions file.
        #
        # The following is a line from the pertinent error log file:
        #   03/13/02 10:37:48 ANS1092E No files matching search criteria
        #   were found
        # or
        #   11/16/01 15:24:56 ANS1345E No objects on server match
        #   '/log/michail/file_4'

        # Check for matching search criteria failure.
        pendingBackupsList="$mountPoint$tpl"_"$processIndex"
        $grep "${searchErrorString}" "$logfile" > "${pendingBackupsList}"
        if [[ -s $pendingBackupsList ]]
        then
          print "$opf: $sn: TSM dsmc query command indicated search criteria failure (see file $logfile)."
          $rm -rf "$pendingBackupsList" > /dev/null 2>&1
          freeLockAndExit $rc_fail
        fi

        # Check for objects which do not match, i.e.,
        # individual files which were not backed up.
        # The file name is the text between the single quotes.
        $grep "${objectErrorString}" "$logfile" |  \
          $cut -f2 -d\' >> "${pendingBackupsList}"
        print "$opps: $sn: TSM dsmc $tsmCommand command partially succeeded (see file $logfile)."
        freeLockAndExit $rc_psuccess
      else
        print "$ops: $sn: TSM dsmc query command did not produce any error log."
        freeLockAndExit $rc_success
      fi
    else
      # If there is an error log file, tell the user to examine it
      # to determine the cause of the failure.
      if [[ -s $logfile ]]
      then
        print "$opf: $sn: TSM dsmc $tsmCommand command failed to run (see file $logfile)."
      else
        print "$opf: $sn: TSM dsmc $tsmCommand command failed to run."
      fi
      freeLockAndExit $rc_fail
    fi
    ;;

  #-------------------------------
  # TSM expire command processing
  #-------------------------------
  expire )
    # Clear the error log.  We are only interested in the error log
    # created from issuing the expire command.
    clearTsmErrorLog

    # Issue the TSM command to have files in the file system expired.
    if [[ $tsmCommandOption = filelist ]]
    then
#     $dsmc $tsmCommand -noprompt -filelist=$filelistName > /tmp/out1x 2> /tmp/out2x
      $dsmc $tsmCommand -noprompt -filelist="$filelistName" > /dev/null 2>&1
    else
      # $tsmCommandParm is deliberately left unquoted, as in the
      # original invocation, so an empty value adds no argument.
#     $dsmc $tsmCommand -noprompt $tsmCommandParm > /tmp/out1x 2>/tmp/out2x
      $dsmc $tsmCommand -noprompt $tsmCommandParm > /dev/null 2>&1
    fi
    rc=$?

    if [[ $rc -eq 0 ]]
    then
      # If we have an error log file examine it to determine the files
      # which were not successfully expired.
      if [[ -s $logfile ]]
      then
        # Process the error log and construct the transactions_pending file.

        # Check for objects which failed to get processed (i.e expired).
        pendingExpiresList="$mountPoint$tpl"_"$processIndex"
        $grep "${errorProcessingString}" "$logfile" |  \
          $cut -f2 -d\' > "${pendingExpiresList}"
        if [[ -s $pendingExpiresList ]]
        then
          # This message names the expire command and the condition
          # actually detected here ("Error processing" entries).
          print "$opf: $sn: TSM dsmc expire command reported errors processing files (see file $logfile)."
          $rm -rf "$pendingExpiresList" > /dev/null 2>&1
          freeLockAndExit $rc_fail
        fi

        # Check for objects which do not match, namely,
        # individual files which were not expired.
        $grep "${objectErrorString}" "$logfile" |  \
          $cut -f2 -d\' >> "${pendingExpiresList}"
        print "$opps: $sn: TSM dsmc expire command partially succeeded (see file $logfile)."
        freeLockAndExit $rc_psuccess
      else
        print "$ops: $sn: TSM dsmc expire command did not produce any error log."
        freeLockAndExit $rc_success
      fi
    else
      # If there is an error log file, tell the user to examine it
      # to determine the cause of the failure.
      if [[ -s $logfile ]]
      then
        print "$opf: $sn: TSM dsmc expire command failed to run (see file $logfile)."
      else
        print "$opf: $sn: TSM dsmc expire command failed to run."
      fi
      freeLockAndExit $rc_fail
    fi
    ;;

  #--------------------------------
  # TSM restore command processing
  #--------------------------------
  restore )
    # Clear the error log.  We are only interested in the error log
    # created from issuing the restore command.
    clearTsmErrorLog

    # Issue the TSM command to have the list of files restored
    # NOTE: We restore the files specified to the mount point of the file
    #       system and not to the snapshot the files were backed up from.
#   $dsmc $tsmCommand -filelist=$filelistName $mountPoint > /tmp/out1r 2>/tmp/out2r
    $dsmc $tsmCommand -filelist="$filelistName" "$mountPoint" > /dev/null 2>&1
    rc=$?

    # NOTE:
    #   According to TSM documentation ("Return Codes from TSM Executables
    #   and Shell Scripts"):
    #     TSM returns 0 even if the file is not successfully backed up,
    #     archived, or restored if the reason for the unsuccessful process is:
    #       . the client could not establish a session with the TSM server, or
    #       . the file does not exist.
    # WARNING:
    #   In unit testing it was learned that if a file does not exist,
    #   the return code on a restore operation is not zero.
    #   The following code tries to deal with this situation
    #   (which situation contradicts what was stated above).
    #   This is why the error log is examined before the return code.

    # If we have an error log file examine it to determine the files
    # which were not successfully restored.
    if [[ -s $logfile ]]
    then
      # Process the error log and construct the transactions_pending file.
      #
      # The following is a line from the pertinent error log file:
      #   04/05/02 15:41:52 ANS4007E Error processing
      #   '/backup_tsm/.backup_snapshot/.NodeCtrl'
      # or
      #   11/16/01 15:24:56 ANS1345E No objects on server match
      #   '/log/michail/file_4'

      # Check for objects which failed to get processed (i.e., restored).
      pendingRestoresList="$mountPoint$tpl"_"$processIndex"
      $grep "${errorProcessingString}" "$logfile" |  \
        $cut -f2 -d\' > "${pendingRestoresList}"

      # Check for objects which do not match, i.e.,
      # individual files which were not restored.
      $grep "${objectErrorString}" "$logfile" |  \
        $cut -f2 -d\' >> "${pendingRestoresList}"

      if [[ -s $pendingRestoresList ]]
      then
        print "$opps: $sn: TSM dsmc restore command partially succeeded (see file $logfile)."
        freeLockAndExit $rc_psuccess
      else
        print "$opf: $sn: TSM dsmc restore command failed (see file $logfile)."
        freeLockAndExit $rc_fail
      fi
    elif [[ $rc -eq 0 ]]
    then
      print "$ops: $sn: TSM dsmc restore command did not produce any error log."
      freeLockAndExit $rc_success
    else
      print "$opf: $sn: TSM dsmc restore command failed to run."
      freeLockAndExit $rc_fail
    fi
    ;;

  #--------------------------
  # Anything else is invalid
  #--------------------------
  * )
    print "$opf: $sn: Wrong value ($tsmCommand) passed for TSM operation."
    freeLockAndExit $rc_fail
    ;;

esac

# Every branch above exits on its own; this is only a safety net.
freeLockAndExit $rc_success