1 | #!/bin/ksh |
---|
2 | ################################################################################ |
---|
3 | # |
---|
4 | # Module: gpfs.snap |
---|
5 | # |
---|
6 | # Description: |
---|
7 | # This script attempts to collect all of the data likely to be needed |
---|
8 | # when reporting a GPFS-related problem. |
---|
9 | # |
---|
10 | # Syntax: |
---|
11 | # gpfs.snap [-c "CmdString"] [-d OutputDirectory] [-p] [-x {1 | 2}] [-y | -z] |
---|
12 | # [-a | -W NodeFilename | -w NodeName[,NodeName...] | |
---|
13 | # -n NodeNumber[,NodeNumber...]] |
---|
14 | # |
---|
15 | # -c "CmdString" Run the command string on the specified nodes. |
---|
16 | # If -c "CmdString" is used, the data collected is |
---|
17 | # just the data for the specified command string; |
---|
18 | # the standard data collected by gpfs.snap is not |
---|
19 | # collected. CmdString may consist of multiple |
---|
20 | # commands separated by semi-colons. |
---|
21 | # The d, p, x, y, and z flags have no effect when |
---|
22 | # -c "CmdString" is specified. |
---|
23 | # -d OutputDirectory Directory to be used for output. |
---|
24 | # The default is /tmp/gpfs.snapOut |
---|
25 | # -p Skip the problem determination sequence (applies to master only). |
---|
26 | # -x 1 Check whether there is enough space but do not collect data. |
---|
27 | # 2 Collect data only; do not check whether there is enough space. |
---|
28 | # -y Collect snaps only from nodes specified. |
---|
29 | # -z Collect data only from this node - no "master" data. |
---|
30 | # |
---|
31 | # Node specification options: |
---|
32 | # -a Collect data on all nodes. This is the default. |
---|
33 | # Cannot be specified with -n, -w, or -W. |
---|
34 | # -n nodeNumList Collect data on the nodes in the list of node numbers. |
---|
35 | # Cannot be specified with -a or -W. |
---|
36 | # -w nodeList Collect data on the nodes in the list of node names. |
---|
37 | # Cannot be specified with -a or -W. |
---|
38 | # -W nodeFile Collect data on the nodes in the file. |
---|
39 | # Cannot be specified with -a, -n, or -w. |
---|
40 | # |
---|
41 | # Outputs: |
---|
42 | # If -d option is specified, this output file will be stored in the |
---|
43 | # user-specified directory. |
---|
44 | # If -d option is not specified, the output file will be put in the |
---|
45 | # /tmp/gpfs.snapOut directory. |
---|
46 | # When run without the -z flag, snaps from the nodes will all be collected |
---|
47 | # into a tar file named all.xxxxx.tar, where xxxxx is a timestamp. |
---|
48 | # Otherwise, the file name will be gpfs.snap.node_number.xxxxxxxx.out.tar.Z |
---|
49 | # (a compressed file), where xxxxxxxx is the timestamp for the time the |
---|
50 | # script was run. |
---|
51 | # |
---|
52 | # The file contains output from the following commands: |
---|
53 | # |
---|
54 | # lsdev -C ALWAYS for AIX |
---|
55 | # lspv ALWAYS for AIX |
---|
56 | # lsattr -El for all physical disks ALWAYS for AIX |
---|
57 | # lsvg -o, lsvg -l, lsvg, ls -l /dev/VGs ALWAYS for AIX |
---|
58 | # lsfs ALWAYS for AIX |
---|
59 | # |
---|
60 | # Files |
---|
61 | # /etc/fstab (Linux) |
---|
62 | # /etc/filesystems (AIX) |
---|
63 | # /var/adm/ras/mmfs.log.* |
---|
64 | # /var/mmfs/etc/* |
---|
65 | # /var/mmfs/gen/* |
---|
66 | # /var/mmfs/ssl/* |
---|
67 | # /var/mmfs/tmp/* |
---|
68 | # |
---|
69 | # Miscellaneous commands |
---|
70 | # ps -edf ALWAYS |
---|
71 | # errpt -a ALWAYS |
---|
72 | # df -k ALWAYS |
---|
73 | # lslpp -ha ALWAYS |
---|
74 | # lssrc -a ALWAYS |
---|
75 | # vmstat 5 5 ALWAYS |
---|
76 | # vmstat -s ALWAYS |
---|
77 | # |
---|
78 | # Network stuff (ALWAYS) |
---|
79 | # echo $NSorder |
---|
80 | # no -a |
---|
81 | # netstat -m |
---|
82 | # netstat -i -n |
---|
83 | # netstat -rn |
---|
84 | # netstat -D |
---|
85 | # entstat en* |
---|
86 | # tokstat tr* |
---|
87 | # ifconfig (on all adapters in hats groups) |
---|
88 | # |
---|
89 | # Dependencies: |
---|
90 | # The script must be run as root and requires rsh access to remote nodes. |
---|
91 | # |
---|
92 | ################################################################################ |
---|
93 | #"@(#)44 1.31.1.4 src/avs/fs/mmfs/ts/admin/gpfs.snap.sh, mmfs, avs_rgpfs24, rgpfs24s012a 4/2/07 01:34:01" |
---|
94 | |
---|
# Note when this run started and which version of the script is running.
VERSION=1.31.1.4
starttime=$(date)

# Include global declarations and service routines.
. /usr/lpp/mmfs/bin/mmglobfuncs
. /usr/lpp/mmfs/bin/mmsdrfsdef

sourceFile="gpfs.snap.sh"
if [[ -n $DEBUGgpfssnap ]]
then
  set -x
fi
$mmTRACE_ENTER "$*"

# Global variables

outputDelimiter="######################################################################"
mmlsmgr=/usr/lpp/mmfs/bin/mmlsmgr


# Local work files.  Names should be of the form:
#   fn=${tmpDir}fn.${mmcmd}.$$
# Each name is built from $tmpDir, the command name in $mmcmd and this
# shell's process id.

trcFile="${tmpDir}trcFile.${mmcmd}.$$"             # file replacement for trclist
newrcFile="${tmpDir}newrcFile.${mmcmd}.$$"         # file replacement for newrclist
commaFile="${tmpDir}commaFile.${mmcmd}.$$"         # file equivalent of commalist
nodefilecFile="${tmpDir}nodefilecFile.${mmcmd}.$$" # file replacement for nodefileclist
nodefile2="${tmpDir}nodefile2.${mmcmd}.$$"         # file of reachable specified nodes

# Space-separated list of the work files named above.
LOCAL_FILES=" $trcFile $newrcFile $commaFile $nodefilecFile $nodefile2 "
---|
122 | |
---|
123 | |
---|
124 | # Local functions |
---|
125 | |
---|
126 | |
---|
#
# removefromlist "node [node ...]"
#
# Remove the given node name(s) from the file named by $nodefile and drop
# the matching entries from the global hostarray, which is indexed in the
# same order as the lines of $nodefile.
#
#   $1         - blank-separated list of node names to remove
#   $nodefile  - (global) node file; rewritten without the removed nodes
#   hostarray  - (global) parallel array; removed slots are closed up so
#                the remaining entries stay contiguous
#
function removefromlist
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGremovefromlist ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset node victim found idx slot next

  $rm ${nodefile}.tmp 2>/dev/null
  $touch ${nodefile}.tmp

  # idx is the position in hostarray of the node currently being examined.
  # It only advances when the node is kept: once an entry is removed, the
  # following entries have been shifted down by one, so the next node now
  # lives at the same index.
  idx=0
  for node in $($cat $nodefile)
  do
    found=0
    for victim in $1
    do
      if [[ $node = "$victim" ]]
      then
        found=1
        break
      fi
    done

    if [[ $found = 0 ]]
    then
      echo $node >> ${nodefile}.tmp
      (( idx = idx + 1 ))
    else
      # Close the gap: move every following entry down one slot, then
      # drop the now-duplicated last slot.
      slot=$idx
      (( next = slot + 1 ))
      while [[ -n ${hostarray[$next]} ]]
      do
        hostarray[$slot]=${hostarray[$next]}
        (( slot = slot + 1 ))
        (( next = slot + 1 ))
      done
      unset "hostarray[$slot]"
    fi
  done

  $mv ${nodefile}.tmp ${nodefile}

}   #----- end of function removefromlist -----------------------
---|
175 | |
---|
176 | |
---|
#
# checklist
#
# Probe every node listed in $nodefile (other than $myhname) with ping and
# then with "mmdsh ... /bin/hostname".  On return:
#   $nodefile   lists the nodes that answered and whose reported hostname
#               is not $myhname,
#   $nodefile2  lists every node that answered,
#   hostarray   holds the hostnames reported by the nodes in $nodefile,
#   already_failed has the names of the nodes that did not answer appended.
#
function checklist
{
  typeset sourceFile="gpfs.snap.sh"
  if [[ -n $DEBUGgpfssnap || -n $DEBUGchecklist ]]; then set -x; fi
  $mmTRACE_ENTER "$*"

  # Start from empty work files.
  $rm $nodefile2 $tmpfile 2>/dev/null
  $touch $nodefile2 $tmpfile
  list=$($cat $nodefile)
  $rm /tmp/hostfile 2>/dev/null

  for i in $list; do
    if [[ $i = $myhname ]]; then
      continue
    fi

    # bad stays 0 on success, otherwise names the probe that failed.
    bad=0
    if $ping -c1 -w5 $i >/dev/null 2>/tmp/err; then
      $mmdsh -L $i K5MUTE=1 /bin/hostname >/tmp/hostname 2>/tmp/err &
      waitforit
      if [[ -s /tmp/hostname ]]; then
        echo $i >> $tmpfile
        # The second field of the mmdsh output line is taken as the hostname.
        thostname=$($cat /tmp/hostname | $awk '{print $2}')
        if [[ $thostname = $myhname ]]; then
          continue
        fi
        echo $i >> $nodefile2
        $cat /tmp/hostname | $awk '{print $2}' >>/tmp/hostfile
      else
        bad="mmdsh"
      fi
    else
      bad="ping"
    fi

    if [[ $bad != 0 ]]; then
      # The headline is suppressed when $cflag is set; the captured error
      # text is logged either way.
      if [[ -z $cflag ]]; then
        print "\nCannot collect data from $i. $bad failed:" | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      fi
      $cat /tmp/err | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      already_failed="$already_failed $i"
    fi
  done
  $rm /tmp/err 2>/dev/null

  # Create two node files.
  # $nodefile contains all of the reachable nodes except the master node.
  # $nodefile2 contains all of the reachable nodes.
  $mv $nodefile2 $nodefile 2>/dev/null
  $mv $tmpfile $nodefile2 2>/dev/null
  hlist=$($cat /tmp/hostfile 2>/dev/null)
  # Globbing is switched off while the hostnames are loaded into the array.
  set -f ; set -A hostarray $hlist ; set +f

}   #----- end of function checklist ----------------------------
---|
234 | |
---|
235 | |
---|
#
# addtolist "node [node ...]"
#
# Append to $nodefile each node from $1 that is not already listed there,
# has not already failed a probe, answers ping and "mmdsh ... /bin/hostname",
# and reports a hostname that is neither $myhname nor already present in
# hostarray.  The reported hostname is appended to hostarray.  Nodes that
# fail a probe are logged and appended to already_failed.
#
function addtolist
{
  typeset sourceFile="gpfs.snap.sh"
  if [[ -n $DEBUGgpfssnap || -n $DEBUGaddtolist ]]; then set -x; fi
  $mmTRACE_ENTER "$*"

  list=$($cat $nodefile)
  hlist=$($cat /tmp/hostfile 2>/dev/null)

  j=0
  for i in $1; do
    if [[ $i = $myhname ]]; then
      continue
    fi

    # Is the node already in the node file (as read on entry)?
    gotit=0
    for j in $list; do
      if [[ $i = $j ]]; then
        gotit=1
        break
      fi
    done
    bad=0
    if [[ $gotit = 1 ]]; then
      continue
    fi

    # Do not probe a node again once it has been recorded as failed.
    already_got=0
    for k in $already_failed; do
      if [[ $i = $k ]]; then
        already_got=1
        break;
      fi
    done
    if [[ $already_got = 1 ]]; then
      continue
    fi

    if $ping -c1 -w5 $i >/dev/null 2>/tmp/err; then
      $mmdsh -L $i K5MUTE=1 /bin/hostname >/tmp/hostname 2>/tmp/err &
      waitforit
      if [[ -s /tmp/hostname ]]; then
        gotit=0
        j=0
        hname=$($cat /tmp/hostname | $awk '{print $2}')
        if [[ $hname = $myhname ]]; then
          continue
        fi
        # Walk hostarray; j ends on the matching slot or on the first
        # empty slot, which is where a new hostname is stored.
        while [[ -n ${hostarray[$j]} ]]; do
          if [[ $hname = ${hostarray[$j]} ]]; then
            gotit=1
            break
          fi
          (( j = j + 1 ))
        done
        if [[ $gotit = 0 ]]; then
          hostarray[$j]=$hname
          echo $i >> $nodefile
        fi
      else
        bad="mmdsh"
      fi
    else
      bad="ping"
    fi

    if [[ $bad != 0 ]]; then
      print "\nCannot collect data from $i. $bad failed:" | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      $cat /tmp/err | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      already_failed="$already_failed $i"
    fi
  done

  $rm /tmp/err 2>/dev/null

}   #----- end of function addtolist ----------------------------
---|
323 | |
---|
324 | |
---|
#
# check_space
#
# Estimate how much room the snap needs and compare it with the free space
# reported by "df -k $LOGDIR".  Reads the globals total_bytes and max_bytes,
# and (on the master with a non-empty $nodefile) the per-node estimates in
# $BASELOGDIR/pass1outfile.  Sets factor, zipped_bytes, maxbytes,
# total_block and FREE_SPACE.  If there is not enough space, the working
# directories are removed and the script exits with status 1.
#
function check_space
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_space ]] && set -x
  $mmTRACE_ENTER "$*"

  # The following extraction takes into account that the output
  # of df is arranged differently under Linux than under AIX.
  dfOutput=$($df -k $LOGDIR | $tail -n +2)
  if [[ $os = "AIX" ]]
  then
    FREE_SPACE=$(print $dfOutput | $awk '{print $3}')
  else
    FREE_SPACE=$(print $dfOutput | $awk '{print $4}')
  fi
  FREE_SPACE=$(expr $FREE_SPACE - 1)

  # Give ourselves a .1 safety margin.
  (( maxbytes = max_bytes + max_bytes / 10 ))
  (( total_bytes = total_bytes + total_bytes / 10 ))

  # Pick the assumed compression percentage from the amount of data.
  # These must be arithmetic comparisons: inside [[ ]] the < and >
  # operators compare strings lexically, so e.g. 550 would not sort
  # below 1000000.
  if (( total_bytes < 1000000 ))
  then
    factor=30
  elif (( total_bytes < 2000000 ))
  then
    factor=25
  else
    factor=20
  fi

  # Guess how large the compressed file will be.
  (( zipped_bytes = total_bytes * factor / 100 ))
  (( adjusted_bytes = total_bytes + zipped_bytes ))
  if (( adjusted_bytes > maxbytes ))
  then
    maxbytes=$adjusted_bytes
  fi

  if [[ $master = 1 && -s $nodefile ]]
  then
    (( max_zipped_bytes = zipped_bytes * 2 ))
    # when we tar our own Z file....

    # Drop every node that reported a space shortage in the first pass.
    notenough=$($grep "There is not enough space" $BASELOGDIR/pass1outfile | $cut -f1 -d :)
    for ii in $notenough
    do
      print "Node $ii reports it does not have enough space in ${BASELOGDIR}/${logdate}\nRemoving from list\n" | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      removefromlist $ii
      removefromrlist $ii
    done

    # Add the sizes reported by the nodes in rlist ...
    for ii in $rlist
    do
      size=$($grep "compressed file" $BASELOGDIR/pass1outfile | $grep $ii | $awk '{print $7}')
      if [[ -n $size ]]
      then
        # zipped_bytes = max at end
        # tmp_max_zipped bytes is max at any point
        (( zipped_bytes = zipped_bytes + size ))
        (( tmp_max_zipped_bytes = zipped_bytes + size ))
        if [[ $tmp_max_zipped_bytes -gt $max_zipped_bytes ]]
        then
          max_zipped_bytes=$tmp_max_zipped_bytes
        fi
      fi
    done
    # ... and by the nodes in glist.
    for ii in $glist
    do
      size=$($grep "requires about" $BASELOGDIR/pass1outfile | $grep $ii | $awk '{print $5}')
      if [[ -n $size ]]
      then
        (( zipped_bytes = zipped_bytes + size ))
        (( tmp_max_zipped_bytes = zipped_bytes + size ))
        if [[ $tmp_max_zipped_bytes -gt $max_zipped_bytes ]]
        then
          max_zipped_bytes=$tmp_max_zipped_bytes
        fi
      fi
    done

    if [[ "$max_zipped_bytes" -gt "$maxbytes" ]]
    then
      maxbytes=$max_zipped_bytes
    fi
  else
    print "compressed file will be about $zipped_bytes bytes"
  fi

  # df -k reports 1024-byte blocks, so convert before comparing.
  total_block=$(expr $maxbytes / 1024)

  echo "gpfs.snap requires about $maxbytes bytes" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  if [[ "$total_block" -gt "$FREE_SPACE" ]]
  then
    echo "gpfs.snap requires about $maxbytes bytes" >> ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
    echo "There is not enough space in ${BASELOGDIR}. Either increase\nthe filesystem size or choose a different filesystem with the -d option." | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
    $rm -r ${LOGDIR} 2>/dev/null
    if [[ $master = 1 ]]
    then
      $rm -r $NODESDIR 2>/dev/null
    fi
    exit 1
  fi

  $rm $BASELOGDIR/pass1outfile 2>/dev/null

}   #----- end of function check_space --------------------------
---|
434 | |
---|
435 | |
---|
#
# check_waiters2
#
# Pass 1: gather the *_waiters files from ${BASELOGDIR} (locally with cp,
# and from the nodes in $rcFile or $brcFile with mmdsh/$rcp) into
# ${LOGDIR}/waiters, delete the remote copies, and hand the directory size
# to addit.
# Any other pass: call tarit for every non-empty file in ${LOGDIR}/waiters.
#
function check_waiters2
{
  typeset sourceFile="gpfs.snap.sh"
  typeset remoteNodes
  if [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_waiters2 ]]; then set -x; fi
  $mmTRACE_ENTER "$*"

  $mkdir -p ${LOGDIR}/waiters 2>/dev/null

  firstone=1

  if [[ $pass != 1 ]]; then
    list=$($ls ${LOGDIR}/waiters)
    for i in $list; do
      if [[ -s ${LOGDIR}/waiters/${i} ]]; then
        tarit waiters/${i}
      fi
    done
  else
    cp ${BASELOGDIR}/*_waiters ${LOGDIR}/waiters 2>/dev/null

    # With -a the node file is $rcFile, otherwise $brcFile; the remote
    # steps are identical apart from that.
    if [[ -n $aflag ]]; then
      remoteNodes=$rcFile
    else
      remoteNodes=$brcFile
    fi

    if [[ -s $remoteNodes ]]; then
      $mmdsh -F $remoteNodes K5MUTE=1 $rcp ${BASELOGDIR}/\*_waiters $my_hostname:${LOGDIR}/waiters &
      waitforit NULL 60
      $mmdsh -F $remoteNodes K5MUTE=1 rm -f ${BASELOGDIR}/\*_waiters &
      waitforit NULL 60
    fi

    size=$(du -ks ${LOGDIR}/waiters | $cut -f1)
    if [[ -n $size ]]; then
      addit $size ${LOGDIR}/waiters
    fi
  fi

}   #----- end of function check_waiters2 -----------------------
---|
484 | |
---|
485 | |
---|
#
# check_files2
#
# Process every ${LOGDIR}/bad.*.files directory.  In pass 1 the directory
# size (du -ks) is passed to addit; in any other pass tarit is called for
# each entry of the directory, named relative to ${LOGDIR}.
#
function check_files2
{
  typeset sourceFile="gpfs.snap.sh"
  if [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_files2 ]]; then set -x; fi
  $mmTRACE_ENTER "$*"

  dirlist=$($ls -d ${LOGDIR}/bad.*.files 2>/dev/null)

  for i in $dirlist; do
    if [[ $pass != 1 ]]; then
      # Hand over each file as <directory name>/<file name>.
      basedir=$(basename $i)
      filelist=$($ls $i)
      for j in $filelist; do
        tarit $basedir/$j
      done
    else
      size=$(du -ks $i | $cut -f1)
      addit $size $i
    fi
  done

}   #----- end of function check_files2 -------------------------
---|
511 | |
---|
512 | |
---|
#
# check_files file nodeFile [groupId]
#
# Run "sum file" on every node in nodeFile and compare the checksums.
#
#   $1 - path of the file to compare
#   $2 - file listing the nodes to ask; nothing is done if it is empty
#   $3 - optional; when given, it is printed together with $groupname in
#        the report line
#
# If the checksums differ, the most common checksum is treated as the good
# one.  One good copy and every minority copy are fetched into
# ${LOGDIR}/bad.<name>.files, and the minority nodes are reported in
# ${BASELOGDIR}/problem.${my_hostname}.
# For a file named mmsdrfs the good (or only) checksum is appended to the
# global sdrfssum.
#
function check_files
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_files ]] && set -x
  $mmTRACE_ENTER "$*"

  name=$(basename $1)

  if [[ ! -s $2 ]]
  then
    return
  fi

  # Each output line is "<node>: <checksum> <blocks> ...", so the checksum
  # is the second blank-separated field and the node precedes the colon.
  $mmdsh -F $2 K5MUTE=1 sum $1 > $name.outfile &
  waitforit NULL 60

  # Number of distinct checksums.  The arithmetic test tolerates any
  # leading blanks that wc may emit.
  diffs=$($sort -uk 2,2 $name.outfile | $wc -l)
  if (( diffs > 1 ))
  then
    # Find the checksum reported by the largest number of nodes.
    list=$($sort -uk 2,2 $name.outfile | $awk '{print $2}')
    most=0
    for i in $list
    do
      num=$($grep $i $name.outfile | $wc -l)
      if [[ $num -gt $most ]]
      then
        mostsum=$i
        most=$num
      fi
    done
    # Same test as in the all-agree branch below; the file's base name
    # is held in $name.
    if [[ $name = mmsdrfs ]]
    then
      sdrfssum="$sdrfssum $mostsum"
    fi
    $mkdir ${LOGDIR}/bad.${name}.files
    badlist=$($grep -v $mostsum $name.outfile | $cut -f1 -d :)
    goodlist=$($grep $mostsum $name.outfile | $cut -f1 -d :)

    # Keep one good copy: the local one if this node is in the majority,
    # otherwise the copy from the first majority node.
    echo $goodlist | $grep $my_hostname >/dev/null
    if [[ $? = 0 ]]
    then
      cp $1 ${LOGDIR}/bad.${name}.files/${name}.$my_hostname.good
    else
      goodnode=$(echo $goodlist | $awk '{print $1}')
      $mmdsh -L $goodnode K5MUTE=1 $rcp $1 $my_hostname:${LOGDIR}/bad.${name}.files/${name}.${goodnode}.good &
      waitforit NULL 60
    fi
    if [[ -n $3 ]]
    then
      print "\nThe following nodes $name files are different and are in the minority in $groupname $3" | $tee -a ${BASELOGDIR}/problem.${my_hostname}
    else
      print "\nThe following nodes $name files are different and are in the minority" | $tee -a ${BASELOGDIR}/problem.${my_hostname}
    fi
    echo "$badlist\n" | $tee -a ${BASELOGDIR}/problem.${my_hostname}

    # Fetch every minority copy for later comparison.
    for i in $badlist
    do
      $mmdsh -L $i K5MUTE=1 $rcp $1 $my_hostname:${LOGDIR}/bad.${name}.files/${name}.$i.bad &
      waitforit NULL 60
    done
  else
    if [[ $name = mmsdrfs ]]
    then
      # Take the checksum (second field), not the "<node>:" prefix.
      tsum=$($cat $name.outfile | $head -1 | $awk '{print $2}')
      sdrfssum="$sdrfssum $tsum"
    fi
  fi  # end of if (( diffs > 1 ))

  $rm $name.outfile 2>/dev/null

}   #----- end of function check_files --------------------------
---|
582 | |
---|
583 | |
---|
#
# check_waiters
#
# Have every node in $nodefile2 dump its waiters into
# ${BASELOGDIR}/<node>_waiters, then look for tmMsgRevoke waiters and for
# the longest waiters.  Findings are reported in
# ${BASELOGDIR}/gpfs.snap_info.${logdate}.out.  Unless $yflag is 1, the
# nodes involved are handed to addtolist (the tmMsgRevoke nodes only when
# $aflag is empty).  The sorted waiters are left in
# ${LOGDIR}/long_waiters.sorted.
#
function check_waiters
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_waiters ]] && set -x
  $mmTRACE_ENTER "$*"

  # Check this cluster.
  firstone=1

  # The hostname command substitutions are escaped so that they are
  # evaluated by the shell on each remote node.  Unescaped they would be
  # expanded here, and every node would use this node's name for its file.
  $mmdsh -F $nodefile2 "K5MUTE=1 mkdir ${BASELOGDIR} 2>/dev/null; K5MUTE=1 $mmfsadm dump waiters > ${BASELOGDIR}/\$(hostname -s)_waiters" &
  waitforit NULL 60
  $mmdsh -F $nodefile2 "K5MUTE=1 $mmfsadm dump waiters | grep -v '===== dump waiters ====='" > longwaiters 2>/dev/null &
  waitforit NULL 60

  # A second file name ("yamo") is given so that grep prefixes each match
  # with the name of the file it came from; the node name is parsed out
  # of that prefix below.
  if [[ -s $brcFile ]]
  then
    $mmdsh -F $brcFile "K5MUTE=1 grep tmMsgRevoke ${BASELOGDIR}/\$(hostname | cut -d. -f1)_waiters yamo" >> ${BASELOGDIR}/grepped-waiters 2>/dev/null &
    waitforit NULL 60
  fi

  if [[ -n $bglist ]]
  then
    $mmdsh -L $my_hostname "K5MUTE=1 grep tmMsgRevoke ${BASELOGDIR}/\*_waiters yamo" >> ${BASELOGDIR}/grepped-waiters 2>/dev/null &
    waitforit NULL 60
  fi

  if [[ -s ${BASELOGDIR}/grepped-waiters ]]
  then
    print "\nThere are waiters for tmMsgRevokes:" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    if [[ $yflag != 1 ]]
    then
      print "Data will be collected from these nodes:" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    fi
    newlist=""
    {
      while read line
      do
        # The second colon-separated field is the waiters file path; the
        # node name is the part of its base name before the underscore.
        tnode=$(echo $line | $cut -f2 -d :)
        thisnode=$(basename $tnode | $cut -f1 -d "_")
        echo $line | $grep "tmMsgRevoke on node" >/dev/null 2>&1
        if [[ $? = 0 ]]
        then
          addr=${line##*tmMsgRevoke on node}
          print "waiter on $thisnode, tmMsgRevoke from $addr" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
          echo $newlist | $grep -w $addr >/dev/null 2>&1
          if [[ $? != 0 ]]
          then
            newlist="$newlist $addr"
          fi
        else
          print "waiter on $thisnode, tmMsgRevoke" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
        fi
        echo $newlist | $grep -w $thisnode >/dev/null 2>&1
        if [[ $? != 0 ]]
        then
          newlist="$newlist $thisnode"
        fi
      done
      echo "" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    } < ${BASELOGDIR}/grepped-waiters
    if [[ $yflag != 1 && -z $aflag ]]
    then
      addtolist "$newlist"
    fi
  fi  # end of if [[ -s ${BASELOGDIR}/grepped-waiters ]]

  $rm ${BASELOGDIR}/grepped-waiters 2>/dev/null

  if [[ -s longwaiters ]]
  then
    # Sort on the fourth field, numerically, largest first.
    $sort -nrk 4,4 longwaiters > ${LOGDIR}/long_waiters.sorted
    list=$($cat ${LOGDIR}/long_waiters.sorted | $head -5 | $cut -f1 -d ":")
    if [[ $yflag = 1 ]]
    then
      print "There are long waiters. The 5 longest waiters are on the following nodes:\n$list\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    else
      print "There are long waiters. The 5 longest waiters are on the following nodes, which will be added to the list to collect data from\n$list\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      addtolist "$list"
    fi
  fi  # end of if [[ -s longwaiters ]]

  $rm longwaiters 2>/dev/null

}   #----- end of function check_waiters ------------------------
---|
668 | |
---|
669 | |
---|
#
# check_dumps filePrefix maxFiles dumpDir
#
# Build, in the global internal_list, the names of the files
# dumpDir/filePrefix.*.* that should be collected.
#
#   $1 - file name prefix
#   $2 - maximum number of files to take (the most recently modified ones);
#        a negative value means "take them all"
#   $3 - directory that holds the files
#
# Files whose "ls -l" line contains "shutdown" are ignored, and files that
# appear to be more than 14 days old are reported and skipped.  The age is
# derived from the month and day columns of "ls -l" using the global
# months[] and days[] tables (indexed from 1).
#
function check_dumps
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_dumps ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset fileDay age monthKnown

  filePrefix=$1
  maxFiles=$2
  dumpDir=$3

  internal_list=""

  if [[ ! -a $dumpDir ]]
  then
    return
  fi

  savedir=$(pwd)
  # Do not go on to list whatever directory we happen to be in.
  cd $dumpDir || return

  numfiles=$($ls -ltr $filePrefix.*.* 2>/dev/null | $grep -v shutdown | $wc -l)
  if [[ $numfiles -eq 0 ]]
  then
    cd $savedir
    return
  fi
  if [[ $maxFiles -gt 0 && $numfiles -gt $maxFiles ]]
  then
    print "There are $numfiles $filePrefix files in $dumpDir.\nBecause these files are large I am only grabbing the latest $maxFiles.\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  fi

  # Day of the year, with leading zeros removed so that the value is not
  # taken as an octal number in arithmetic.
  today=$(date +%j)
  while [[ $today = 0?* ]]
  do
    today=${today#0}
  done

  if [[ $maxFiles -lt 0 ]]
  then
    # A negative value indicates "collect them all".
    $ls -ltr $filePrefix.*.* 2>/dev/null | $grep -v shutdown > $tmpfile
  else
    # Collect up to the specified number of files.
    $ls -ltr $filePrefix.*.* 2>/dev/null | $grep -v shutdown | $tail -n -$maxFiles > $tmpfile
  fi

  # Create a list of files in global variable $internal_list
  # that will later be collected after we return to the caller.
  exec 3<&-
  exec 3< $tmpfile
  while read -u3 fileLine
  do
    month=$(echo $fileLine | $awk '{print $6}')
    day=$(echo $fileLine | $awk '{print $7}')
    name=$(echo $fileLine | $awk '{print $9}')

    # Turn the month and day into a day of the year.
    fileDay=0
    monthKnown=0
    i=1
    while [[ -n ${months[$i]} ]]
    do
      if [[ $month = ${months[$i]} ]]
      then
        (( fileDay = fileDay + day ))
        monthKnown=1
        break
      fi
      (( fileDay = fileDay + ${days[$i]} ))
      (( i = i + 1 ))
    done  # end of while [[ -n ${months[$i]} ]] do

    if [[ $monthKnown = 0 ]]
    then
      # The month column was not recognized, so the age is unknown;
      # err on the side of collecting the file.
      age=0
    else
      (( age = today - fileDay ))
      if (( age < 0 ))
      then
        # The file's date falls later in the calendar than today, so it
        # was written last year; count the days across the year boundary.
        (( age = age + 365 ))
      fi
    fi

    if [[ $age -gt 14 ]]
    then
      print "File $name is over 2 weeks old so I am not going to collect it." | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    else
      if [[ -z $internal_list ]]
      then
        internal_list=$name
      else
        internal_list="${internal_list} $name"
      fi
    fi  # end of if [[ $age -gt 14 ]]
  done  # end of while read -u3 fileLine do
  exec 3<&-
  $rm -f $tmpfile

  cd $savedir

}   #----- end of function check_dumps --------------------------
---|
753 | |
---|
754 | |
---|
#
# removefromrlist node
#
# Remove one node from the global lists glist and rlist, and rebuild the
# node file that mirrors rlist.
#
#   $1 - name of the node to remove
#
# On return glist and rlist no longer contain $1, $newrcFile holds the
# remaining rlist nodes one per line (the file does not exist if none
# remain), and rcFile is set to $newrcFile.
#
function removefromrlist
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGremovefromrlist ]] && set -x
  $mmTRACE_ENTER "$*"

  # The accumulators are global, so they must be emptied on every call;
  # otherwise a second call would append to the result of the first one.
  newglist=""
  for i in $glist
  do
    if [[ $i != "$1" ]]
    then
      newglist="$newglist $i"
    fi
  done
  glist=$newglist

  firstone=1
  newrlist=""
  $rm -f $newrcFile
  for i in $rlist
  do
    if [[ $i != "$1" ]]
    then
      if [[ $firstone = 1 ]]
      then
        print -- $i > $newrcFile
        newrlist="$i"
        firstone=0
      else
        print -- $i >> $newrcFile
        newrlist="$newrlist $i"
      fi
    fi
  done
  rlist=$newrlist
  rcFile=$newrcFile

}   #----- end of function removefromrlist ----------------------
---|
791 | |
---|
792 | |
---|
#
# check_fs nodeFile "node [node ...]" [keepHosts]
#
# Split the given nodes according to whether they see the same
# ${BASELOGDIR} as this node.  A marker file is written to
# ${BASELOGDIR}/yamo, every node in nodeFile is asked for the checksum of
# that path, and the answers are compared with the local checksum.
#
#   $1 - file of node names passed to mmdsh
#   $2 - blank-separated list of the same nodes, in hostarray order
#   $3 - optional; when non-empty, the hostarray entries of the matching
#        nodes are copied into ghostarray
#
# Results (all global):
#   tglist   - nodes whose checksum matched
#   trlist   - nodes whose checksum did not match (or gave no answer)
#   $trcFile - the trlist nodes, one per line (absent if there are none)
#
function check_fs
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGcheck_fs ]] && set -x
  $mmTRACE_ENTER "$*"

  # Checksum the marker through the same path it was written to, so the
  # result does not depend on the current working directory.  The value
  # is extracted the same way as on the remote side.
  echo "yamo" > ${BASELOGDIR}/yamo
  yamosum=$($sum ${BASELOGDIR}/yamo | $awk '{print $1}')

  $mmdsh -F $1 "K5MUTE=1 sum ${BASELOGDIR}/yamo 2>/dev/null | awk '{print \$1}'" > rsumfile &
  waitforit

  firstone=1
  trlist=""
  tglist=""
  $rm -f $trcFile

  # g indexes ghostarray; h walks hostarray in step with the nodes in $2.
  g=0
  h=0
  for i in $2
  do
    # Reply lines have the form "<node>: <checksum>".
    thissum=$($grep -E "^$i:" rsumfile | $awk '{print $2}')
    if [[ $thissum = "$yamosum" ]]
    then
      tglist="$tglist $i"
      if [[ -n $3 ]]
      then
        ghostarray[$g]=${hostarray[$h]}
        (( g = g + 1 ))
      fi
    else
      if [[ $firstone = 1 ]]
      then
        print -- $i > $trcFile
        trlist=$i
        firstone=0
      else
        print -- $i >> $trcFile
        trlist="$trlist $i"
      fi
    fi
    (( h = h + 1 ))
  done

  $rm ${BASELOGDIR}/yamo 2>/dev/null
  $rm rsumfile 2>/dev/null

}   #----- end of function check_fs -----------------------------
---|
841 | |
---|
842 | |
---|
############################################################################
#
# Function:  Collect the GPFS daemon logs ($rasDir/mmfs.log.[0-9]*).
#
# Input:     None (arguments, if any, are only handed to the trace hook).
#            Reads the globals pass, x_arg, master, aflag, nodefile,
#            nodefile2, myhname, my_hostname, rasDir, LOGDIR and BASELOGDIR,
#            and the arrays days[] and months[].
#
# Output:    When pass = 1 or x_arg = 2:
#              - builds the egrep alternations greplist, greplistb and
#                greplist2 (all globals) from the date window;
#              - on the master, greps the logs through $mmdsh and writes
#                the sorted result to mmfslogs.sorted in $LOGDIR
#                (its size is registered with addit when pass = 1).
#            When pass = 2:  mmfslogs.sorted is handed to tarit.
#            Always:  the local logs are concatenated into
#                     mmfs.logs.${my_hostname}; the file is sized with addit
#                     (pass 1) or handed to tarit (any other pass).
#
# Notes:     Every variable assigned here is global (nothing but sourceFile
#            is typeset).
#
############################################################################
function get_files
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_files ]] && set -x
  $mmTRACE_ENTER "$*"

  cd $LOGDIR

  if [[ $pass = 1 || $x_arg = 2 ]]
  then
    # Work out the date window: the end date is today, the start date is
    # 14 days earlier.
    # NOTE(review): the date variables are only set when master = 1, yet the
    # pattern-building loop below runs for every node - confirm whether
    # greplist* are meant to be usable on non-master nodes.
    if [[ $master = 1 ]]
    then
      # Fields: 1 = month, 2 = day of month, 3 = hour, 4 = minute, 5 = year.
      ddd=$(date +"%m %e %H %M %y")
      emon=$(echo $ddd | $awk '{print $1}')
      eday=$(echo $ddd | $awk '{print $2}')
      eyr=$(echo $ddd | $awk '{print $5}')
      emon=${emon##0}    # drop a leading zero from the month number
      syr=$eyr; smon=$emon;
      if [[ $eday -gt 14 ]]
      then
        # The whole window lies inside the current month.
        (( sday = eday - 14 ))
      else
        # The window starts in the previous month.
        (( diff = 14 - eday ))
        if [[ $emon -eq 1 ]]
        then
          # January: go back to December (31 days) of the previous year.
          smon=12
          (( syr = eyr - 1 ))
          (( sday = 31 - diff ))
        else
          # presumably days[] holds the number of days in each month,
          # indexed by month number - it is defined outside this function.
          (( smon = smon - 1 ))
          (( sday = ${days[$smon]} - diff ))
        fi
      fi
    fi

    # NOTE(review): "months" is assigned as a scalar here but is indexed as
    # an array (${months[$cmon]}) further down; presumably the array is set
    # up elsewhere and only element 0 is overwritten - confirm.
    if [[ $smon = $emon ]]
    then
      months=1
      endday=$eday
    else
      months=2
      endday=${days[$smon]}
    fi

    # Build the patterns one month at a time: first the start month
    # (sday..endday), then, if the window spans two months, the end month
    # (1..eday).
    curmonth=1
    cmon=$smon
    i=${sday##0}
    while [[ $curmonth -le $months ]]
    do
      if [[ $curmonth -eq 2 ]]
      then
        i=1
        endday=$eday
        cmon=$emon
      fi

      # Single-digit days: one character class [i-ee], with ee capped at 9.
      if [[ $i -lt 10 ]]
      then
        e=${endday##0}
        if [[ $e -lt 10 ]]
        then
          ee=$e
        else
          ee=9
        fi
        # greplist is the only one of the three tested for emptiness; the
        # other two are assumed to be empty/non-empty together with it.
        if [[ -z $greplist ]]
        then
          greplist="${months[$cmon]} [$i-$ee]"
          greplistb="^${months[$cmon]} *[$i-$ee] "
          greplist2="^${cmon}/0[$i-$ee]"
        else
          greplist="${greplist}|${months[$cmon]} [$i-$ee]"
          greplistb="${greplistb}|^${months[$cmon]} *[$i-$ee] "
          greplist2="${greplist2}|^${cmon}/0[$i-$ee]"
        fi
      fi

      # Two-digit days, one decade per iteration (ii = 2, 3, 4):
      #   jj = first day after the decade  (20, 30, 40)
      #   kk = last day before the decade  ( 9, 19, 29)
      #   ll = first day of the decade     (10, 20, 30)
      #   mm = tens digit of the decade    ( 1,  2,  3)
      ii=2
      while [[ $ii -lt 5 ]]
      do
        (( jj = ii * 10 ))
        (( kk = jj - 11 ))
        (( ll = jj - 10 ))
        (( mm = ii - 1 ))
        # Only emit a pattern if the window overlaps this decade.
        if [[ $i -lt $jj && $endday -gt $kk ]]
        then
          # s = first units digit: offset of the start day if it lies in
          # this decade, otherwise 0.
          if [[ $i -gt $kk ]]
          then
            (( s = i - ll ))
          else
            s=0
          fi

          # e = last units digit: offset of the end day, or 9 if the
          # window runs past this decade.
          (( e = endday - ll ))
          if [[ $endday -ge $jj ]]
          then
            e=9
          fi
          if [[ -z $greplist ]]
          then
            greplist="${months[$cmon]} ${mm}[$s-$e]"
            greplistb="^${months[$cmon]} ${mm}[$s-$e]"
            greplist2="^${cmon}/${mm}[$s-$e]"
          else
            greplist="${greplist}|${months[$cmon]} ${mm}[$s-$e]"
            greplistb="${greplistb}|^${months[$cmon]} ${mm}[$s-$e]"
            greplist2="${greplist2}|^${cmon}/${mm}[$s-$e]"
          fi
        fi
        (( ii = ii + 1 ))
      done
      (( curmonth = curmonth + 1 ))
    done

    if [[ $master = 1 ]]
    then
      # Grep the daemon logs for lines inside the date window.  With aflag
      # set the node list is $nodefile (plus -L $myhname); otherwise
      # $nodefile2 is used, and only if it is non-empty.
      if [[ -n $aflag ]]
      then
        $mmdsh -L $myhname -F $nodefile K5MUTE=1 "grep -E '$greplist' $rasDir/mmfs.log.[0-9]*" > mmfslogs.unsorted 2>/dev/null
      else
        if [[ -s $nodefile2 ]]
        then
          $mmdsh -F $nodefile2 K5MUTE=1 "grep -E '$greplist' $rasDir/mmfs.log.[0-9]*" > mmfslogs.unsorted 2>/dev/null
        fi
      fi
      # Sort on fields 3 through 5.  If no grep was run the input file does
      # not exist; the error is discarded and an empty output file results.
      $sort -k3,5 mmfslogs.unsorted > mmfslogs.sorted 2>/dev/null

      $rm mmfslogs.unsorted 2>/dev/null

      if [[ $pass = 1 ]]
      then
        # Field 5 of "ls -l" is the size in bytes.
        size=$($ls -l mmfslogs.sorted | $awk '{print $5}')
        if [[ $size != 0 ]]
        then
          addit $size mmfslogs.sorted
        fi
      fi
    fi
  fi   # end of if [[ $pass = 1 || $x_arg = 2 ]]

  if [[ $pass = 2 ]]
  then
    # tarit is called from $BASELOGDIR.
    cd $BASELOGDIR
    tarit mmfslogs.sorted
  fi

  # Collect this node's own daemon logs into a single file.
  cd $LOGDIR
  $cat $rasDir/mmfs.log.[0-9]* > mmfs.logs.${my_hostname}
  size=$($ls -l mmfs.logs.${my_hostname} | $awk '{print $5}')
  if [[ $pass = 1 ]]
  then
    addit $size mmfs.logs.${my_hostname}
    cd $BASELOGDIR
  else
    cd $BASELOGDIR
    tarit mmfs.logs.${my_hostname}
  fi

}  #----- end of function get_files ----------------------------
1001 | |
---|
1002 | |
---|
############################################################################
#
# Function:  Get all of the files in a specified directory.
#
# Input:     $1 - directory whose files are to be gotten
#
# Output:    Pass 1:  Calculate space for the files in the directory.
#            Pass 2:  Add the files in the directory to the tar file.
#            (Both are carried out by get_files_list.)
#
# Returns:   0
#
# Notes:     Names matching the grep pattern "complete.map" are left out.
#            If $1 is empty or is not a directory that can be entered,
#            nothing is collected.  The caller's working directory is
#            never changed.
#
############################################################################
function get_files_dir   # <dirName>
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_files_dir ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset dirName=$1
  typeset fileList

  # Without this guard an empty or missing directory would make the
  # listing below describe some other directory (cwd or $HOME).
  if [[ -z $dirName || ! -d $dirName ]]
  then
    return 0
  fi

  # Generate the list of files to get, but leave out complete.map files.
  # The cd happens inside the command substitution (a subshell), so the
  # current directory of the caller is untouched; if the cd fails the
  # listing is skipped and the list stays empty.
  fileList=$(cd "$dirName" 2>/dev/null && $ls -A 2>/dev/null | $grep -v "complete.map")

  # Then invoke get_files_list() to get the files.
  if [[ -n $fileList ]]
  then
    get_files_list "$dirName" "$fileList"
  fi

  return 0

}  #----- end of function get_files_dir ------------------------
1038 | |
---|
1039 | |
---|
############################################################################
#
# Function:  Get a named set of files out of one directory.
#
# Input:     $1 - directory that holds the files
#            $2 - blank-separated list of file names inside $1
#            $3 - (optional) path to use for the files inside the tar
#                 file; when omitted, $1 without its leading "/" is used
#
# Output:    Pass 1:  The size of every listed regular file is registered
#                     through addit.
#            Pass 2:  Every listed regular file is handed to tarit (with
#                     the flag that keeps tarit from deleting it).
#            List entries that are not regular files are ignored.
#
# Returns:   No explicit return; the status is that of the last command.
#
# Notes:     A temporary symlink to $1 is placed under $LOGDIR for the
#            duration of the call.  The global variables dir and
#            temp_bytes are overwritten.
#
############################################################################
function get_files_list   # <dirName> <fileList> [<subdirName>]
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_files_list ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset dirName=$1
  typeset fileList=$2
  typeset subdirName=$3
  typeset relDir entry

  # Path of the files relative to $LOGDIR: the caller-supplied name if
  # there is one, else the source directory minus its leading slash.
  relDir=${subdirName:-${dirName#/}}

  # $dir is where the symlink gets created.  For a multi-level path the
  # parent levels are created as real directories first.
  case $relDir in
    */* )
      dir=${relDir%/*}
      $mkdir -p ${LOGDIR}/$dir 2>/dev/null
      ;;
    * )
      dir=$relDir
      ;;
  esac

  # Make the source directory reachable as ${LOGDIR}/$relDir.
  ln -s $dirName ${LOGDIR}/$dir

  # Walk the list: size the file on pass 1, archive it otherwise.
  for entry in $fileList
  do
    [[ -f $dirName/$entry ]] || continue

    if [[ $pass = 1 ]]
    then
      temp_bytes=$($ls -l $dirName/$entry | $awk '{ print $5 }')
      addit $temp_bytes "$dirName/$entry file"
    else
      tarit "$relDir/$entry" 1
    fi
  done

  # Drop the temporary symlink again.
  $rm ${LOGDIR}/$relDir 2>/dev/null

}  #----- end of function get_files_list -----------------------
1106 | |
---|
1107 | |
---|
############################################################################
#
# Function:  Run the set of commands whose output is collected on every
#            invocation, each one through doit.
#
# Input:     None.  Reads the globals os, gotvmstat and mmScriptTrace.
#
# Output:    One doit call per command: an AIX-specific group when
#            os is "AIX", a second group for any other value of os,
#            and then a group common to both.
#
# Returns:   0
#
############################################################################
function get_always
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_always ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset pkg

  case $os in
    AIX )
      doit "errpt_a" "/usr/bin/errpt -a"
      doit "lscfg_vp" "lscfg -vp"
      doit "lslpp_hac" "/usr/bin/lslpp -hac"
      doit "lssrc_a" "lssrc -a"
      doit "no_a" "no -a"
      [[ $gotvmstat = 1 ]] && doit "vmstat_s" "vmstat -s"

      # Pick up the console log only if lscons names a non-empty file
      # that does not live under /dev.
      conslog=$(lscons)
      if [[ $conslog != +(/)dev+(/)* && -s $conslog ]]
      then
        doit "lscons" "cat $conslog"
      fi
      ;;

    * )
      doit "dmesg" "dmesg"
      doit "fdisk_l" "fdisk -l"
      doit "lsmod" "lsmod"
      doit "lspci" "lspci"
      doit "rpm_qa" "rpm -qa"

      # The first three verifications pass 1 as the output-control flag,
      # which keeps the shared rpm_verify log open; the last call passes
      # no flag, so doit hands the log to tarit.
      for pkg in gpfs.base gpfs.docs gpfs.gpl
      do
        doit "rpm_verify" "rpm --verify $pkg" 1
      done
      doit "rpm_verify" "rpm --verify gpfs.msg.en_US"

      doit "uname_a" "uname -a"
      doit "proc_cpuinfo" "cat /proc/cpuinfo"
      doit "proc_version" "cat /proc/version"
      doit "site_mcr" "cat /usr/lpp/mmfs/src/config/site.mcr"
      doit "etc_release" "$grep '[a-zA-Z]' /etc/*release"
      ;;
  esac

  # Commands that are run whatever the value of os.
  doit "date" "date"
  doit "df_k" "df -k"
  doit "exportfs" "exportfs"
  doit "gpfs_executables" "$ls -l /usr/lpp/mmfs/bin"
  doit "ipcs_a" "ipcs -a"
  doit "ls_dev" "$ls -l /dev"
  doit "ps_edf" "ps -edf"
  doit "uptime" "uptime"

  doit "mmdevdiscover" "/usr/lpp/mmfs/bin/mmdevdiscover"
  doit "tspreparedisk_S" "/usr/lpp/mmfs/bin/tspreparedisk -S"

  # Include the script trace file if one is in use and has content.
  if [[ $mmScriptTrace != /dev/null && -s $mmScriptTrace ]]
  then
    doit "mmScriptTrace" "cat $mmScriptTrace"
  fi

  return 0

}  #----- end of function get_always ---------------------------
1169 | |
---|
1170 | |
---|
############################################################################
#
# Function:  Collect network configuration and statistics through doit.
#
# Input:     None.  Reads the globals os and pass.
#
# Output:    When os is "AIX": netstat -i -n / -m / -D, and for every
#            interface reported by lsdev (excluding lines containing
#            "Defined") the output of ifconfig, odmget and lsattr.
#            For any other os: ifconfig for every interface named in the
#            first column of "netstat -i -n".
#            In both cases: netstat -i, -r, -rn, -v and -s.
#            On pass 2 the per-interface logs (ifconfig, and on AIX lsattr
#            and odmget_CuAt) are handed to tarit; they were written with
#            output-control flag 1, so doit did not archive them itself.
#
# Returns:   No explicit return; the status is that of the last command.
#
# Notes:     The global INTERFACES is overwritten.  The loop variable is
#            local so that a caller's own "i" is not clobbered.
#
############################################################################
function get_net_stuff
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_net_stuff ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset i

  if [[ $os = "AIX" ]]
  then
    doit "netstat" "netstat -i -n" "1"
    doit "netstat" "netstat -m" "1"
    doit "netstat" "netstat -D" "1"
    INTERFACES=$($lsdev -Cc if | $grep -v Defined | $cut -d " " -f1)
    for i in $INTERFACES
    do
      doit "ifconfig" "ifconfig $i" 1
      doit "odmget_CuAt" "odmget -q name=$i CuAt" 1
      doit "lsattr" "lsattr -El $i" 1
    done
  else
    # Column 1 of "netstat -i -n", minus the two heading lines.
    INTERFACES=$(netstat -i -n | $cut -f1 -d " " | $grep -v "Kernel" | $grep -v "Iface")
    for i in $INTERFACES
    do
      doit "ifconfig" "ifconfig $i" 1
    done
  fi  # end of if [[ $os = "AIX" ]]

  # All netstat output shares one log; the final call (no output-control
  # flag) is the one that lets doit archive it.
  doit "netstat" "netstat -i" "1"
  doit "netstat" "netstat -r" "1"
  doit "netstat" "netstat -rn" "1"
  doit "netstat" "netstat -v 2>/dev/null" "1"
  doit "netstat" "netstat -s"

  if [[ $pass = 2 ]]
  then
    tarit "ifconfig"
    if [[ $os = "AIX" ]]
    then
      tarit "lsattr"
      tarit "odmget_CuAt"
    fi
  fi

}  #----- end of function get_net_stuff ------------------------
1214 | |
---|
1215 | |
---|
############################################################################
#
# Function:  Collect logical volume manager data through doit.
#            Nothing is done unless os is "AIX".
#
# Input:     None.  Reads the globals os, pass and lspvOutputFile.
#
# Output:    lsfs and lspv output, lsattr for every disk named in the
#            first column of the lspv output, lsvg data for the volume
#            groups reported by "lsvg -o" and "lsvg", and getlvodm data.
#            On pass 2 the lspv, lsvg and getlvodm_u logs are handed to
#            tarit (all of their doit calls use output-control flag 1).
#
# Returns:   No explicit return; the status is that of the last command.
#
# Notes:     File descriptor 3 is used to read $lspvOutputFile and is
#            closed again before the function continues.
#
############################################################################
function get_lvm_stuff
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGget_lvm_stuff ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset lspvLine pdisk i VGs

  if [[ $os = "AIX" ]]
  then
    doit "lsfs" "lsfs"
    doit "lspv" "lspv" 1

    # Capture the disk list in a file and read it back on descriptor 3,
    # so that commands started by doit cannot consume the list via stdin.
    LC_ALL=C $lspv > $lspvOutputFile
    exec 3<&-
    exec 3< $lspvOutputFile
    while read -u3 lspvLine
    do
      # Split the line into words with globbing switched off;
      # the first word is the disk name.
      set -f ; set -- $lspvLine ; set +f
      pdisk=$1
      # A blank line would otherwise produce "lsattr -El" with no device.
      [[ -z $pdisk ]] && continue
      doit "lspv" "lsattr -El $pdisk" "1"
    done
    # Release the descriptor now that the list has been consumed.
    exec 3<&-
    $rm -f $lspvOutputFile

    doit "lsvg" "lsvg" "1"
    doit "lsvg" "lsvg -o" "1"
    VGs=$($lsvg -o)
    for i in $VGs
    do
      doit "lsvg" "lsvg -l $i" "1"
      doit "getlvodm_u" "getlvodm -u $i" "1"
    done

    VGs=$($lsvg)
    for i in $VGs
    do
      doit "lsvg" "$ls -l /dev/$i" "1"
    done
    if [[ $pass = 2 ]]
    then
      tarit lspv
      tarit lsvg
      tarit getlvodm_u
    fi
  fi  # end of if [[ $os = "AIX" ]]

}  #----- end of function get_lvm_stuff ------------------------
1263 | |
---|
1264 | |
---|
############################################################################
#
# Function:  Add one item to the running space estimate (pass 1).
#
# Input:     $1 - size of the item in bytes (a missing or empty value is
#                 counted as 0 instead of causing an arithmetic error)
#            $2 - description of the item for the sizes log
#
# Output:    Updates the globals
#              total_bytes - sum of all sizes registered so far
#              max_tmp     - total_bytes plus this item's size once more
#              max_bytes   - largest max_tmp seen so far
#              first       - set to 1 after the first call
#            and writes one line to ${LOGDIR}/sizes; the first call
#            (first != 1) truncates the file, later calls append.
#
# Returns:   No explicit return; the status is that of the last command.
#
############################################################################
function addit   # <bytes> <description>
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGaddit ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset bytes=${1:-0}

  # total_bytes is total_bytes at the end
  # max_tmp is max bytes while processing this file at one time
  #         (size of tarfile + 2 * sizeof_file)
  # max_bytes is max bytes while processing ANY file
  #
  # The variables are named without "$" inside (( )) so that a counter
  # that has not been initialized yet is evaluated as 0.
  (( total_bytes = total_bytes + bytes ))
  (( max_tmp = total_bytes + bytes ))
  if (( max_tmp > max_bytes ))
  then
    max_bytes=$max_tmp
  fi

  if [[ $first = 1 ]]
  then
    echo "estimate $2 will take $bytes bytes" >> ${LOGDIR}/sizes
  else
    # First entry of this run: start the sizes file afresh.
    echo "estimate $2 will take $bytes bytes" > ${LOGDIR}/sizes
  fi
  first=1

}  #----- end of function addit --------------------------------
1291 | |
---|
1292 | |
---|
###############################################################################
#
# Function waitforit  (wait for the most recently-started background process)
#
# Arguments:
#   $1  (optional) Command string of the background job.  When given and not
#       the literal NULL, the process to watch is looked up by matching the
#       first 70 bytes of this string against the ps listing of process
#       group $mypgid; otherwise the pid in $! is watched directly.
#   $2  (optional) Iteration limit; defaults to 20.  The process is polled
#       with a 1 second sleep between polls.
#
# Returns:
#   0  the process went away before the limit was reached (or no pid was
#      found to watch)
#   1  the limit was reached; the process and its direct children were
#      killed and a message was written to gpfs.snap_err.${logdate}.out
#
# Notes:
#   All work variables (mpid, tmpid, comm, words, mmpid, counter, count,
#   pid, ppid, parent, parcomm, line) are globals.  A scratch file named
#   tmpout is created in, and removed from, the current directory.
#
###############################################################################
function waitforit
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGwaitforit ]] && set -x
  $mmTRACE_ENTER "$*"

  # Must be the first real statement: $! is the pid of the job the caller
  # has just put in the background.
  mpid=$!

  if [[ -n $1 && $1 != NULL ]]
  then
    tmpid=$mpid
    comm=$(echo $1 | $head -c 70)
    $sleep 1
    # Take the last process in our process group whose argument string
    # matches the command.  An empty result means it has already finished.
    mpid=$($ps -g $mypgid -o pid=PID,args=COMM | $grep "$comm" | $grep -v grep | $tail -n -1 | $awk '{print $1}')

    # In some cases commands fork themselves. We need to make sure
    # we have the parent process; try to find the right one.
    words=$(echo $mpid | $wc -w)
    words=${words##*( )}    # strip the leading blanks some wc versions emit
    if [[ $words != 1 && $words != 0 ]]
    then
      echo "got a multiple: $mpid comm is $comm" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      # if there is more than one try to get the direct descendent of
      # gpfs.snap or the pid
      mmpid=$($ps -f | $grep "$comm" | $grep -v grep | $grep -E "$$|$tmpid" | $awk '{print $2}')
      if [[ -z $mmpid ]]
      then
        # Nothing better found: settle for the first pid in the list.
        mpid=$(echo $mpid | $head -n 1)
        mpid=$(echo $mpid | $awk '{print $1}')
      else
        mpid=$mmpid
      fi
    fi
  fi  # end of if [[ -n $1 && $1 != NULL ]]

  counter=1
  if [[ -n $2 ]]
  then
    count=$2
  else
    count=20
  fi

  # Poll until the process is gone (mpid becomes empty) or the limit is hit.
  while [[ $counter -le $count && -n $mpid ]]
  do
    if [[ $counter -eq $count ]]
    then
      # Timed out: report the command, then kill it.
      if [[ -n $1 && $1 != NULL ]]
      then
        comm=$1
      else
        comm=$($ps -fp $mpid -o args=ARGS | $tail -n -1)
      fi
      print "\nThe following command timed out!:\n$comm\n" | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      # kill any children of the process we are about to kill
      # (for the case where we were called with pipes in the command line)

      # The grep is only a coarse filter (it also matches pids that merely
      # contain $mpid); the exact parent check is made in the loop below.
      $ps -o pid=PID,ppid=PPID | $grep $mpid | $grep -v "grep $mpid" > tmpout
      {
        while read line
        do
          pid=$(echo $line | $awk '{print $1}')
          ppid=$(echo $line | $awk '{print $2}')
          # echo $line | read pid ppid
          if [[ $ppid = $mpid ]]
          then
            $kill -9 $pid
          fi
        done
      } < tmpout
      $rm tmpout

      echo "killing $mpid"
      $kill -9 $mpid 2>/dev/null
      $sleep 1
      # special check to catch defunct children of wc -c (pass 1):
      # if the killed process is still listed as defunct and its parent's
      # command name is wc, kill that parent as well
      parent=$($ps -fp $mpid | $grep defunct | $awk '{print $3}')
      if [[ -n $parent ]]
      then
        parcomm=$($ps -p $parent | $tail -n -1 | $awk '{print $4}')
        if [[ $parcomm = wc ]]
        then
          $kill -9 $parent 2>/dev/null
        fi
      fi
      set +x
      return 1
    fi  # end of if [[ $counter -eq $count ]]
    $sleep 1
    (( counter = counter + 1 ))
    # Re-check the process; the result is empty once it no longer exists.
    mpid=$($ps -p $mpid | $awk '{print $1}' | $grep -v "PID")

  done  # end of while [[ $counter -le $count && -n $mpid ]] do

  return 0

}  #----- end of function waitforit ----------------------------
1395 | |
---|
1396 | |
---|
###############################################################################
#
# Function doit
#
#   Pass 1:  run the command only to measure how many bytes of output it
#            produces, and register the estimate through addit.
#   Pass 2:  run the command for real, appending a header and its stdout
#            to the log file, and optionally archive the log file.
#
#   All arguments are optional except $2.  If you do not desire an action,
#   pass "" for the parameter, or just leave out trailing args completely.
#   If $2 is empty no command is run.
#
# Arguments:
#   $1  Name of the log file; output goes to $LOGDIR/$1.
#   $2  The command to be run (through "ksh -c").
#       Pass 1: stderr is discarded.
#       Pass 2: stdout is appended to the log file.  stderr is captured in
#       a temporary file; if anything was written to it, it is shown on
#       the screen and appended to ${BASELOGDIR}/gpfs.snap_err.${logdate}.out.
#   $3  "Output control" flag (pass 2 only).
#       If null, the log file is handed to tarit after the command has run.
#       If non-null, the log file is left in place so that more data can
#       be added to it; a later invocation with a null flag (or an
#       explicit tarit call) archives it.
#       NOTE(review): callers may pass 1 or 2; this function treats every
#       non-null value alike.  Any special handling of the value 2
#       (hostname prefix for -c output) is not done here - presumably it
#       is the caller's job; confirm.
#
# Notes:
#   The command "netstat -D" is given a 60-iteration time limit in
#   waitforit; everything else uses waitforit's default.
#   The globals temp_bytes, temp_bytes2 and tmp2_bytes are overwritten.
#
###############################################################################
function doit   # <logFile> <cmdToRun> <outputControl>
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGdoit ]] && set -x
  $mmTRACE_ENTER "$*"
  typeset logFile=$1
  typeset cmdToRun=$2
  typeset outputControl=$3
  typeset rc

  if [[ $pass = 1 ]]
  then
    if [[ -n $cmdToRun ]]
    then
      # Count the bytes of output; $YAMO receives the count.
      # waitforit must be the next command started so that $! still
      # refers to this background job.
      ksh -c "PATH=$PATH $cmdToRun" 2>/dev/null | $wc -c >$YAMO &
      if [[ $cmdToRun = "netstat -D" ]]
      then
        waitforit "$cmdToRun" "60"
      else
        waitforit "$cmdToRun"
      fi
      rc=$?
      if [[ $rc = 0 && -s $YAMO ]]
      then
        # Estimate = command output + the command text itself + 153 bytes
        # (presumably an allowance for the header lines written on pass 2).
        temp_bytes=$($cat $YAMO | $awk '{print $1}')
        temp_bytes2=$(echo $cmdToRun | $wc -c)
        tmp2_bytes=$(( temp_bytes + temp_bytes2 + 153 ))
        # Quoted so that the whole command, not just its first word,
        # is recorded as the description in the sizes log.
        addit $tmp2_bytes "$cmdToRun"
      fi
    fi
    $rm $YAMO 2>/dev/null
  else
    if [[ -n $cmdToRun ]]
    then
      # Header that separates this command's output inside the log file.
      print "" >> $LOGDIR/$logFile
      print "$outputDelimiter" >> $LOGDIR/$logFile
      print "Output for $cmdToRun on $($hostname | $cut -d. -f1)" >>${LOGDIR}/$logFile
      print "$outputDelimiter" >> $LOGDIR/$logFile
      ksh -c "PATH=$PATH $cmdToRun" >>$LOGDIR/$logFile 2>${BASELOGDIR}/gpfs.snap_err.out.temp &
      if [[ $cmdToRun = "netstat -D" ]]
      then
        waitforit "$cmdToRun" "60"
      else
        waitforit
      fi

      # Report anything the command wrote to stderr.
      if [[ -s ${BASELOGDIR}/gpfs.snap_err.out.temp ]];
      then
        print "\nErrata from $cmdToRun:" | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
        $cat ${BASELOGDIR}/gpfs.snap_err.out.temp | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
        $rm ${BASELOGDIR}/gpfs.snap_err.out.temp 2>/dev/null
      fi

      if [[ -z $outputControl ]]
      then
        tarit $logFile
      fi  # end of if [[ -z $outputControl ]]
    fi  # end of if [[ -n $cmdToRun ]]
  fi  # end of if [[ $pass = 1 ]]

}  #----- end of function doit ---------------------------------
1484 | |
---|
1485 | |
---|
############################################################################
#
# Function:  Put one collected file (or directory) into the global tar
#            file, creating the tar file if it is not there yet.
#
# Input:     $1 - name of the item, relative to ${SUBDIR}
#            $2 - "keep" flag: when omitted or empty, ${LOGDIR}/$1 is
#                 removed after it has been archived; when set to any
#                 non-empty value, it is left in place
#
# Output:    $tarfile has been created or extended.  Nothing happens if
#            ${SUBDIR}/$1 does not exist.
#
# Returns:   No explicit return; the status is that of the last command.
#
# Notes:     The item is tested for and archived by the relative name
#            ${SUBDIR}/$1, so the caller's current directory has to be
#            the one that contains ${SUBDIR} (the callers in this file
#            use $BASELOGDIR).  The removal uses ${LOGDIR}/$1.
#
############################################################################
function tarit
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGtarit ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset tarMode

  # Nothing to archive.
  if [[ ! -a ${SUBDIR}/$1 ]]
  then
    return 0
  fi

  # Append to an existing archive, otherwise start a new one.
  tarMode="-cf"
  [[ -a $tarfile ]] && tarMode="-rf"
  tar $tarMode $tarfile ${SUBDIR}/$1

  # Unless the caller asked to keep it, the original is no longer needed.
  if [[ -z $2 ]]
  then
    $rm -r ${LOGDIR}/$1 2>/dev/null
  fi

}  #----- end of function tarit --------------------------------
1523 | |
---|
1524 | |
---|
############################################################################
#
# Function:  Work that is done on the master node only: find out which
#            nodes are of special interest (configuration servers and
#            file system managers), add them to the node list, make sure
#            every node in the list has room and a current copy of
#            gpfs.snap, and start the space-check run of gpfs.snap on
#            those nodes.
#
# Input:     None.  Reads, among others, the globals os, aflag, devlist,
#            yflag, pflag, x_arg, nodefile, commaFile, mmsdrfsfile, rasDir,
#            BASELOGDIR, logdate, my_hostname, myhname, SNAP, spath and
#            node_args.
#
# Output:    Informational lines in gpfs.snap_info.${logdate}.out, problem
#            and error lines in problem.${my_hostname} and
#            gpfs.snap_err.${logdate}.out (all under $BASELOGDIR).
#            $nodefile may be changed through addtolist / removefromlist.
#            Unless x_arg = 2, a background $mmdsh job is left running
#            whose output goes to $BASELOGDIR/pass1outfile.
#
# Notes:     Nothing but sourceFile is typeset; every other variable
#            assigned here is global.  Scratch files lsout, tmplist, dfout,
#            sumout, sumerr and tmperr are used in the current directory.
#
############################################################################
function do_master_stuff
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGdo_master_stuff ]] && set -x
  $mmTRACE_ENTER "$*"

  if [[ $os = "Linux" ]]
  then
    # Split the first line of $mmsdrfsfile on ":" into array v.  The extra
    # "-" becomes element 0, so the fields themselves start at index 1.
    line=$($head -n1 $mmsdrfsfile)
    IFS_sv="$IFS"
    IFS=":"
    set -f ; set -A v -- - $line ; set +f
    IFS="$IFS_sv"
    # addlist accumulates the names of nodes to be added to the node list.
    addlist=${v[$PRIMARY_SERVER_Field]}
    echo "Primary server is: ${v[$PRIMARY_SERVER_Field]}" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    if [[ -n ${v[$BACKUP_SERVER_Field]} ]]
    then
      echo "Backup server is: ${v[$BACKUP_SERVER_Field]}" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      addlist="$addlist ${v[$BACKUP_SERVER_Field]}"
    fi
  fi  # end of if [[ $os = "Linux" ]]

  echo "\nGetting file system manager information . . .\n"

  # With aflag set, mmlsmgr is asked about every device returned by
  # getUsedDevices; otherwise $devlist keeps whatever value it already has.
  dev2list=$(getUsedDevices)
  if [[ -n $aflag ]]
  then
    devlist=$dev2list
  fi

  # Keep only the data lines of the mmlsmgr output (drop the heading line
  # and the dashed separator line).
  $mmlsmgr $devlist 2>/dev/null | $grep -v "^file system" | $grep -v "^\-\-\-\-\-\-" > lsout

  if [[ -s lsout ]]
  then
    echo "According to mmlsmgr . . ." | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
    {
      while read line
      do
        if [[ -z $line ]]
        then
          continue
        fi
        # Word 1 is the file system, word 2 the manager node, and the text
        # in parentheses is reported as the manager.
        fs=$(echo $line | $cut -f1 -d " ")
        manager=$(echo $line | $cut -f2 -d "(" | $cut -f1 -d ")")
        mannum=$(echo $line | $awk '{print $2}')
        # A "." in word 2 is taken to mean it is not a plain node number;
        # field 5 of the matching MEMBER_NODE line is used in its place.
        echo $mannum | $grep "\." >/dev/null
        if [[ $? = 0 ]]
        then
          mannum=$($grep $mannum $mmsdrfsfile | $grep MEMBER_NODE | $cut -f5 -d ":")
        fi
        # "(none" is what word 2 looks like when no manager is reported.
        if [[ $mannum != "(none" ]]
        then
          nodesetID=$(findNodesetId $mmsdrfsfile $mannum)
          name=$(getNodeInfo $REL_HOSTNAME_Field $NODE_NUMBER_Field $mannum $nodesetID $mmsdrfsfile)
          addlist="$addlist $name"
        else
          name=""
        fi
        print "The manager of $fs is $manager ($name)" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      done
    } < lsout
    # NOTE(review): skipone is not read anywhere in this function;
    # presumably it is consumed elsewhere in the file.
    skipone=1
  else
    print "Couldn't get filesystem manager info from daemon. Trying log files. . ." | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  fi  # end of if [[ -s lsout ]]

  $rm lsout 2>/dev/null

  echo "\nAssessing file system manager data from logs . . .\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out

  # Daemon logs, most recently modified first.
  list=$($ls -t $rasDir/mmfs.log.[0-9]* 2>/dev/null)
  for fs2 in $dev2list
  do
    # Per file system: which kinds of manager events were seen already.
    gotlast=0;gotresigned=0;gotappointed=0;
    fs=${fs2#/dev/}
    $rm tmplist 2>/dev/null
    for file in $list
    do
      # Pull the manager-related messages for this file system out of the
      # log; each line is prefixed with a line number and, per log file,
      # put in descending order of that number.
      $grep -nE "Cannot mount file system|unmounted because it|as manager|last file system manager" $file | $grep -nE "$fs|last file system manager" | $sort -nrk 1,1 >> tmplist
    done
    if [[ -s tmplist ]]
    then
      {
        while read line
        do
          echo $line | $grep "last file system manager" >/dev/null
          if [[ $? = 0 ]]
          then
            continue
          fi
          # For an unmount / mount-failure message the line that follows it
          # in tmplist is examined instead.
          echo $line | $grep -E "unmounted | Cannot mount" >/dev/null
          if [[ $? = 0 ]]
          then
            read line
          fi
          manager=$(echo $line | $cut -f2 -d "(" | $cut -f1 -d ")")
          mannum=$(echo $line | $cut -f1 -d "(")
          echo $line | $grep "last file system manager" >/dev/null
          if [[ $? = 0 ]]
          then
            # Only reachable for a line obtained by the extra read above.
            if [[ $gotlast = 1 ]]
            then
              continue
            fi
            mannum=${mannum#*was node }
            gotlast=1
            message="failed as"
            # This consumes one more line of tmplist.
            read message2
          else
            echo $line | $grep "resigned as" >/dev/null
            if [[ $? = 0 ]]
            then
              if [[ $gotresigned = 1 ]]
              then
                continue
              fi
              mannum=${mannum#* Node }
              gotresigned=1
              message="resigned as"
            else
              # Anything else is treated as an appointment; only the two
              # first ones encountered are reported.
              if [[ $gotappointed = 2 ]]
              then
                continue
              else
                mannum=${mannum#* Node }
                if [[ $gotappointed = 0 ]]
                then
                  gotappointed=1
                  message="last appointed"
                else
                  gotappointed=2
                  message="2nd last appointed"
                fi
              fi
            fi
          fi  # end of if [[ $? = 0 ]]
          echo $mannum | $grep "\." >/dev/null
          if [[ $? = 0 ]]
          then
            mannum=$($grep $mannum $mmsdrfsfile | $grep MEMBER_NODE | $cut -f5 -d ":")
          fi
          nodesetID=$(findNodesetId $mmsdrfsfile $mannum)
          # NOTE(review): this call uses $mmsdrfsFile (capital F) while the
          # rest of the function uses $mmsdrfsfile - confirm that both
          # variables exist and name the same file.
          name=$(getNodeInfo $REL_HOSTNAME_Field $NODE_NUMBER_Field $mannum $nodesetID $mmsdrfsFile)
          # Manager failures go to the problem file, the rest to the info file.
          if [[ $message = "failed as" ]]
          then
            outfile=${BASELOGDIR}/problem.${my_hostname}
          else
            outfile=${BASELOGDIR}/gpfs.snap_info.${logdate}.out
          fi
          print "$manager ($name) $message manager of $fs" | $tee -a $outfile
          # NOTE(review): message2 is never cleared, and the line printed
          # here is identical to the one just above - confirm the intent.
          if [[ -n $message2 ]]
          then
            print "$manager ($name) $message manager of $fs" | $tee -a $outfile
            manager=""
          fi
          addlist="$addlist $name"
        done
      } < tmplist
      # NOTE(review): "gotresigned" and "gotappointed" lack the "$", so the
      # literal words are compared with 1 and 2 and this test can never be
      # true.  Adding the "$" would make the break leave the file system
      # loop (skipping the remaining file systems) - confirm the intent
      # before changing it.
      if [[ $gotlast = 1 && gotresigned = 1 && gotappointed = 2 ]]
      then
        break
      fi
    fi  # end of if [[ -s tmplist ]]
    $rm tmplist 2>/dev/null
  done

  if [[ $yflag != 1 && $pflag != 1 ]]
  then
    addtolist "$addlist"
  fi

  bigtarfile=${BASELOGDIR}/all.${logdate}.tar
  if [[ -s $nodefile ]]
  then
    # Checksum of the local gpfs.snap, for comparison with the other nodes.
    mysum=$($sum $spath | $cut -f1 -d " ")
    NODESDIR=${BASELOGDIR}/${logdate}
    $mkdir $NODESDIR

    node_list=$($cat $nodefile)
    if [[ -n $node_list ]]
    then
      if [[ $x_arg = 1 ]]
      then
        print "\nWould fork gpfs.snap on nodes $node_list\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      else
        print "\nForking gpfs.snap on nodes:\n$node_list\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
      fi

      # Drop every node on which the top-level directory of $BASELOGDIR
      # shows 100% in its df output.
      dir=/$(echo $BASELOGDIR | $cut -f2 -d "/")
      $mmdsh -F $nodefile "K5MUTE=1 df $dir | tail -n -1 | grep 100%" >dfout &
      waitforit NULL 60
      # NOTE(review): the 2>/dev/null applies to the assignment, not to the
      # commands inside $( ) - confirm whether it was meant to go inside.
      list=$($cat dfout | $cut -f1 -d :) 2>/dev/null
      $rm dfout 2>/dev/null
      if [[ -n $list ]]
      then
        print "$dir is 100% full on the following nodes:\n$list\nRemoving from list." | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
        removefromlist "$list"
      fi
      if [[ -s $nodefile ]]
      then
        $mmdsh -F $nodefile K5MUTE=1 mkdir -p ${BASELOGDIR}/${logdate} >/dev/null 2>/dev/null
        # Find the nodes whose gpfs.snap is missing (their name shows up in
        # the error output) or differs (checksum other than ours).
        $mmdsh -F $nodefile K5MUTE=1 sum $SNAP/gpfs.snap >sumout 2>sumerr
        nnewlist=$($cat sumerr | $cut -f1 -d :)
        newlist2=$($cat sumout | $grep -v "$mysum" | $cut -f1 -d :)
        nnewlist="$nnewlist $newlist2"
        firstone=1
        $rm -f sumerr sumout $commaFile 2>/dev/null
        # Record those nodes twice: comma-separated in commalist for the
        # message, and one per line in $commaFile for $mmdsh.
        for i in $nnewlist
        do
          if [[ $firstone = 1 ]]
          then
            commalist="$i"
            print -- $i > $commaFile
            firstone=0
          else
            commalist="$commalist,$i"
            print -- $i >> $commaFile
          fi
        done
        if [[ -s $commaFile ]]
        then
          print "There is an outdated or no gpfs.snap in $SNAP on the following nodes:"
          print $commalist
          print "\nAttempting to copy . . .\n"
          if [[ $SNAP != "/usr/lpp/mmfs/bin" ]]
          then
            $mmdsh -F $commaFile K5MUTE=1 mkdir -p $SNAP 2>/dev/null
          fi
          # Each of those nodes fetches gpfs.snap from this node.
          $mmdsh -F $commaFile K5MUTE=1 $rcp $myhname:$spath $spath 2>tmperr

          if [[ -s tmperr ]]
          then
            # Nodes for which the copy produced error output are dropped.
            list=$($cat tmperr | $cut -f1 -d ":")
            $cat tmperr
            removefromlist "$list"
            print "copy failed for the following nodes:\n$list\nRemoving them from list." | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
            $cat $nodefile
          fi
          $rm tmperr 2>/dev/null
        fi
        if [[ $x_arg != 2 ]]
        then
          # Start gpfs.snap with "-x 1 -z" on the remaining nodes, in the
          # background; nothing in this function waits for it.
          # Question: What is the "node_args" variable for?
          $mmdsh -F $nodefile K5MUTE=1 "$SNAP/gpfs.snap $node_args -d ${BASELOGDIR}/$(hostname | cut -d. -f1)_${logdate} -x 1 -z" >$BASELOGDIR/pass1outfile &
        fi
      fi
    fi  # end of if [[ -n $node_list ]]
  fi  # end of if [[ -s $nodefile ]]

}  #----- end of function do_master_stuff ----------------------
1775 | |
---|
1776 | |
---|
#----------------------------------------------------------------------
# printAndExit <msgId> [<msgArg>]
#
# Report a command line error and terminate.  <msgId>, the command
# name "gpfs.snap" and <msgArg> are handed to printErrorMsg, the usage
# text held in $USAGE is printed, and the script exits with status 1.
#----------------------------------------------------------------------
function printAndExit
{
  if [[ -n $DEBUGgpfssnap || -n $DEBUGprintAndExit ]]
  then
    set -x
  fi

  printErrorMsg $1 gpfs.snap $2
  print "$USAGE"
  exit 1

}  #----- end of function printAndExit -------------------------
---|
1786 | |
---|
1787 | |
---|
#----------------------------------------------------------------------
# getCurrentStanzaList2 <outputFile>
#
# Write the "lsfs -c -v mmfs" listing of GPFS file systems into
# <outputFile>.  The output of the AIX lsfs -c command looks something
# like this:
#
#   #MountPoint:Device:Vfs:Nodename:Type:Size:Options:AutoMount:Acct
#   /gpfs/gpfsA:/dev/gpfsA:mmfs:-:mmfs:0:rw:no:no
#   /gpfs/gpfsB:/dev/gpfsB:mmfs:-:mmfs:0:rw:no:no
#
# Returns 0 when lsfs succeeds, or when it fails only with
# "unknown vfs type" (in that case the file holds just a header-like
# line).  Any other lsfs failure is echoed to stdout and its return
# code is passed back to the caller.
#----------------------------------------------------------------------
function getCurrentStanzaList2   # <outputFile>
{
  typeset sourceFile="gpfs.snap.sh"
  [[ -n $DEBUGgpfssnap || -n $DEBUGgetCurrentStanzaList2 ]] && set -x
  $mmTRACE_ENTER "$*"

  typeset stanzaOut=$1
  typeset lsfsRc=0

  $rm -f $stanzaOut

  # Run lsfs with tracing switched off, then restore tracing if any of
  # the debug switches is set.
  set +x
  LC_ALL=C $lsfs -c -v mmfs > $stanzaOut 2>&1
  lsfsRc=$?
  [[ -n $DEBUGgpfssnap || -n $DEBUGgetCurrentStanzaList2 ]] && set -x

  # Nothing more to do if lsfs was happy.
  [[ $lsfsRc -eq 0 ]] && return 0

  # 'not found' is acceptable: report success and leave a file that
  # contains an lsfs header-like line only.
  if $grep -q "unknown vfs type" $stanzaOut
  then
    print -- "#MountPoint:Device:Vfs:junk" > $stanzaOut
    return 0
  fi

  # Some other error: show the error messages and fail.
  $cat $stanzaOut
  return $lsfsRc

}  #----- end of function getCurrentStanzaList2 ----------------
---|
1830 | |
---|
1831 | |
---|
1832 | |
---|
1833 | ############################################# |
---|
1834 | # Mainline processing MAIN main |
---|
1835 | ############################################# |
---|
1836 | |
---|
# Keep a copy of the command line arguments as given.
args=$@

# Month names and per-month day counts; element 0 is a filler so that
# January is element 1.
set -A months Yam Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
set -A days   0   31  29  31  30  31  30  31  31  30  31  30  31

# Fixed execution environment: search path and locale.
export K5MUTE=1
GPFSDIR=/usr/lpp/mmfs/bin
export PATH=/bin:/usr/bin:/etc:/usr/sbin:/sbin:$GPFSDIR

export LANG=en_US LC_MESSAGES=C LC_TIME=C

# Work out the absolute path of this script (spath) and the directory
# it lives in (SNAP).  A $0 that does not start with "/" is taken to be
# relative to the current directory.
pwd=$(pwd)
result=$(echo $0 | $grep "^"/)
case $result in
  "") spath=${pwd}/$0 ;;
  *)  spath=$0 ;;
esac
SNAP=$(dirname $spath)

# Counters used by the collection passes.
pass=1
total_bytes=0
max_bytes=0

# Default output directory; leftovers from an earlier run are removed.
BASELOGDIR=/tmp/gpfs.snapOut
$mkdir ${BASELOGDIR} 2>/dev/null
logdate=$(date +%m%d%H%M)
my_hostname=$($hostname | $cut -d. -f1)
$rm -rf ${BASELOGDIR}/gpfs.snap_err.*.out ${BASELOGDIR}/gpfs.snap_info.*.out 2>/dev/null
$rm ${BASELOGDIR}/problem.${my_hostname} 2>/dev/null

YAMO=/tmp/yamo
ODMDIR=/etc/objrepos
SPENV=0
os=$($uname)

# rasDir: the last "logDir" value found in the $mmfscfg file (with a
# trailing slash), or /var/adm/ras when there is none.
if [[ -f $mmfscfg ]]; then
  logDir=$($awk '$1 == "logDir" {value = $2} END {print value}' $mmfscfg)
  if [[ -n $logDir ]]; then
    rasDir="${logDir}/"
  fi
fi
rasDir=${rasDir:-/var/adm/ras}


# Operating system level.  On AIX also find out whether the ssp.basic
# fileset is installed (SP environment).
case $os in
  AIX)
    alevel=$($lslpp -L bos.rte | $grep bos.rte | $awk '{print $2}')
    sp_version=$($lslpp -Lc ssp.basic 2>/dev/null | $grep ssp.basic | $cut -f3 -d :)
    if [[ -n $sp_version ]]; then
      sp_version=${sp_version%.#}
      SPENV=1
      my_node_number=$(/usr/lpp/ssp/install/bin/node_number)
      PATH=$PATH:/usr/lpp/ssp/bin:/usr/lpp/csd/bin
    fi
    ;;
  *)
    alevel=$($uname -rv)
    ;;
esac

myhname=$($hostname)

# Note whether vmstat is present; the collection loop checks this flag.
[[ -a /usr/bin/vmstat ]] && gotvmstat=1
---|
1899 | |
---|
# gpfs.snap [-c "CmdString"] [-d OutputDirectory] [-p] [-x {1 | 2}] [-y | -z]
#           [-a | -W NodeFilename | -w NodeName[,NodeName...] |
#            -n NodeNumber[,NodeNumber...]]
USAGE=\
"Usage:\n"\
" gpfs.snap [-c \"CmdString\"] [-d OutputDirectory] [-p] [-x {1 | 2}] [-y | -z]\n"\
" [ -a | -W NodeFilename | -w NodeName[,NodeName...] | -n NodeNumber[,NodeNumber...]]"

# Collect "master" data unless -z is given (the -z option resets this).
master=1

# Help request: show the usage text and leave.
# NOTE(review): arg1 and ARG1 are not assigned in this part of the
# file; presumably they are set from $1 by code that runs earlier
# (ARG1 looks like an upper-cased copy) - confirm.
if [[ $arg1 = '-?' || $ARG1 = '-H' || $ARG1 = '--HELP' || $arg1 = '--' ]]
then
  # USAGE must be quoted: it contains bracketed text such as [-p] that
  # would otherwise be word-split and subject to filename expansion.
  print "$USAGE"
  exit 1
fi
---|
1915 | |
---|
########################################################################
# Parse the command line options.
#
# Node selection options are checked against each other as they are
# seen: -a conflicts with -n, -w and -W; -W conflicts with -a, -n and
# -w; -n and -w conflict with -a and -W but may be combined with each
# other.  A conflict is reported through printErrorMsg 191, followed by
# the usage text and exit status 1.  A repeated -a, -c, -n, -w or -W is
# reported through printAndExit 36.
########################################################################
while getopts :ac:d:Dn:pw:W:x:yz OPT
do
  case $OPT in

    a) # Collect data on all nodes.
       [[ -n $aflag ]] && printAndExit 36 "-$OPT"
       aflag="-$OPT"
       all="all"
       if [[ -n $nflag || -n $wflag || -n $Wflag ]]
       then
         [[ -n $nflag ]] && printErrorMsg 191 gpfs.snap "-a" "-n"
         [[ -n $wflag ]] && printErrorMsg 191 gpfs.snap "-a" "-w"
         [[ -n $Wflag ]] && printErrorMsg 191 gpfs.snap "-a" "-W"
         print "$USAGE"
         exit 1
       fi
       ;;

    c) # Command string to run on the selected nodes.
       [[ -n $cflag ]] && printAndExit 36 "-$OPT"
       cflag="-$OPT"
       cmdString="$OPTARG"
       ;;

    d) # Alternate output directory.  The mkdir runs in the background
       # and is watched by waitforit; status 1 from waitforit is
       # treated as a timeout.
       d_argument=$OPTARG
       $mkdir -p $d_argument 2>/dev/null &
       waitforit
       if [[ $? = 1 ]]
       then
         print -u2 "Write to $d_argument timed out. Choose another directory or take the default (/tmp)"
         exit 1
       fi
       BASELOGDIR=$d_argument
       $rm -rf ${BASELOGDIR}/gpfs.snap_err.*.out 2>/dev/null
       $rm -rf ${BASELOGDIR}/gpfs.snap_info.*.out 2>/dev/null
       $rm ${BASELOGDIR}/problem.${my_hostname} 2>/dev/null
       # Carry over an error log that may already have been started in
       # the default directory.  (The source path used to have the new
       # BASELOGDIR spliced into it, which never named that file.)
       $mv /tmp/gpfs.snapOut/gpfs.snap_err.${logdate}.out ${BASELOGDIR} 2>/dev/null
       ;;

    D) # Debug: trace the script, with stderr sent to a fixed file.
       DEBUGgpfssnap=1
       exec 2>/tmp/gpfs.snap.debug
       set -x
       echo "Writing debug data and redirecting stderr to /tmp/gpfs.snap.debug" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
       ;;

    n) # List of node numbers.
       [[ -n $nflag ]] && printAndExit 36 "-$OPT"
       nflag="-$OPT"
       nodenums="$OPTARG"
       if [[ -n $aflag || -n $Wflag ]]
       then
         [[ -n $aflag ]] && printErrorMsg 191 gpfs.snap "-n" "-a"
         [[ -n $Wflag ]] && printErrorMsg 191 gpfs.snap "-n" "-W"
         print "$USAGE"
         exit 1
       fi
       ;;

    p) # Skip the problem determination sequence.
       pflag=1
       ;;

    w) # List of node names.
       [[ -n $wflag ]] && printAndExit 36 "-$OPT"
       wflag="-$OPT"
       nodenames="$OPTARG"
       # The second test must look at the value of Wflag; testing the
       # bare word "Wflag" is always true and rejected every -w.
       if [[ -n $aflag || -n $Wflag ]]
       then
         [[ -n $aflag ]] && printErrorMsg 191 gpfs.snap "-w" "-a"
         [[ -n $Wflag ]] && printErrorMsg 191 gpfs.snap "-w" "-W"
         print "$USAGE"
         exit 1
       fi
       ;;

    W) # File containing node names.
       [[ -n $Wflag ]] && printAndExit 36 "-$OPT"
       Wflag="-$OPT"
       wcoll="$OPTARG"
       if [[ -n $aflag || -n $nflag || -n $wflag ]]
       then
         [[ -n $aflag ]] && printErrorMsg 191 gpfs.snap "-W" "-a"
         [[ -n $nflag ]] && printErrorMsg 191 gpfs.snap "-W" "-n"
         [[ -n $wflag ]] && printErrorMsg 191 gpfs.snap "-W" "-w"
         print "$USAGE"
         exit 1
       fi
       ;;

    x) # 1 = space check only, 2 = collect without the space check.
       xflag=1
       x_arg=$OPTARG
       if [[ $x_arg != 1 ]] && [[ $x_arg != 2 ]]
       then
         print "Illegal argument to option x: $x_arg"
         print "$USAGE"
         exit 1
       fi
       ;;

    y) # Mutually exclusive with -z.
       yflag=1
       if [[ $zflag = 1 ]]
       then
         printErrorMsg 191 gpfs.snap y z
         print "$USAGE"
         exit 1
       fi
       ;;

    z) # Local node only, no "master" data.  Mutually exclusive with -y.
       zflag=1
       if [[ $yflag = 1 ]]
       then
         printErrorMsg 191 gpfs.snap y z
         print "$USAGE"
         exit 1
       fi
       master=0
       ;;

    :) # An option that needs an argument was given without one.
       printAndExit 204 $OPTARG
       ;;

    +[acdDnpwWxyz])
       # Option introduced with "+" instead of "-".
       printAndExit 13 "$OPT"
       ;;

    *) # Anything else is an invalid option.
       printAndExit 13 $OPTARG
       ;;

  esac
done  # end of while getopts do

# Nothing may follow the options.
shift OPTIND-1
[[ $# != 0 ]] && printAndExit 38 $1

# If no node selection option was specified, default to -a.
[[ -z $aflag && -z $nflag && -z $wflag && -z $Wflag ]] && \
  aflag="-a"
---|
2047 | |
---|
2048 | |
---|
########################################################################
# Install the signal handler and call the gpfsInit function.
# It will ensure that the local copy of the mmsdrfs and the rest of the
# GPFS system files are up-to-date.  There is no need to lock the sdr.
########################################################################
trap pretrap2 HUP INT QUIT KILL
gpfsInitOutput=$(gpfsInit nolock)
setGlobalVar $? $gpfsInitOutput


######################################################
# Build $nodefile: one affected node name per line.
######################################################
$rm -f $nodefile 2>/dev/null
$touch -f $nodefile

if [[ -n $aflag ]]; then
  # -a: every node that belongs to the cluster.
  getNodeList $REL_HOSTNAME_Field $GLOBAL_ID $mmsdrfsFile > $nodefile

  # Without any nodes there is nothing to do.
  if [[ ! -s $nodefile ]]; then
    print -u2 "$mmcmd: There are no known GPFS nodes."
    exit 1
  fi

elif [[ -n $Wflag ]]; then
  # -W: the input must be a regular, readable file.
  if [[ ! ( -f $wcoll && -r $wcoll ) ]]; then
    printErrorMsg 43 $mmcmd $wcoll
    exit 1
  fi

  # Drop comment lines and lines that mention localhost.
  $grep -v -e "localhost" -e "^#" "$wcoll" > $nodefile
  if [[ ! -s $nodefile ]]; then
    # Nothing usable was left in the file.
    printErrorMsg 328 $mmcmd $wcoll
    exit 1
  fi

else
  # -w and/or -n.

  # Node names: split the comma list and append one name per line.
  for i in $(print $nodenames | $tr "," " ")
  do
    print $i
  done >> $nodefile

  # Node numbers: translate each one to a node name and append it.
  # Numbers that are not found in the cluster are reported and skipped.
  for i in $(print $nodenums | $tr "," " ")
  do
    nodeName=$(getNodeInfo \
      $REL_HOSTNAME_Field $NODE_NUMBER_Field $i $GLOBAL_ID $mmsdrfsFile)
    if [[ -z $nodeName ]]; then
      printErrorMsg 352 $mmcmd $i
    else
      print $nodeName >> $nodefile
    fi
  done

  # Give up if something was asked for but nothing made it to the file.
  if [[ ! -s $nodefile && ( -n $nodenames || -n $nodenums ) ]]; then
    exit 1
  fi

fi  # end of if [[ -n $aflag ]]
---|
2121 | |
---|
# Name of this node's output subdirectory.  A master run that is not a
# -c run calls checklist first and gets ".master" in the name.
if [[ $master = 1 && -z $cflag ]]; then
  checklist
  SUBDIR=${my_hostname}.master.${logdate}
else
  SUBDIR=${my_hostname}.${logdate}
fi
tarfile=${BASELOGDIR}/gpfs.snap.${SUBDIR}.out.tar
LOGDIR=${BASELOGDIR}/${SUBDIR}

# Start-up banner (not shown for -c runs).
if [[ -z $cflag ]]; then
  echo "$SNAP/gpfs.snap version $VERSION started at $starttime with args:\n$args\n" | $tee -a ${BASELOGDIR}/gpfs.snap_info.${logdate}.out
  echo "My process id is $$"
fi

mypgid=$($ps -p $$ -o pgid=PGID | $tail -n -1)

# Check how full the file system holding the output directory is.  Only
# the first component of BASELOGDIR is handed to df.  The percentage is
# taken from column 4 of the last df line on AIX and from column 5
# elsewhere.  df runs in the background under the control of waitforit,
# whose status 1 is treated as a timeout.
basedir=/$(echo $BASELOGDIR | $cut -f2 -d "/")
case $os in
  AIX) dfPctField='{print $4}' ;;
  *)   dfPctField='{print $5}' ;;
esac
$df $basedir | $tail -n -1 | $awk "$dfPctField" >/tmp/yamo &
waitforit "$df $basedir" 60
if [[ $? = 1 ]]; then
  print -u2 "df on $basedir timed out. Solve the problem with $basedir or specify a different directory with -d."
  exit 1
fi

per=$($cat /tmp/yamo)
if [[ $per = 100% ]]; then
  print -u2 "$basedir is 100% full. Specify a different directory with -d or clear space."
  exit 1
fi

# From here on the output directory is the working directory.
$mkdir -p ${LOGDIR}
cd $BASELOGDIR

mmsdrfsfile=/var/mmfs/gen/mmsdrfs

# A node without an mmsdrfs file is not part of a GPFS cluster:
# say so and finish with status 0.
if [[ ! -f $mmsdrfsfile ]]; then
  print -u2 "The node does not belong to a GPFS cluster ($mmsdrfsfile does not exist). Exiting."
  return 0
fi
---|
2171 | |
---|
# Establish the identity of the local node: its GPFS node number and
# name (as set by getLocalNodeData) and the nodeset it belongs to.
determineMode
getLocalNodeData
mygnum=$ourNodeNumber
mygname=$ourNodeName

mynodeset=$(findNodesetId $mmsdrfsfile $mygnum)

# For the "%%home%%" nodeset, report the name found on the matching
# clusterName line of the mmsdrfs file instead.
if [[ $mynodeset = "%%home%%" ]]
then
  mynodeset2=$($grep clusterName $mmsdrfsfile | $grep %%home%% | $cut -f2 -d " " | $cut -f1 -d ":")
else
  mynodeset2=$mynodeset
fi

# Fallback when getLocalNodeData produced no node name: walk the
# MEMBER_NODE lines of the mmsdrfs file, resolve the host name in
# field 8 of each line with ping, and take the first line whose address
# is configured on one of the local network interfaces.
if [[ -z $mygname ]]
then
  # The local interface list is the same for every candidate line, so
  # it is collected once.
  ilist=$(netstat -i | $awk '{print $1}' | $grep -v -E "Iface|Kernel|Name")

  $grep MEMBER_NODE $mmsdrfsfile >/tmp/mmsdrfs2.tmp
  while read line
  do
    rhname=$(echo $line | $cut -f8 -d ":")
    # The address is the text between the first pair of parentheses on
    # the first line printed by ping.
    addr=$($ping -c1 -w5 $rhname | $head -n 1 | $cut -f2 -d "(" | $cut -f1 -d ")")

    # No output from ping (for instance an unresolvable name):
    # there is nothing to compare, try the next line.
    [[ -z $addr ]] && continue

    for i in $ilist
    do
      # Compare as a fixed string and as a whole word, so that the dots
      # are not regex wildcards and 10.0.0.1 does not match 10.0.0.10.
      if $ifconfig $i | $grep -F -w -e "$addr" >/dev/null
      then
        mynodeset=$(echo $line | $cut -f1 -d :)
        mygname=$(echo $line | $cut -f6 -d :)
        mygnum=$(echo $line | $cut -f5 -d :)
        break
      fi
    done

    # Stop at the first line that identified this node.
    [[ -n $mygname ]] && break
  done < /tmp/mmsdrfs2.tmp
  $rm /tmp/mmsdrfs2.tmp 2>/dev/null
fi  # end of if [[ -z $mygname ]]
---|
2214 | |
---|
# Informational messages below go to the terminal and to this log.
# They are suppressed for -c runs.
infoLog=${BASELOGDIR}/gpfs.snap_info.${logdate}.out

# AIX: the installed GPFS level is the third colon-separated field of
# the lslpp -Lc line for gpfs.base or, failing that, mmfs.base.rte.
if [[ $os = "AIX" ]]; then
  if [[ -z $cflag ]]; then
    echo "I am hostname $myhname running AIX level $alevel" | $tee -a $infoLog
  fi
  for fileset in gpfs.base mmfs.base.rte
  do
    gpfs_version=$($lslpp -Lc $fileset 2>/dev/null | $grep $fileset | $cut -f3 -d :)
    gpfs_version=${gpfs_version%.#}
    [[ -n $gpfs_version ]] && break
  done
fi  # end of if [[ $os = "AIX" ]]

if [[ $SPENV = 1 && -z $cflag ]]; then
  echo "I am SP node $my_node_number running $sp_version" | $tee -a $infoLog
fi

# Linux: fields 2 and 3 of the dash-separated rpm -q output.
if [[ $os = "Linux" ]]; then
  if [[ -z $cflag ]]; then
    echo "I am $myhname running Linux level $alevel" | $tee -a $infoLog
  fi
  gpfs_version=$($rpm -q gpfs.base | $awk 'BEGIN{FS="-"} {print $2"-"$3}')
fi

# No level found: remove the output subdirectory and stop.
if [[ -z $gpfs_version ]]; then
  print -u2 "\nGPFS does not appear to be installed on this machine."
  $rm -r ${LOGDIR} 2>/dev/null
  exit 1
fi

rel1=$(echo $gpfs_version | $cut -f1 -d ".")
rel2=$(echo $gpfs_version | $cut -f2 -d ".")

# The grouping is called "cluster" from level 2.3 on, "nodeset" before.
groupname="nodeset"
if [[ $rel1 -gt 2 || ( $rel1 -eq 2 && $rel2 -ge 3 ) ]]; then
  groupname="cluster"
fi

if [[ -z $cflag ]]; then
  echo "I am gpfs node $mygname number $mygnum in $groupname $mynodeset2 running GPFS version $gpfs_version" | $tee -a $infoLog
fi

# Dump directory: the first dataStructureDump entry of mmfs.cfg,
# or /tmp/mmfs when there is none.
dumpdir=$($cat /var/mmfs/etc/mmfs.cfg | $grep "^dataStructureDump " | $head -1 | $awk '{print $2}')
dumpdir=${dumpdir:-/tmp/mmfs}

# -c "CmdString": run the command string on the selected nodes through
# mmdsh, remove the temporary files, and exit early.  None of the
# standard data is collected.
if [[ -n $cflag ]]; then
  $mmdsh -F $nodefile K5MUTE=1 ksh -c \"PATH=$PATH $cmdString\" 2>/dev/null
  $rm -r ${LOGDIR} 2>/dev/null
  $rm -f $nodefile 2>/dev/null
  $rm -f $LOCAL_FILES 2>/dev/null
  $rm /tmp/hostfile 2>/dev/null
  exit 0
fi  # end of if [[ -n $cflag ]]
---|
2278 | |
---|
# Master-only preparation (skipped entirely when -z was given).
if [[ $master = 1 ]]; then

  # Problem determination sequence, unless -p asked to skip it.
  if [[ $pflag != 1 ]]; then
    print "Checking configuration files . . ."
    nlist2=$($cat $nodefile2)
    check_fs $nodefile2 "$nlist2"
    # Keep the values check_fs left in trcFile and tglist.
    brcFile=$trcFile
    bglist=$tglist

    check_files /var/mmfs/gen/mmsdrfs $nodefile2

    print "Checking for waiters . . ."
    check_waiters

    # Build the list of file system devices: the second colon-separated
    # field of every stanza line, minus the leading "Device" header.
    case $os in
      Linux)
        set +x
        getCurrentStanzaList stanzafile
        [[ $DEBUGgpfssnap = 1 ]] && set -x
        ;;
      *)
        getCurrentStanzaList2 stanzafile
        ;;
    esac
    fslist=$($cat stanzafile | $cut -f2 -d :)
    fslist=${fslist#Device}
    $rm stanzafile 2>/dev/null
  fi  # end of if [[ $pflag != 1 ]]

  do_master_stuff

  # Copy the node names still present in $nodefile into $nodefilecFile
  # (one per line) and into the blank-separated nodefilelist.
  firstone=1
  $rm -f $nodefilecFile
  list=$($cat $nodefile)
  for i in $list
  do
    case $firstone in
      1)
        nodefilelist=$i
        print -- $i > $nodefilecFile
        firstone=0
        ;;
      *)
        nodefilelist="$nodefilelist $i"
        print -- $i >> $nodefilecFile
        ;;
    esac
  done

  check_fs $nodefilecFile "$nodefilelist" "$hostarray"
  # Keep the values check_fs left in trlist, trcFile and tglist.
  rlist=$trlist
  rcFile=$trcFile
  glist=$tglist

fi  # end of if [[ $master = 1 ]]

all="-1"
check_dumps internaldump $all $dumpdir
check_dumps trcrpt $all "/tmp/mmfs"

# -x 2 goes straight to the second pass; otherwise the first pass is
# announced.
case $x_arg in
  2) pass=2 ;;
  *) print "\nDetermining whether there is enough space in ${BASELOGDIR} . . .\n" ;;
esac
---|
2341 | |
---|
2342 | #export mmdshCommandsFile=${BASELOGDIR}/commandfile |
---|
2343 | |
---|
# Collection loop, run once per pass.  In pass 1 file sizes are handed
# to addit and check_space is called at the end; in pass 2 the files
# are handed to tarit.  The same sequence of helpers runs in both
# passes.
while [[ $pass -le 2 ]]; do
  print "Processing log files . . ."
  get_files

  # Master, pass 2: gather the problem.* files and start the second
  # pass on the other nodes in the background.
  if [[ $master = 1 && $pass = 2 ]]; then
    if [[ -s $rcFile ]]; then
      $mmdsh -F $rcFile K5MUTE=1 cat ${BASELOGDIR}/${logdate}/problem.\* 2>/dev/null | tee -a ${BASELOGDIR}/problem.${my_hostname}
    fi
    for i in $glist
    do
      $cat ${BASELOGDIR}/${logdate}/problem.$i 2>/dev/null | $tee -a ${BASELOGDIR}/problem.${my_hostname}
    done
    # Question: What is the "node_args" variable for?
    $mmdsh -F $nodefile K5MUTE=1 $SNAP/gpfs.snap $node_args -d ${BASELOGDIR}/$(hostname | cut -d. -f1)_${logdate} -x 2 -z >/dev/null 2>/dev/null &
  fi

  # Pass 1: start a background vmstat sample when vmstat is available.
  if [[ $pass = 1 && $gotvmstat = 1 ]]; then
    ksh -c "vmstat 5 5" > ${LOGDIR}/vmstat_5_5 &
  fi

  if [[ -s ${LOGDIR}/long_waiters.sorted ]]; then
    case $pass in
      1)
        size=$($ls -l ${LOGDIR}/long_waiters.sorted | $awk '{print $5}')
        addit $size "${LOGDIR}/long_waiters.sorted"
        ;;
      *)
        tarit long_waiters.sorted
        ;;
    esac
  fi

  if [[ $master = 1 && $pflag != 1 ]]; then
    doit "dump_list" "$mmdsh -F $nodefile2 ls -l $dumpdir/internaldump\* 2>/dev/null"

    print "Processing waiters . . ."
    check_waiters2
    print "Processing configuration files . . ."
    check_files2
  fi  # end of if [[ $master = 1 && $pflag != 1 ]]

  print "Running mm commands . . ."
  if [[ $master = 1 || $yflag = 1 ]]; then
    doit "mmlsconfig"   "/usr/lpp/mmfs/bin/mmlsconfig"
    doit "mmlsmgr"      "/usr/lpp/mmfs/bin/mmlsmgr"
    doit "mmlsnode_a"   "/usr/lpp/mmfs/bin/mmlsnode -a"
    doit "mmgetstate_a" "/usr/lpp/mmfs/bin/mmgetstate -a"
    doit "tsstatus"     "tsstatus"
    # need full pathname for some of these for waitforit to handle properly
    for i in $fslist
    do
      doit "mmdf"         "/usr/lpp/mmfs/bin/mmdf $i -q" 1
      doit "mmlsfs"       "/usr/lpp/mmfs/bin/mmlsfs $i" 1
      doit "mmlsdisk"     "/usr/lpp/mmfs/bin/mmlsdisk $i -L" 1
      doit "mmlspolicy"   "/usr/lpp/mmfs/bin/mmlspolicy $i" 1
      doit "mmlspolicy"   "/usr/lpp/mmfs/bin/mmlspolicy $i -L" 1
      doit "mmlsfileset"  "/usr/lpp/mmfs/bin/mmlsfileset $i" 1
      doit "mmlsfileset"  "/usr/lpp/mmfs/bin/mmlsfileset $i -L" 1
      doit "mmlssnapshot" "/usr/lpp/mmfs/bin/mmlssnapshot $i -d -Q" 1
    done
    doit "mmlscluster"     "mmlscluster"
    doit "mmlsnsd"         "mmlsnsd -L" 1
    doit "mmlsnsd"         "mmlsnsd -X"
    doit "mmremotecluster" "mmremotecluster show all"
    doit "mmremotefs"      "mmremotefs show all"
    doit "mmauth"          "mmauth show"
  fi  # end of if [[ $master = 1 || $yflag = 1 ]]

  # Be careful not to dump live data that may assert or segfault.
  # We can always ask for additional data later.
  # Every dump but the last is passed to doit with a third argument
  # of 1.
  print "Processing dumps . . ."
  for dumpTopic in version waiters cfgmgr tscomm config mutex sgmgr stripe \
                   malloc fs mmap nsd disk "alloc stats" "alloc hist" \
                   "dealloc stats" allocmgr "allocmgr stats"
  do
    doit "mmfsadm_dump_some" "mmfsadm dump $dumpTopic" 1
  done
  doit "mmfsadm_dump_some" "mmfsadm dump allocmgr hist"

  print "Processing common files . . ."
  get_always
  print "Processing network info . . ."
  get_net_stuff
  print "Processing lvm info . . ."
  get_lvm_stuff

  # AIX: include the file reported by lscons, if it is a regular file.
  if [[ $os = "AIX" ]]; then
    console=$(/usr/sbin/lscons)
    if [[ -f $console ]]; then
      case $pass in
        1)
          temp_bytes=$($ls -l $console | $awk '{ print $5 }')
          addit $temp_bytes "$console"
          ;;
        *)
          $cp $console ${LOGDIR}/console
          tarit "console"
          ;;
      esac
    fi
  fi  # end of if [[ $os = "AIX" ]]

  print "Processing miscellaneous files . . ."
  get_files_list "/etc" "fstab filesystems trcfmt syslog.conf"
  for cfgDir in /var/mmfs/etc /var/mmfs/gen /var/mmfs/ssl /var/mmfs/ssl/stage /var/mmfs/tmp
  do
    get_files_dir "$cfgDir"
  done
  get_files_list "$dumpdir" "$internal_list" internaldumps

  # System message logs.
  savedir=$(pwd)
  cd /var/log 2>/dev/null
  mlist=$($ls messages* 2>/dev/null)
  cd $savedir
  if [[ -n $mlist ]]; then
    get_files_list "/var/log" "$mlist"
  fi

  # Whichever of these files exist non-empty in /usr/lpp/mmfs/bin.
  mlist=""
  for binName in mmfslinux mmfs26 mmfs24 mmfs
  do
    [[ -s /usr/lpp/mmfs/bin/$binName ]] && mlist="$mlist $binName"
  done
  if [[ -n $mlist ]]; then
    get_files_list "/usr/lpp/mmfs/bin" "$mlist"
  fi

  # Get info for whatever group services/topology services pairs are running.
  if [[ $SPENV = 1 ]]; then
    if [[ $my_node_number = 0 ]]; then
      syspar=$(/usr/lpp/ssp/bin/spget_syspar -n)
      syspar=".$syspar"
    fi
    doit "lssrc_rvsd" "lssrc -g rvsd" 1
    doit "lsvsd_l" "lsvsd -l"
    if [[ -s ./${SUBDIR}/mmsdrfs2 ]]; then
      case $pass in
        1)
          temp_bytes=$($ls -l ./${SUBDIR}/mmsdrfs2 | $awk '{ print $5 }')
          addit $temp_bytes "mmsdrfs2"
          ;;
        *)
          tarit "mmsdrfs2"
          ;;
      esac
    fi
  fi  # end of if [[ $SPENV = 1 ]]

  # The master waits for the background work started for this pass.
  if [[ $master = 1 ]]; then
    case $pass in
      1) print "Waiting for remote nodes to report space requirements . . ." ;;
      *) print "Waiting for remote nodes to collect data . . ." ;;
    esac
    wait
  fi

  case $pass in
    1)
      # Allow for the vmstat sample, then check the space.
      if [[ $gotvmstat = 1 ]]; then
        (( tmpval = $( vmstat | $wc -c ) * 5 ))
        addit $tmpval
      fi
      check_space
      print "It appears we have enough space.\n"

      # -x 1: the space check was all that was wanted; clean up and
      # leave.
      if [[ $x_arg = 1 ]]; then
        $rm -r ${LOGDIR} 2>/dev/null
        if [[ $master = 1 ]]; then
          $rm -r $NODESDIR 2>/dev/null
        fi
        exit 0
      fi
      ;;
    2)
      [[ $gotvmstat = 1 ]] && tarit "vmstat_5_5"
      for tarMember in mmdf mmlsdisk mmlsfs mmlspolicy mmlsfileset mmlssnapshot
      do
        tarit "$tarMember"
      done
      ;;
  esac

  pass=$((pass + 1))

done  # end of while [[ $pass -le 2 ]] do
---|
2555 | |
---|
2556 | |
---|
# Append the error log to the tar file, but only if something was written
# to it; the copy under SUBDIR is the one that gets archived.
snap_err_log=gpfs.snap_err.${logdate}.out
if [[ -s $snap_err_log ]]; then
  $cp $snap_err_log ${SUBDIR}/$snap_err_log
  tar -rf $tarfile ${SUBDIR}/$snap_err_log
fi

# The scratch error file is no longer needed; it may not exist.
$rm gpfs.snap_err.out.temp 2>/dev/null
---|
2563 | |
---|
# Handle this node's problem.<hostname> file: a non-empty file is copied
# under SUBDIR and that copy is appended to the tar file, the same way the
# error and info logs are archived; an empty file is simply removed.
if [[ -s problem.${my_hostname} ]]; then
  $cp problem.${my_hostname} ${SUBDIR}/problem.${my_hostname}
  # Archive the SUBDIR copy so the member path matches the other logs.
  tar -rf $tarfile ${SUBDIR}/problem.${my_hostname}
elif [[ -a problem.${my_hostname} ]]; then
  $rm problem.${my_hostname} 2>/dev/null
fi # end of problem.${my_hostname} handling
---|
2574 | |
---|
# Record in the info log (under BASELOGDIR) that the run is nearly done.
endtime=$(date)
echo "gpfs.snap near completion at $endtime" >> ${BASELOGDIR}/gpfs.snap_info.${logdate}.out

# If the info log is present in the current directory, copy it under SUBDIR
# and append that copy to the tar file.
snap_info_log=gpfs.snap_info.${logdate}.out
if [[ -a $snap_info_log ]]; then
  $cp $snap_info_log ${SUBDIR}/$snap_info_log
  tar -rf $tarfile ${SUBDIR}/$snap_info_log
fi
---|
2583 | |
---|
# Compress this node's tar file and remember the resulting suffix in suff.
# compress is used only when /bin/compress is actually executable (a mere
# existence test would pick it even when it cannot be run); otherwise gzip
# is used. The operand is quoted so an empty tarfile produces an error
# instead of making the compressor read standard input.
if [[ -x /bin/compress ]]; then
  gotcompress=1
  compress "$tarfile"
  suff="Z"
else
  gzip "$tarfile"
  suff="gz"
fi
---|
2593 | |
---|
# Produce the final deliverable. On the master node the local compressed
# snap is wrapped in the big tar file and the remote nodes' snaps are added
# to it; on any other node the local compressed tar file is the deliverable.
if [[ $master = 1 ]]; then
  # Start the big tar file with the local compressed snap (referenced by
  # base name), then delete the stand-alone compressed file.
  basetar=$(basename ${tarfile})
  tar -cf $bigtarfile ${basetar}.${suff}
  $rm ${tarfile}.${suff}

  # Let any remaining background jobs finish first.
  wait

  if [[ -s $nodefile ]]; then
    print "Getting snaps from remote nodes . . ."

    # The globs are backslash-escaped so they reach mmdsh unexpanded.
    # NOTE(review): $(hostname | cut ...) is NOT escaped, so it is expanded
    # on this node before mmdsh runs -- confirm that is intended.
    if [[ -s $rcFile ]]; then
      $mmdsh -F $rcFile K5MUTE=1 $rcp ${BASELOGDIR}/$(hostname | cut -d. -f1)_${logdate}/gpfs.snap.\*.out.tar.\* $myhname:${BASELOGDIR}
    fi

    # Show and log the contents of rcperr when it is non-empty.
    if [[ -s rcperr ]]; then
      print "The following nodes had trouble sending the snap file:" | $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out
      $tee -a ${BASELOGDIR}/gpfs.snap_err.${logdate}.out < rcperr
    fi
    $rm rcperr 2>/dev/null

    # For each entry of glist, take the matching ghostarray element, reduce
    # it to its short host name, move that node's snap files up into
    # BASELOGDIR and remove the per-node directory.
    g=0
    for i in $glist; do
      short=$(echo ${ghostarray[$g]} | $cut -f1 -d .)
      $mv ${BASELOGDIR}/${short}_${logdate}/gpfs.snap.${short}.*.out.tar.* ${BASELOGDIR}
      $rm -r ${BASELOGDIR}/${short}_${logdate}
      (( g += 1 ))
    done

    # Append whatever snap files are now in the current directory to the
    # big tar file, then delete them.
    tarlist=$($ls gpfs.snap.*.out.tar.*)
    [[ -n $tarlist ]] && tar -rf $bigtarfile $tarlist
    $rm $tarlist 2>/dev/null
  fi

  snap_result=${bigtarfile}
else
  snap_result=${tarfile}.${suff}
fi # end of if [[ $master = 1 ]]

# Tell the user which file to send in.
print "###############################################################################"
print "Send file ${snap_result} to IBM Service"
---|
2638 | |
---|
# Remove temporary files. Errors are discarded because any of these may
# already be gone or may never have been created.
$rm -r ${LOGDIR} ${NODESDIR} 2>/dev/null
$rm ${BASELOGDIR}/*waiters /tmp/hostfile 2>/dev/null
$rm -f $nodefile $LOCAL_FILES 2>/dev/null

# Report the completion time and finish successfully.
endtime=$(date)
echo "gpfs.snap completed at $endtime"
exit 0
---|
2651 | |
---|