# prompt
# When no job name is passed as $1, print the usage text followed by the
# names extracted from `hadoop dfs -ls`, then leave with status 9.
if [ -z "$1" ]; then
  printf '%s\n' \
    "Usage : fix <JOB_NAME>" \
    " where JOB_NAME is one of: " \
    "==========="
  # Rows containing "crawler" are kept; column 8 of each row is split on "/"
  # and its 4th piece is what gets shown.
  job_names=$(/opt/crawlzilla/nutch/bin/hadoop dfs -ls \
    | grep crawler \
    | awk '{print $8}' \
    | cut -d "/" -f 4)
  echo "$job_names"
  echo "==========="
  exit 9
fi
| 106 | |
# begin
# Job-wide settings: the job name comes from the first positional argument,
# progress labels are appended to LOGFILE, and META_PATH is the base of the
# per-job metadata paths built below.

JNAME=$1
LOGFILE=~/crawlzilla/debug_fix.log
META_PATH=/home/crawler/crawlzilla/.tmp

### not tested yet -- none of these three is read in the visible part of
### this script
JPID="${META_PATH}/${JNAME}/${JNAME}_count_pid" # go.sh still needs to record its own pid
JDEPTH="${META_PATH}/${JNAME}/${JNAME}xxx"      # placeholder suffix; go.sh needs fixing first
JPTIME="${META_PATH}/${JNAME}/${JNAME}PassTime"
### not tested yet


# Record when this run started. The redirect target is quoted so that a
# path containing whitespace does not trigger an "ambiguous redirect".
DATE=$(date)
echo "$JNAME BEGINE at $DATE" >> "$LOGFILE"
| 122 | |
# Steps 1-3: run nutch invertlinks, index and dedup for job $JNAME on the
# paths below /user/crawler/$JNAME. Each step's label is appended to
# $LOGFILE before the step runs; a failing step prints the error banner and
# exits with status 8.

echo "1 invertlinks" >> "$LOGFILE"
/opt/crawlzilla/nutch/bin/nutch invertlinks "/user/crawler/$JNAME/linkdb" \
  -dir "/user/crawler/$JNAME/segments/" || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}

echo "2 index" >> "$LOGFILE"
# Keep column 8 of every listing row that mentions "segments".
SEGS=$(/opt/crawlzilla/nutch/bin/hadoop dfs -ls "/user/crawler/$JNAME/segments" \
  | grep segments \
  | awk '{print $8}')
# An empty list means the listing failed or matched nothing; abort instead
# of invoking the indexer without a single segment argument.
if [ -z "$SEGS" ]; then
  echo "ERROR!!! see $LOGFILE "
  exit 8
fi
# $SEGS stays unquoted on purpose: every whitespace-separated path has to
# reach nutch as its own argument.
# shellcheck disable=SC2086
/opt/crawlzilla/nutch/bin/nutch index "/user/crawler/$JNAME/index" \
  "/user/crawler/$JNAME/crawldb" "/user/crawler/$JNAME/linkdb" $SEGS || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}

echo "3 dedup" >> "$LOGFILE"
/opt/crawlzilla/nutch/bin/nutch dedup "/user/crawler/$JNAME/index" || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}
| 136 | |
# Steps 4-8: fetch the job from HDFS into the local archive directory, append
# the PassTime and .crawl_depth entries, and move the index files out of
# index/part-00000 into index/. Each step's label is appended to $LOGFILE
# before the step runs; a failing step prints the error banner and exits
# with status 8.
ARCHIVE_DIR="/home/crawler/crawlzilla/archieve/$JNAME"

echo "4 download" >> "$LOGFILE"
/opt/crawlzilla/nutch/bin/hadoop dfs -get "$JNAME" "$ARCHIVE_DIR" || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}

# The braces matter: "$JNAMEPassTime" would expand a variable literally named
# JNAMEPassTime (unset here), leaving only "5 " in the log.
echo "5 ${JNAME}PassTime" >> "$LOGFILE"
echo "0h:0m:0s" >> "$ARCHIVE_DIR/${JNAME}PassTime" || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}

echo "6 append depth" >> "$LOGFILE"
echo "0" >> "$ARCHIVE_DIR/.crawl_depth" || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}

echo "7 mv index files from part-00000" >> "$LOGFILE"
# The glob is kept outside the quotes so the shell still expands it.
mv "$ARCHIVE_DIR"/index/part-00000/* "$ARCHIVE_DIR/index/" || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}

echo "8 rmdir part-00000/" >> "$LOGFILE"
rmdir "$ARCHIVE_DIR/index/part-00000/" || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}
| 156 | |
# Steps 9-10 and wrap-up: copy the "default" Tomcat webapp to a webapp named
# after the job, replace "search" with the job name on line 8 of its
# nutch-site.xml, then append the completion time to $LOGFILE. A failing
# step prints the error banner and exits with status 8.
WEBAPP_DIR="/opt/crawlzilla/tomcat/webapps/$JNAME"

echo "9 tomcat" >> "$LOGFILE"
# NOTE(review): if $WEBAPP_DIR already exists, cp -r puts the copy inside it
# (as .../default) instead of replacing it -- confirm reruns are not expected.
cp -rf /opt/crawlzilla/tomcat/webapps/default "$WEBAPP_DIR" || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}

echo "10 nutch-site.xml" >> "$LOGFILE"
# Backslash-escape \, / and & in the job name so sed treats it as literal
# replacement text; for names without those characters nothing changes.
JNAME_SED=$(printf '%s\n' "$JNAME" | sed 's|[\\/&]|\\&|g')
sed -i "8s/search/${JNAME_SED}/g" "$WEBAPP_DIR/WEB-INF/classes/nutch-site.xml" || {
  echo "ERROR!!! see $LOGFILE "
  exit 8
}




# The date sits inside the quotes, after a space: previously it was glued
# onto "at" and word-split by the shell.
DATE=$(date)
echo "$JNAME completed and finished at $DATE" >> "$LOGFILE"
| 170 | |
| 171 | |
| 172 | }}} |