{{{ #!html
Crawlzilla v 0.2.2 Error Recovery Steps
nutch 1.0 + hadoop 0.19 + solr 1.3.0
}}}

[[PageOutline]]

= Preface =

The nutch 1.0 bundled with crawlzilla 0.2.2 sometimes stalls while crawling a site: once the "crawldb + generate + fetch" loop has finished, the remaining steps are never executed. Hadoop shows no running job, while go.sh stays idle and keeps displaying "crawling" forever, never reaching "finish".

Possible causes:
 * The data volume is too large: more than 100,000 text records in total
 * The run takes too long: the whole procedure runs for more than 3 hours

The steps that never get executed are:

{{{
#!text
linkdb _JOB_DIR_/linkdb
index-lucene _JOB_DIR_/indexes
100.00% dedup 1: urls by time
100.00% dedup 2: content by hash
100.00% dedup 3: delete from index(es)
}}}

= Manual Repair Steps =

The examples below use the job name cw_yahoo_5; substitute your own job name. First change into the nutch directory:

{{{
cd /opt/crawlzilla/nutch
}}}

== linkdb ==

 * linkdb tw_yahoo_com_6/linkdb

{{{
#!java
Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...)
}}}

{{{
$ /opt/crawlzilla/nutch/bin/nutch invertlinks /user/crawler/cw_yahoo_5/linkdb -dir /user/crawler/cw_yahoo_5/segments/
}}}

== index-lucene ==

 * index-lucene tw_yahoo_com_6/indexes

{{{
#!java
Usage: Indexer <index> <crawldb> <linkdb> <segment> ...
}}}

{{{
$ /opt/crawlzilla/nutch/bin/nutch index /user/crawler/cw_yahoo_5/index /user/crawler/cw_yahoo_5/crawldb /user/crawler/cw_yahoo_5/linkdb /user/crawler/cw_yahoo_5/segments/20101027234843 /user/crawler/cw_yahoo_5/segments/20101027234956 /user/crawler/cw_yahoo_5/segments/20101027235315 /user/crawler/cw_yahoo_5/segments/20101028000804 /user/crawler/cw_yahoo_5/segments/20101028002826
}}}

== dedup ==

 * dedup 1: urls by time 100.00%
 * dedup 2: content by hash 100.00%
 * dedup 3: delete from index(es)

{{{
#!java
Usage: DeleteDuplicates <indexes> ...
}}}

{{{
/opt/crawlzilla/nutch/bin/nutch dedup /user/crawler/cw_yahoo_5/index
}}}

== download and import ==

{{{
/opt/crawlzilla/nutch/bin/hadoop dfs -get cw_yahoo_5 ~/crawlzilla/archieve/cw_yahoo_5
cd ~/crawlzilla/archieve/
echo "0h:0m:0s" >> ./cw_yahoo_5/cw_yahoo_5PassTime
echo "5" >> ./cw_yahoo_5/.crawl_depth
cd ~/crawlzilla/archieve/cw_yahoo_5/index
mv part-00000/* ./
rmdir part-00000/
}}}
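After the manual steps, it can be worth confirming that the repair actually produced the expected output before relying on the rebuilt index. The following is a minimal sanity-check sketch, not part of the original procedure, assuming the job name cw_yahoo_5 and the default Crawlzilla paths used above:

{{{
#!sh
#!/bin/bash
JOB=cw_yahoo_5

# linkdb, index and indexes should now exist in HDFS next to crawldb and segments/
/opt/crawlzilla/nutch/bin/hadoop dfs -ls /user/crawler/$JOB

# no MapReduce job should still be running once the repair has finished
/opt/crawlzilla/nutch/bin/hadoop job -list

# after "download and import", the local index directory must hold the Lucene
# files directly, with no part-00000/ subdirectory left behind
ls ~/crawlzilla/archieve/$JOB/index
}}}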
= Automation Script =

The script below automates the manual repair steps for a given job name:

{{{
#!sh
#!/bin/bash

# prompt: without an argument, list the available job names and exit
if [ "$1" == "" ]; then
  echo "Usage : fix <JOB_NAME>"
  echo " where JOB_NAME is one of: "
  echo "==========="
  NN=$(/opt/crawlzilla/nutch/bin/hadoop dfs -ls | grep crawler | awk '{print $8}' | cut -d "/" -f 4)
  echo "$NN"
  echo "==========="
  exit 9
fi

# begin
JNAME=$1
LOGFILE=~/crawlzilla/debug_fix.log
META_PATH=/home/crawler/crawlzilla/.tmp

### not tested
JPID="$META_PATH/$JNAME/$JNAME"_count_pid   # go.sh would need to record its own pid here
JDEPTH="$META_PATH/$JNAME/$JNAME"xxx        # go.sh needs a fix here
JPTIME="$META_PATH/$JNAME/$JNAME"PassTime
### not tested

DATE=$(date)
echo "$JNAME BEGIN at $DATE" >> $LOGFILE

echo "1 invertlinks" >> $LOGFILE
/opt/crawlzilla/nutch/bin/nutch invertlinks /user/crawler/$JNAME/linkdb -dir /user/crawler/$JNAME/segments/
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "2 index" >> $LOGFILE
SEGS=$(/opt/crawlzilla/nutch/bin/hadoop dfs -ls /user/crawler/$JNAME/segments | grep segments | awk '{print $8}')
/opt/crawlzilla/nutch/bin/nutch index /user/crawler/$JNAME/index /user/crawler/$JNAME/crawldb /user/crawler/$JNAME/linkdb $SEGS
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "3 dedup" >> $LOGFILE
/opt/crawlzilla/nutch/bin/nutch dedup /user/crawler/$JNAME/index
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "4 download" >> $LOGFILE
/opt/crawlzilla/nutch/bin/hadoop dfs -get $JNAME /home/crawler/crawlzilla/archieve/$JNAME
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "5 ${JNAME}PassTime" >> $LOGFILE
echo "0h:0m:0s" >> /home/crawler/crawlzilla/archieve/$JNAME/${JNAME}PassTime
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "6 append depth" >> $LOGFILE
echo "0" >> /home/crawler/crawlzilla/archieve/$JNAME/.crawl_depth
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "7 mv index files from part-00000" >> $LOGFILE
mv /home/crawler/crawlzilla/archieve/$JNAME/index/part-00000/* /home/crawler/crawlzilla/archieve/$JNAME/index/
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "8 rmdir part-00000/" >> $LOGFILE
rmdir /home/crawler/crawlzilla/archieve/$JNAME/index/part-00000/
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "9 tomcat" >> $LOGFILE
cp -rf /opt/crawlzilla/tomcat/webapps/default /opt/crawlzilla/tomcat/webapps/$JNAME
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "10 nutch-site.xml" >> $LOGFILE
sed -i '8s/search/'${JNAME}'/g' /opt/crawlzilla/tomcat/webapps/$JNAME/WEB-INF/classes/nutch-site.xml
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

DATE=$(date)
echo "$JNAME completed and finished at $DATE" >> $LOGFILE
}}}
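A hypothetical invocation, assuming the script above has been saved as fix.sh and is run as the crawler user (the file name and location are not specified in the original page):

{{{
#!sh
chmod +x fix.sh

# with no argument the script lists the job names found in HDFS and exits
./fix.sh

# repair a specific stalled job, then follow the progress in the log
./fix.sh cw_yahoo_5
tail -f ~/crawlzilla/debug_fix.log
}}}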