{{{ #!html
Crawlzilla v 0.2.2 Error Recovery Steps
nutch 1.0 + hadoop 0.19 + solr 1.3.0
}}}

[[PageOutline]]

= Preface =

The nutch 1.0 bundled with crawlzilla 0.2.2 sometimes stalls while crawling a site: once the "crawldb + generate + fetch" loop has finished, the remaining steps are never executed. Hadoop shows no running job, while go.sh stays idle and keeps displaying "crawling" forever, never reaching "finish".

Possible causes:
 * The data volume is too large: more than 100,000 text records in total
 * The run takes too long: the whole procedure runs for more than 3 hours

The steps that never get executed are:

{{{
#!text
linkdb _JOB_DIR_/linkdb
index-lucene _JOB_DIR_/indexes
100.00% dedup 1: urls by time
100.00% dedup 2: content by hash
100.00% dedup 3: delete from index(es)
}}}

= Manual Repair Steps =

The examples below use the job name cw_yahoo_5; substitute your own job name. First change into the nutch directory:

{{{
cd /opt/crawlzilla/nutch
}}}

== linkdb ==

 * linkdb tw_yahoo_com_6/linkdb

{{{
#!java
Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...)
}}}

{{{
$ /opt/crawlzilla/nutch/bin/nutch invertlinks /user/crawler/cw_yahoo_5/linkdb -dir /user/crawler/cw_yahoo_5/segments/
}}}

== index-lucene ==

 * index-lucene tw_yahoo_com_6/indexes

{{{
#!java
Usage: Indexer <index> <crawldb> <linkdb> <segment> ...
}}}

{{{
$ /opt/crawlzilla/nutch/bin/nutch index /user/crawler/cw_yahoo_5/index /user/crawler/cw_yahoo_5/crawldb /user/crawler/cw_yahoo_5/linkdb /user/crawler/cw_yahoo_5/segments/20101027234843 /user/crawler/cw_yahoo_5/segments/20101027234956 /user/crawler/cw_yahoo_5/segments/20101027235315 /user/crawler/cw_yahoo_5/segments/20101028000804 /user/crawler/cw_yahoo_5/segments/20101028002826
}}}

== dedup ==

 * dedup 1: urls by time 100.00%
 * dedup 2: content by hash 100.00%
 * dedup 3: delete from index(es)

{{{
#!java
Usage: DeleteDuplicates <indexes> ...
}}}

{{{
/opt/crawlzilla/nutch/bin/nutch dedup /user/crawler/cw_yahoo_5/index
}}}

== download and import ==

{{{
/opt/crawlzilla/nutch/bin/hadoop dfs -get cw_yahoo_5 ~/crawlzilla/archieve/cw_yahoo_5
cd ~/crawlzilla/archieve/
echo "0h:0m:0s" >> ./cw_yahoo_5/cw_yahoo_5PassTime
echo "5" >> ./cw_yahoo_5/.crawl_depth
cd ~/crawlzilla/archieve/cw_yahoo_5/index
mv part-00000/* ./
rmdir part-00000/
}}}
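After the manual steps, it can be worth confirming that the repair actually produced the expected output before relying on the rebuilt index. The following is a minimal sanity-check sketch, not part of the original procedure, assuming the job name cw_yahoo_5 and the default Crawlzilla paths used above:

{{{
#!sh
#!/bin/bash
JOB=cw_yahoo_5

# linkdb, index and indexes should now exist in HDFS next to crawldb and segments/
/opt/crawlzilla/nutch/bin/hadoop dfs -ls /user/crawler/$JOB

# no MapReduce job should still be running once the repair has finished
/opt/crawlzilla/nutch/bin/hadoop job -list

# after "download and import", the local index directory must hold the Lucene
# files directly, with no part-00000/ subdirectory left behind
ls ~/crawlzilla/archieve/$JOB/index
}}}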
= Automation Script =

The script below automates the manual repair steps for a given job name:

{{{
#!sh
#!/bin/bash

# prompt: without an argument, list the available job names and exit
if [ "$1" == "" ]; then
  echo "Usage : fix <JOB_NAME>"
  echo " where JOB_NAME is one of: "
  echo "==========="
  NN=$(/opt/crawlzilla/nutch/bin/hadoop dfs -ls | grep crawler | awk '{print $8}' | cut -d "/" -f 4)
  echo "$NN"
  echo "==========="
  exit 9
fi

# begin
JNAME=$1
LOGFILE=~/crawlzilla/debug_fix.log
META_PATH=/home/crawler/crawlzilla/.tmp

### not tested
JPID="$META_PATH/$JNAME/$JNAME"_count_pid   # go.sh would need to record its own pid here
JDEPTH="$META_PATH/$JNAME/$JNAME"xxx        # go.sh needs a fix here
JPTIME="$META_PATH/$JNAME/$JNAME"PassTime
### not tested

DATE=$(date)
echo "$JNAME BEGIN at $DATE" >> $LOGFILE

echo "1 invertlinks" >> $LOGFILE
/opt/crawlzilla/nutch/bin/nutch invertlinks /user/crawler/$JNAME/linkdb -dir /user/crawler/$JNAME/segments/
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "2 index" >> $LOGFILE
SEGS=$(/opt/crawlzilla/nutch/bin/hadoop dfs -ls /user/crawler/$JNAME/segments | grep segments | awk '{print $8}')
/opt/crawlzilla/nutch/bin/nutch index /user/crawler/$JNAME/index /user/crawler/$JNAME/crawldb /user/crawler/$JNAME/linkdb $SEGS
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "3 dedup" >> $LOGFILE
/opt/crawlzilla/nutch/bin/nutch dedup /user/crawler/$JNAME/index
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "4 download" >> $LOGFILE
/opt/crawlzilla/nutch/bin/hadoop dfs -get $JNAME /home/crawler/crawlzilla/archieve/$JNAME
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "5 ${JNAME}PassTime" >> $LOGFILE
echo "0h:0m:0s" >> /home/crawler/crawlzilla/archieve/$JNAME/${JNAME}PassTime
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "6 append depth" >> $LOGFILE
echo "0" >> /home/crawler/crawlzilla/archieve/$JNAME/.crawl_depth
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "7 mv index files from part-00000" >> $LOGFILE
mv /home/crawler/crawlzilla/archieve/$JNAME/index/part-00000/* /home/crawler/crawlzilla/archieve/$JNAME/index/
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "8 rmdir part-00000/" >> $LOGFILE
rmdir /home/crawler/crawlzilla/archieve/$JNAME/index/part-00000/
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "9 tomcat" >> $LOGFILE
cp -rf /opt/crawlzilla/tomcat/webapps/default /opt/crawlzilla/tomcat/webapps/$JNAME
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

echo "10 nutch-site.xml" >> $LOGFILE
sed -i '8s/search/'${JNAME}'/g' /opt/crawlzilla/tomcat/webapps/$JNAME/WEB-INF/classes/nutch-site.xml
if [ ! $? -eq 0 ]; then echo "ERROR!!! see $LOGFILE"; exit 8; fi

DATE=$(date)
echo "$JNAME completed and finished at $DATE" >> $LOGFILE
}}}
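A hypothetical invocation, assuming the script above has been saved as fix.sh and is run as the crawler user (the file name and location are not specified in the original page):

{{{
#!sh
chmod +x fix.sh

# with no argument the script lists the job names found in HDFS and exits
./fix.sh

# repair a specific stalled job, then follow the progress in the log
./fix.sh cw_yahoo_5
tail -f ~/crawlzilla/debug_fix.log
}}}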