#!/bin/bash
# go.sh: run a Nutch crawl on Hadoop, fetch the results locally, and restart Tomcat.

crawl_dep=$1

# Print usage and exit when no depth argument is given.
if [ "$1" == "" ]; then
  echo "1. To use this script you need the nutchuser account, and Hadoop must already be running"
  echo "2. Put the URLs you want to crawl in /home/nutchuser/nutchez/url/urls.txt"
  echo "3. Run ./go.sh [depth], for example:"
  echo "   ./go.sh 3"
  exit 1
fi

# Report whether the previous command succeeded; abort the script on failure.
debug_echo () {
  if [ $? -eq 0 ]; then
    echo "$1 finished"
  else
    echo "$1 failed"
    exit 1
  fi
}

source /opt/nutchez/nutch/conf/hadoop-env.sh
debug_echo "import hadoop-env.sh"

# Remove the previous crawl output, then upload the seed URL list to HDFS.
echo "delete search (local, hdfs) and urls (hdfs)"
rm -rf /home/nutchuser/nutchez/search
/opt/nutchez/nutch/bin/hadoop dfs -rmr urls search
/opt/nutchez/nutch/bin/hadoop dfs -put /home/nutchuser/nutchez/urls urls

# Crawl to the requested depth.
/opt/nutchez/nutch/bin/nutch crawl urls -dir search -depth "$crawl_dep" -topN 5000 -threads 1000
debug_echo "nutch crawl"

# Copy the crawl results from HDFS to the local filesystem.
/opt/nutchez/nutch/bin/hadoop dfs -get search /home/nutchuser/nutchez/search
debug_echo "download search"

# Restart Tomcat so the search front end picks up the new index.
/opt/nutchez/tomcat/bin/shutdown.sh
/opt/nutchez/tomcat/bin/startup.sh
debug_echo "tomcat restart"