Last change
on this file since 249 was
247,
checked in by shunfa, 15 years ago
|
將go.sh複製到nutchez/system/目錄下
|
-
Property svn:executable set to
*
|
File size:
1.1 KB
|
Rev | Line | |
---|
#!/bin/bash
#
# go.sh - run a Nutch crawl to the requested depth.
#
# Usage: ./go.sh <depth>
#   e.g. ./go.sh 3
#
# Prerequisites, as stated by the usage text printed below:
#   1. the "nutchuser" account exists and Hadoop is already running;
#   2. the URLs to crawl are listed in /home/nutchuser/nutchez/urls/urls.txt.

# Crawl depth requested by the caller (first positional argument).
crawl_dep=$1

if [[ -z "$crawl_dep" ]]; then
  # The seed path shown to the user names the "urls" directory, which is the
  # path this script uploads to HDFS (the text previously said "url/").
  printf '%s\n' \
    "1. 使用這個shell ,首先你需要有nutchuser這個使用者,並且hadoop 已經開始運作" \
    "2. /home/nutchuser/nutchez/urls/urls.txt 裡面有你要抓的網址" \
    "3. 執行 ./go.sh [深度] 即可,如:" \
    " ./go.sh 3"
  # A bare `exit` would return the status of the last print (0) and make a
  # wrong invocation look successful; report failure explicitly.
  exit 1
fi
---|
| 11 | |
---|
#######################################
# Report the outcome of the command that ran immediately before this call.
# It must be invoked directly after the command being checked, because it
# reads that command's exit status from $? on entry.
# Arguments: $1 - human-readable name of the step
# Outputs:   "<step> finished " on success, "<step> is error" on failure
# Returns:   0 on success; on failure the shell exits with the failing
#            command's exit status.
#######################################
debug_echo() {
  # Capture the status first: every later command overwrites $?.
  local rc=$?

  if [[ "$rc" -eq 0 ]]; then
    echo "$1 finished "
  else
    echo "$1 is error"
    # A bare `exit` here would return the status of the echo above (0),
    # so a failed step would still end the script "successfully".
    exit "$rc"
  fi
}
---|
| 21 | |
---|
| 22 | |
---|
| 23 | |
---|
# Load hadoop-env.sh into this shell. debug_echo must directly follow the
# command it checks, because it reads that command's status from $?.
source /opt/nutchez/nutch/conf/hadoop-env.sh
debug_echo "import hadoop-env.sh"

# Install and working locations. Assigned after hadoop-env.sh has been
# sourced so that file cannot overwrite them.
nutch_bin=/opt/nutchez/nutch/bin
tomcat_bin=/opt/nutchez/tomcat/bin
work_dir=/home/nutchuser/nutchez

# Remove the results of the previous run, locally and on HDFS.
echo "delete search (local,hdfs) and urls (hdfs) "
rm -rf "${work_dir}/search"
# Status deliberately not checked here (it was not checked originally);
# presumably these HDFS paths may not exist yet on a first run - TODO confirm.
"${nutch_bin}/hadoop" dfs -rmr urls search

# Upload the seed URLs. This step was previously unchecked; a failed upload
# now stops the run here instead of surfacing later as a crawl error.
"${nutch_bin}/hadoop" dfs -put "${work_dir}/urls" urls
debug_echo "upload urls"

# Crawl the uploaded seeds into "search" at the depth given on the command
# line. The depth is quoted so it is always passed as a single argument.
"${nutch_bin}/nutch" crawl urls -dir search -depth "$crawl_dep" -topN 5000 -threads 1000
debug_echo "nutch crawl"

# Copy the crawl output from HDFS back to the local working directory.
"${nutch_bin}/hadoop" dfs -get search "${work_dir}/search"
debug_echo "download search"

# Restart Tomcat. Only the status of startup.sh is checked by debug_echo;
# the status of shutdown.sh is ignored, as in the original script.
"${tomcat_bin}/shutdown.sh"
"${tomcat_bin}/startup.sh"
debug_echo "tomcat restart"
---|
Note: See
TracBrowser
for help on using the repository browser.