source: nutchez-0.2/src/shell/go.sh @ 247

Last change on this file since 247 was 247, checked in by shunfa, 14 years ago

將go.sh複製到nutchez/system/目錄下

  • Property svn:executable set to *
File size: 1.1 KB
Line 
#!/bin/bash
#
# go.sh - run a Nutch crawl to the given depth.
# Usage: ./go.sh DEPTH
#   DEPTH  crawl depth handed to `nutch crawl -depth`; digits only.

crawl_dep=$1

# Reject a missing or non-numeric depth up front: later steps in this script
# delete the existing search data, so a bad argument must be caught before
# they run. An empty value fails the pattern as well, which covers the
# "no argument" case. Exit non-zero so callers can detect the usage error
# (a bare `exit` here would return the status of the last echo, i.e. 0).
# NOTE(review): the second usage line mentions .../nutchez/url/urls.txt while
# the script uploads .../nutchez/urls - confirm which path is intended.
if [[ ! "$crawl_dep" =~ ^[0-9]+$ ]]; then
  echo "1. 使用這個shell ,首先你需要有nutchuser這個使用者,並且hadoop 已經開始運作"
  echo "2. /home/nutchuser/nutchez/url/urls.txt 裡面有你要抓的網址"
  echo "3. 執行 ./go.sh [深度] 即可,如:"
  echo " ./go.sh 3"
  exit 1
fi
11
#######################################
# Report the outcome of the command that ran immediately before this call.
# Must be invoked directly after the command being checked, because the
# result is read from that command's exit status in $?.
# Arguments: $1 - label of the step, used in the printed message
# Outputs:   "<label> finished " on success, "<label> is error" on failure
# Returns:   0 on success; on failure terminates the script with the exit
#            status of the failed command
#######################################
debug_echo() {
  # Capture $? before anything else runs; any other command would overwrite it.
  local rc=$?

  if [ "$rc" -eq 0 ]; then
    echo "$1 finished "
  else
    echo "$1 is error"
    # A bare `exit` would return the status of the echo above (0) and make
    # the whole script look successful; propagate the real failure instead.
    exit "$rc"
  fi
}
21
22
23
# Locations used by the steps below, named once instead of repeated inline.
nutch_home=/opt/nutchez/nutch
tomcat_home=/opt/nutchez/tomcat
local_data=/home/nutchuser/nutchez

# Load the Hadoop environment settings shipped with the Nutch install.
source "${nutch_home}/conf/hadoop-env.sh"
debug_echo "import hadoop-env.sh"

echo "delete search (local,hdfs) and urls (hdfs) "

# Drop the previous local copy of the search data.
rm -rf -- "${local_data:?}/search"

# Remove the previous HDFS copies. Deliberately left unchecked, as in the
# original - presumably because the paths may not exist yet on a first run.
"${nutch_home}/bin/hadoop" dfs -rmr urls search

# Upload the seed URLs. Checked explicitly: without them the crawl that
# follows has nothing to work on.
"${nutch_home}/bin/hadoop" dfs -put "${local_data}/urls" urls
debug_echo "upload urls"

# Crawl to the requested depth; the depth is quoted so it always reaches
# nutch as exactly one argument.
"${nutch_home}/bin/nutch" crawl urls -dir search -depth "$crawl_dep" -topN 5000 -threads 1000
debug_echo "nutch crawl"

# Copy the crawl output from HDFS back to the local data directory.
"${nutch_home}/bin/hadoop" dfs -get search "${local_data}/search"
debug_echo "download search"

# Restart Tomcat. Only the status of startup.sh is checked by debug_echo;
# the status of shutdown.sh is ignored.
"${tomcat_home}/bin/shutdown.sh"
"${tomcat_home}/bin/startup.sh"
debug_echo "tomcat restart"
Note: See TracBrowser for help on using the repository browser.