source: nutchez-0.1/bin/nutchez-func.sh @ 89

Last change on this file since 89 was 89, checked in by waue, 15 years ago
  • Property svn:executable set to *
File size: 5.5 KB
Line 
#!/bin/bash
# Author: WeiYu Chen <waue _at_ nchc org tw>
# License: GPL
# Description: Easy-to-use helper functions for Nutch
# .
6
7
# Dialog program used for the UI prompts; a DIALOG value already present in
# the environment is kept, otherwise "dialog" is used.
# Quoted so the expansion is not subject to globbing or word splitting.
: "${DIALOG=dialog}"

# Set to 1 to display extra message boxes for debugging (see echo_vb).
VERB=0
12
# Create the per-user working directory ~/.nutchez on first use.
# Copies the conf/ and sav/ trees from /etc/nutch, creates ~/.nutchez/log and
# gives the whole tree to $LOGNAME. Does nothing if ~/.nutchez already exists.
# Globals: LOGNAME (read)
# Returns: exit status of the final chown, or 0 when nothing had to be done
init_nutchez() {
  if ! [ -e ~/.nutchez ]; then
    # -p creates ~/.nutchez and its log directory in one step
    mkdir -p ~/.nutchez/log
    # copy from /etc/nutch
    cp -rf /etc/nutch/conf /etc/nutch/sav ~/.nutchez/
    chown -R "$LOGNAME:$LOGNAME" ~/.nutchez
  fi
#  export NUTCH_CONF_DIR=~/.nutchez
#  export HADOOP_CONF_DIR=~/.nutchez
#  export HADOOP_LOG_DIR=~/.nutchez/log
#  . ~/.nutchez/hadoop-env.sh || . /etc/nutch/hadoop-env.sh
}
27
# Show a debug message box, but only when verbose mode is on (VERB equals 1).
# Globals:   VERB (read), DIALOG (read)
# Arguments: $1 - text handed to the --msgbox widget
# Returns:   exit status of the dialog program when verbose, 0 otherwise
echo_vb() {
  # Fall back to 0 so that an unset or empty VERB does not make `[` fail
  # with "unary operator expected".
  if [ "${VERB:-0}" -eq 1 ]; then
    # DIALOG stays unquoted so a value holding a command plus options still works.
    # shellcheck disable=SC2086
    $DIALOG --msgbox "$1" 16 51
  fi
}
33
# Make sure the file named by $1 exists.
# A missing file is created holding one empty line; for an existing file the
# content is only reported through echo_vb.
# Arguments: $1 - path of the file to check
test_file() {
  if ! test -e "$1"; then
    echo_vb "test_file: \n can not find $1"
    # Quoted target: an unquoted path with spaces is an "ambiguous redirect".
    echo "" > "$1"
  else
    echo_vb "test_file: \n Touch  $1 ! \n Its content is \n $(cat "$1")"
  fi
}
42
# Abort the script with status 1 unless it is running as root (UID 0).
# The reason is always written to stderr, so the exit is not silent when
# verbose mode is off; the echo_vb boxes are still shown in verbose mode.
# Globals: UID (read), LOGNAME (read)
check_if_root() {
  if [ "$UID" != "0" ]; then
    local script_name=${0##*/}
    echo_vb "Hi [$LOGNAME] !! "
    echo_vb "You need to run this script \"$script_name\" as root."
    printf 'You need to run this script "%s" as root.\n' "$script_name" >&2
    exit 1
  fi
}
50
# Replace the saved settings in ~/.nutchez/sav with the values just entered:
# all n.*.txt files there are removed, then each /tmp/n.<name>.tmp is moved
# to ~/.nutchez/sav/n.<name>.txt for urls, robot, crawler and tomcat.
# Returns: exit status of the last mv
promote_tempfile() {
  local name
  echo_vb "7. chang tmp as txt"
  # -f: stay quiet on the first run, when no saved n.*.txt exists yet
  rm -f ~/.nutchez/sav/n.*.txt
  for name in urls robot crawler tomcat; do
    mv "/tmp/n.${name}.tmp" ~/.nutchez/sav/"n.${name}.txt"
  done
}
59
# Delete every /tmp/n.*.tmp scratch file.
# Returns: exit status of rm
clean_tempfile() {
  echo_vb "7. delete tmp"
  # -f: no error when there is nothing left to delete
  rm -f /tmp/n.*.tmp
}
64
# Prepare the crawl input under ~/.nutchez:
#   * make sure the urls/ directory exists,
#   * replace urls/urls.txt with a fresh copy of sav/n.urls.txt,
#   * if ~/.nutchez/nutch-site.xml exists, put $ROBOT into the <value>
#     element found on line 4 of that file.
# Globals: ROBOT (read)
# Returns: exit status of the final sed, or 0 when nutch-site.xml is absent
setup_nutchez() {
  local robot_escaped

  # make url list dir
  mkdir -p ~/.nutchez/urls

  # Drop the old list first so a failed copy cannot leave a stale one behind.
  rm -f ~/.nutchez/urls/urls.txt
  cp ~/.nutchez/sav/n.urls.txt ~/.nutchez/urls/urls.txt

  if [ -e ~/.nutchez/nutch-site.xml ]; then
    # set nutch-site.xml
    # Escape \ / and & so the name is taken literally in the sed replacement.
    robot_escaped=$(printf '%s' "$ROBOT" | sed -e 's|[\\/&]|\\&|g')
    # [^<]* matches any previous value, not only purely alphanumeric ones,
    # so the name can be changed again after e.g. "my-bot" was written.
    sed -i -e "4s/<value>[^<]*</<value>${robot_escaped}</" ~/.nutchez/nutch-site.xml
  fi
}
83
# Install a private Tomcat copy and set its connector port.
# On first use /opt/nutch/tomcat is copied to ~/.nutchez/tomcat, given to
# $LOGNAME, and ~/.nutchez/search is created. On every call the
# <Connector port="..."> on line 67 of conf/server.xml is set to $PORT.
# Globals: LOGNAME (read), PORT (read)
# Returns: 1 if PORT is not a plain number (server.xml is left untouched),
#          otherwise the exit status of sed
install_tomcat() {
  if ! [ -e ~/.nutchez/tomcat ]; then
    # install tomcat to home
    cp -rf /opt/nutch/tomcat ~/.nutchez/
    chown -R "$LOGNAME:$LOGNAME" ~/.nutchez/tomcat/
    # make search dir
    mkdir -p ~/.nutchez/search
  fi

  # Refuse to write an empty or non-numeric port into server.xml.
  case "$PORT" in
    '' | *[!0-9]*)
      printf 'install_tomcat: invalid port "%s", server.xml left unchanged\n' "$PORT" >&2
      return 1
      ;;
  esac

  # change explorer port
  sed -i -e "67s/<Connector port=\"[0-9]*\"/<Connector port=\"$PORT\"/" ~/.nutchez/tomcat/conf/server.xml
}
97
98
# Let the user edit the saved URL list in an edit box.
# The edited text (the dialog program's stderr) goes to /tmp/n.urls.tmp.
# Globals: DIALOG (read), RET (written)
# Returns: exit status of the dialog program (ok = 0, cancel = 1)
show_urls() {
  # show urls : ok =0 ,cancel = 1
  echo_vb "2. show_urls !"
  test_file ~/.nutchez/sav/n.urls.txt
  echo_vb "2.1 test_file ~/.nutchez/sav return : $?"
  # dialog begin
  # Use the configurable $DIALOG like echo_vb/final_confirm do, not a
  # hard-coded "dialog".
  ${DIALOG:-dialog} --title "The URLS that you want" --editbox ~/.nutchez/sav/n.urls.txt 16 51 2>/tmp/n.urls.tmp
  RET=$?
  echo_vb "2.1 cat url: $(cat /tmp/n.urls.tmp)"
  return "$RET"
}
110
# Ask for the robot (agent) name, pre-filled with the saved value from
# ~/.nutchez/sav/n.robot.txt; the answer is written to /tmp/n.robot.tmp.
# Globals: DIALOG (read)
setup_robot() {
  test_file ~/.nutchez/sav/n.robot.txt
  echo_vb "3. setup_robot"
  # dialog
  # Use the configurable $DIALOG like echo_vb/final_confirm do.
  ${DIALOG:-dialog} --nocancel --inputbox " This agent name \n" 16 51 "$(cat ~/.nutchez/sav/n.robot.txt)" 2>/tmp/n.robot.tmp
  echo_vb "3.1 cat robot : $(cat /tmp/n.robot.tmp)"
}
118
# Ask for the crawl depth, pre-filled with the saved value from
# ~/.nutchez/sav/n.crawler.txt; the answer is written to /tmp/n.crawler.tmp.
# Globals: DIALOG (read)
setup_crawler() {
  echo_vb "4. setup_crawler"
  test_file ~/.nutchez/sav/n.crawler.txt
  # Use the configurable $DIALOG like echo_vb/final_confirm do.
  ${DIALOG:-dialog} --nocancel --inputbox " Depth  \n " 16 51 "$(cat ~/.nutchez/sav/n.crawler.txt)" 2>/tmp/n.crawler.tmp
  # Report the depth just entered (this used to print the robot temp file).
  echo_vb "4.1 cat crawler : $(cat /tmp/n.crawler.tmp)"
}
125
# Ask for the web (explorer) port, pre-filled with the saved value from
# ~/.nutchez/sav/n.tomcat.txt; the answer is written to /tmp/n.tomcat.tmp.
# Globals: DIALOG (read)
setup_tomcat() {
  echo_vb "5. setup_tomcat"
  test_file ~/.nutchez/sav/n.tomcat.txt
  # Use the configurable $DIALOG like echo_vb/final_confirm do.
  ${DIALOG:-dialog} --nocancel --inputbox " explorer port \n " 16 51 "$(cat ~/.nutchez/sav/n.tomcat.txt)" 2>/tmp/n.tomcat.tmp
  echo_vb "5.1 cat tomcat : $(cat /tmp/n.tomcat.tmp)"
}
132
# Show a summary of the four answers and ask for confirmation.
# The summary is assembled in /tmp/n.finalcheck.tmp from the /tmp/n.*.tmp
# answer files and shown in a yes/no box with an extra "reset" button.
# Globals: DIALOG (read); tempfile, MSG, RET (written)
# Returns: exit status of the dialog program
final_confirm() {
  echo_vb "6. final_confirm : start =0 , back =1 "
  tempfile=/tmp/n.finalcheck.tmp

  # The headings contain a literal backslash-n: it is meant for the dialog
  # program, which does the line breaking (see dialog(1)). printf '%s\n'
  # keeps it literal regardless of how echo is configured.
  {
    printf '%s\n' ' \n 1. The url list is : \n '
    cat /tmp/n.urls.tmp
    printf '%s\n' ' \n 2. The robot name is : \n'
    cat /tmp/n.robot.tmp
    printf '%s\n' ' \n 3. The crawled depth is : \n '
    cat /tmp/n.crawler.tmp
    printf '%s\n' ' \n 4. The explorer port is : \n '
    cat /tmp/n.tomcat.tmp
  } > "$tempfile"

  MSG=$(cat "$tempfile")
  echo_vb "6.1 final message :\n $MSG"
  #read READ
  # shellcheck disable=SC2086  # DIALOG may hold a command plus options
  $DIALOG --title "Check It !!" --clear \
        --extra-button --extra-label "reset" --ok-label "ok" --cancel-label "exit" \
        --yesno "$MSG" 26 51
  RET=$?
  echo_vb "final return = $RET"
  return "$RET"
}
156
# Define parameters
158
# Load the saved settings from ~/.nutchez/sav into global variables.
# Globals written: ROBOT (n.robot.txt), URLS (n.urls.txt),
#                  DEPTH (n.crawler.txt), PORT (n.tomcat.txt)
# Returns: exit status of the last cat
set_nutchez_p() {
  local saved
  saved=~/.nutchez/sav
  ROBOT=$(cat "$saved/n.robot.txt")
  URLS=$(cat "$saved/n.urls.txt")
  DEPTH=$(cat "$saved/n.crawler.txt")
  PORT=$(cat "$saved/n.tomcat.txt")
}
# Prepare the working directory and Tomcat, then run the Nutch crawl on
# ~/.nutchez/urls, writing the result to ~/.nutchez/search.
# Globals: DEPTH (read), NUTCH_CONF_DIR (read, only reported via echo_vb)
# Returns: 1 if DEPTH is not a plain number, otherwise the exit status of
#          /opt/nutch/bin/nutch
start_crawl() {
  echo_vb "7. start_crawl"
  setup_nutchez
  install_tomcat
  echo_vb "/opt/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH"
  echo_vb "nutch conf dir = $NUTCH_CONF_DIR"

  # An empty DEPTH would make "-topN" the argument of "-depth"; refuse it.
  case "$DEPTH" in
    '' | *[!0-9]*)
      printf 'start_crawl: DEPTH must be a non-negative integer, got "%s"\n' "$DEPTH" >&2
      return 1
      ;;
  esac

  /opt/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth "$DEPTH" -topN 5000 -threads 1000
}
174
# (Re)start the private Tomcat instance.
#   1. Point /tmp/search at ~/.nutchez/search (any existing /tmp/search is
#      removed first).
#   2. Kill every process whose command line mentions
#      org.apache.catalina.startup.Bootstrap.
#   3. Run ~/.nutchez/tomcat/bin/startup.sh and wait 3 seconds.
# Globals: pid_tc (written) - PIDs of the Tomcat processes that were found
start_tomcat() {
  echo_vb "8. start_tomcat "
  echo_vb "/opt/nutch/tomcat/bin/startup.sh"
  if [ -e /tmp/search ]; then
    rm -rf /tmp/search
  fi
  ln -sf ~/.nutchez/search/ /tmp/

  # The [o] keeps the pattern from matching this awk process's own command
  # line, so only real Tomcat processes are reported.
  pid_tc=$(ps axw -eo pid,command \
    | awk '/[o]rg\.apache\.catalina\.startup\.Bootstrap/ { print $1 }')
  if [ -z "$pid_tc" ]; then
    echo "starting tomcat"
  else
    echo "tomcat had been started and the pid is $pid_tc"
    echo "stop it first"
    # pid_tc is split on purpose: it may hold several PIDs.
    # Test the kill status directly; "[ -z $? ]" can never be true.
    # shellcheck disable=SC2086
    if kill -9 $pid_tc; then
      echo " tomcat ($pid_tc) is  killed ..."
    else
      echo "kill error ..."
    fi
  fi
  ~/.nutchez/tomcat/bin/startup.sh
  sleep 3
}
201
# Show the search page at http://localhost:$PORT.
# Firefox is started when it can be found; if it is missing or exits with an
# error, a message box with the URL is shown instead.
# Globals: PORT (read), DIALOG (read); FIREFOX, RET (written)
show_report() {
  echo_vb "9. show_report "
  # command -v is the shell's own lookup; no dependency on the external which
  FIREFOX=$(command -v firefox)
  RET=$?
  if [ "$RET" -eq 0 ]; then
    "$FIREFOX" -D 0.0 "http://localhost:$PORT"
    RET=$?
  fi
  if [ "$RET" -ne 0 ]; then
    # shellcheck disable=SC2086  # DIALOG may hold a command plus options
    $DIALOG --msgbox "Congratulations! \n you can explore the url: \n  http://localhost:$PORT" 0 0
  fi
}
Note: See TracBrowser for help on using the repository browser.