source: nutchez-0.1/bin/nutchez-func.sh @ 243

Last change on this file since 243 was 108, checked in by waue, 15 years ago

years and log

  • Property svn:executable set to *
File size: 7.9 KB
RevLine 
[66]1#!/bin/bash
2# Author: WeiYu Chen <waue _at_ nchc org tw>
3# License: GPL
4# Description: Helper functions that make Nutch easy to use
5# .
6
7
# Command used to draw the dialog boxes.  A variant that also sets a
# back-title is kept here for reference:
#DIALOG='dialog --backtitle "     NutchEz Setup Menu           -- powered by NCHC "'
DIALOG="dialog"

# Set to 1 to make echo_dialog_v show its extra debug message boxes.
VERB="0"
[66]13
# Create the per-user NutchEz directory (~/.nutchez) on first use.
# Does nothing when ~/.nutchez already exists.  Otherwise creates it together
# with its log/ subdirectory, copies /etc/nutch/conf and /etc/nutch/sav into
# it, and gives the whole tree to $LOGNAME.
# Globals:  LOGNAME (read)
# Returns:  0 when ~/.nutchez already exists, 1 when the directory cannot be
#           created, otherwise the status of the final chown.
init_nutchez() {
  if [[ -e ~/.nutchez ]]; then
    return 0
  fi

  # Without the directory every following step would fail, so stop here.
  mkdir -p ~/.nutchez/log || return 1

  # The copies stay best-effort: a missing template only prints cp's error.
  cp -rf /etc/nutch/conf ~/.nutchez/
  cp -rf /etc/nutch/sav ~/.nutchez/

  chown -R "$LOGNAME:$LOGNAME" ~/.nutchez
}
25
# Debug helper: show the text in $1 in a message box, but only when VERB is 1.
# Globals:   VERB (read), DIALOG (read; the command that draws the box)
# Arguments: $1 - message text
# Returns:   0 when VERB is not 1, otherwise the status of the dialog command.
echo_dialog_v() {
  [[ $VERB -eq 1 ]] || return 0
  $DIALOG --msgbox "$1" 16 51
}
31
# Make sure the file named by $1 exists.
# A missing file is created containing a single empty line; an existing file
# is left alone and its content is shown through the debug dialog.
# Arguments: $1 - path of the file (may contain spaces)
# Returns:   status of the last command run (the file creation or the
#            debug dialog).
test_file() {
  if [[ ! -e "$1" ]]; then
    echo_dialog_v "test_file: \n can not find $1"
    # Quoted so that a path with spaces is not an "ambiguous redirect".
    echo "" > "$1"
  else
    echo_dialog_v "test_file: \n Touch  $1 ! \n Its content is \n $(cat "$1")"
  fi
}
40
# Abort the whole script unless it is running with UID 0.
# The explanation is written to stderr as well as to the debug dialog, because
# the debug dialog is silent unless VERB is 1.
# Globals:  UID (read), LOGNAME (read)
# Returns:  0 when running as root; otherwise exits the shell with status 1.
check_if_root() {
  local script_name
  if [[ "$UID" -ne 0 ]]; then
    script_name=$(basename -- "$0")
    echo_dialog_v "Hi [$LOGNAME] !! "
    echo_dialog_v "You need to run this script \"$script_name\" as root."
    printf 'You need to run this script "%s" as root.\n' "$script_name" >&2
    exit 1
  fi
}
48
# Turn the answers collected in /tmp/n.{urls,robot,crawler,tomcat}.tmp into
# the saved settings ~/.nutchez/sav/n.*.txt, then remove every /tmp/n.*.tmp.
# The saved settings are only deleted once all four temp files are known to
# exist, so an incomplete run cannot wipe the previous defaults.
# Returns:  0 on success, 1 when a temp file is missing or cannot be moved.
promote_tempfile() {
  local key
  local status=0

  echo_dialog_v "7. chang tmp as txt"

  for key in urls robot crawler tomcat; do
    if [[ ! -e "/tmp/n.${key}.tmp" ]]; then
      printf 'promote_tempfile: /tmp/n.%s.tmp is missing; saved settings left untouched\n' "$key" >&2
      return 1
    fi
  done

  rm -f ~/.nutchez/sav/n.*.txt
  for key in urls robot crawler tomcat; do
    mv "/tmp/n.${key}.tmp" ~/.nutchez/sav/"n.${key}.txt" || status=1
  done

  # Also clears any other n.*.tmp left in /tmp.
  rm -f /tmp/n.*.tmp
  return "$status"
}
58
# Throw away the answers collected so far by removing every /tmp/n.*.tmp.
# Returns:  status of rm.
clean_tempfile() {
  echo_dialog_v "7. delete tmp"
  rm -f -- /tmp/n.*.tmp
}
63
# Prepare ~/.nutchez for a crawl.
#  1. Rebuild ~/.nutchez/urls/urls.txt from the saved URL list.
#  2. When NOCONTINUE is 1, move an existing ~/.nutchez/search aside to
#     ~/.nutchez/search-<timestamp> and tell the user where it went.
#  3. Write $ROBOT into the <value> element on line 4 of
#     ~/.nutchez/conf/nutch-site.xml, if that file exists.
# Globals:  NOCONTINUE (read), ROBOT (read), DIALOG (read), DATE (written)
# Returns:  status of the last step that ran.
setup_nutchez() {
  local nz robot_escaped
  nz=~/.nutchez

  mkdir -p "$nz/urls"
  rm -f "$nz/urls/urls.txt"
  cp "$nz/sav/n.urls.txt" "$nz/urls/urls.txt"

  if [[ -n "$NOCONTINUE" && "$NOCONTINUE" -eq 1 ]]; then
    echo_dialog_v " delete the ~/.nutchez/search/*"
    # Only report a move that actually happened.
    if [[ -e "$nz/search" ]]; then
      DATE=$(date +%Y%m%d%H%M%S)
      if mv "$nz/search" "$nz/search-$DATE"; then
        $DIALOG --msgbox "上次搜尋的結果改放到 ~/.nutchez/search-$DATE " 0 0
      fi
    fi
  fi

  if [[ -e "$nz/conf/nutch-site.xml" ]]; then
    # Escape the characters that are special in a sed replacement so a robot
    # name containing "/", "&" or "\" cannot corrupt the expression.
    robot_escaped=$(printf '%s' "$ROBOT" | sed -e 's|[\\/&]|\\&|g')
    # Match any previous value: a name with non-alphanumeric characters must
    # stay replaceable on the next run.
    sed -i -e "4s/<value>[^<]*</<value>${robot_escaped}</" "$nz/conf/nutch-site.xml"
  fi
}
[66]91
# Install the bundled Tomcat into ~/.nutchez on first use and set its port.
# First use (no ~/.nutchez/tomcat yet): copy /opt/nutchez/tomcat, chown it to
# $LOGNAME, create ~/.nutchez/search and write that directory into the
# <value> element on line 8 of the web application's nutch-site.xml.
# Every call: set the Connector port on line 67 of conf/server.xml to $PORT.
# Globals:  LOGNAME (read), PORT (read), HOMEDIR (written on first use)
# Side effect: on first use the working directory is changed to ~.
# Returns:  1 when ~ cannot be entered or $PORT is not a number, otherwise
#           the status of the final sed.
install_tomcat() {
  local nz search_escaped
  nz=~/.nutchez

  if [[ ! -e "$nz/tomcat" ]]; then
    cp -rf /opt/nutchez/tomcat "$nz/"
    chown -R "$LOGNAME:$LOGNAME" "$nz/tomcat/"
    mkdir -p "$nz/search"

    cd ~ || return 1
    HOMEDIR=$(pwd)/.nutchez/search
    # "|" is the sed delimiter below; escape it together with "&" and "\".
    search_escaped=$(printf '%s' "$HOMEDIR" | sed -e 's/[\\|&]/\\&/g')
    sed -i -e "8s|<value>[^<]*<|<value>${search_escaped}<|" \
      "$nz/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"
  fi

  # Refuse to write an empty or non-numeric port into server.xml.
  if [[ ! "$PORT" =~ ^[0-9]+$ ]]; then
    printf 'install_tomcat: invalid port "%s"; server.xml left unchanged\n' "$PORT" >&2
    return 1
  fi
  sed -i -e "67s/<Connector port=\"[0-9]*\"/<Connector port=\"$PORT\"/" \
    "$nz/tomcat/conf/server.xml"
}
111
112
# Let the user edit the list of URLs to crawl (one URL per line).
# The saved list ~/.nutchez/sav/n.urls.txt is opened in an edit box; the
# dialog command's stderr, where the edited text is expected, is captured in
# /tmp/n.urls.tmp.
# Globals:  DIALOG (read; falls back to "dialog"), RET (written)
# Returns:  the dialog command's exit status (ok = 0, cancel = 1).
show_urls() {
  local check_status

  echo_dialog_v "2. show_urls !"
  test_file ~/.nutchez/sav/n.urls.txt
  # Capture the status right away; any later command would overwrite $?.
  check_status=$?
  echo_dialog_v "2.1 test_file ~/.nutchez/sav return : $check_status"

  ${DIALOG:-dialog} --title "請輸入你要抓取的網址(一行一個網址)" \
    --editbox ~/.nutchez/sav/n.urls.txt 16 51 2>/tmp/n.urls.tmp
  RET=$?

  echo_dialog_v "2.1 cat url: $(cat /tmp/n.urls.tmp)"
  return "$RET"
}
124
# Ask for the crawler's robot name.
# The input box is pre-filled with the saved name from
# ~/.nutchez/sav/n.robot.txt; the dialog command's stderr, where the answer
# is expected, is captured in /tmp/n.robot.tmp.
# Globals:  DIALOG (read; falls back to "dialog")
# Returns:  status of the closing debug dialog call.
setup_robot() {
  local saved_name

  test_file ~/.nutchez/sav/n.robot.txt
  echo_dialog_v "3. setup_robot"

  saved_name=$(cat ~/.nutchez/sav/n.robot.txt)
  ${DIALOG:-dialog} --title "設定機器人名稱" --nocancel \
    --inputbox " 這個爬網機器人,你要將他取名為:\n\n (ps: 這個設定只是禮貌性宣告,並不會對結果造成影響) \n" \
    16 55 "$saved_name" 2>/tmp/n.robot.tmp

  echo_dialog_v "3.1 cat robot : $(cat /tmp/n.robot.tmp)"
}
132
# Ask how deep every URL should be crawled.
# The input box is pre-filled with the saved depth from
# ~/.nutchez/sav/n.crawler.txt; the dialog command's stderr, where the answer
# is expected, is captured in /tmp/n.crawler.tmp.
# Globals:  DIALOG (read; falls back to "dialog")
# Returns:  status of the closing debug dialog call.
setup_crawler() {
  local saved_depth

  echo_dialog_v "4. setup_crawler"
  test_file ~/.nutchez/sav/n.crawler.txt

  saved_depth=$(cat ~/.nutchez/sav/n.crawler.txt)
  ${DIALOG:-dialog} --title "設定抓取深度" --nocancel \
    --inputbox " 對於每個網址,你需要NutchEz爬多深呢?\n\n (ps: 初次體驗建議將深度設為1來感受需要多久)  \n " \
    16 51 "$saved_depth" 2>/tmp/n.crawler.tmp

  # Show the depth that was just entered (this used to print the robot name).
  echo_dialog_v "4.1 cat crawler : $(cat /tmp/n.crawler.tmp)"
}
139
# Ask which port the web server should listen on.
# The input box is pre-filled with the saved port from
# ~/.nutchez/sav/n.tomcat.txt; the dialog command's stderr, where the answer
# is expected, is captured in /tmp/n.tomcat.tmp.
# Globals:  DIALOG (read; falls back to "dialog")
# Returns:  status of the closing debug dialog call.
setup_tomcat() {
  local saved_port

  echo_dialog_v "5. setup_tomcat"
  test_file ~/.nutchez/sav/n.tomcat.txt

  saved_port=$(cat ~/.nutchez/sav/n.tomcat.txt)
  ${DIALOG:-dialog} --title "設定網頁伺服器" --nocancel \
    --inputbox " 你希望NutchEz將網頁伺服器開在哪個port \n\n (ps: 請選擇一個沒用到的port以免造成衝突 \n 也請盡量不要設成80以免造成你誤以為是apache的混淆) \n " \
    16 51 "$saved_port" 2>/tmp/n.tomcat.tmp

  echo_dialog_v "5.1 cat tomcat : $(cat /tmp/n.tomcat.tmp)"
}
146
# Ask whether the results of the previous crawl should be cleared.
# Sets NOCONTINUE to 1 for "yes" (dialog status 0) and to 0 for "no"
# (status 1).  On status 255 only "ESC pressed." is printed and NOCONTINUE
# keeps whatever value it had.
# Globals:  DIALOG (read), NOCONTINUE (written)
# Returns:  status of the closing debug dialog call.
continue_previous() {
  echo_dialog_v "6. continue_previous"

  $DIALOG --title "清除上次搜尋" --clear \
    --yesno "你是否要清除上一次爬網所得的結果,\n否則將加入到URL列裡增加搜尋負擔 \n\n ps: 選no的話,會跑相當相當久,\n 請慎重考慮之\n" 16 51

  case $? in
    0)
      NOCONTINUE=1
      ;;
    1)
      NOCONTINUE=0
      ;;
    255)
      echo "ESC pressed."
      ;;
  esac

  # Report the variable that was actually set above.
  echo_dialog_v " 6continue = $NOCONTINUE"
}
162
# Show a summary of every answer and ask the user to confirm it.
# The summary is assembled in /tmp/n.finalcheck.tmp from the four answer
# files /tmp/n.{urls,robot,crawler,tomcat}.tmp plus the NOCONTINUE choice.
# The "\n" sequences in the texts are written literally (backslash + n).
# Note: an empty NOCONTINUE compares equal to 0 here, so it is listed as "NO".
# Globals:  DIALOG (read), NOCONTINUE (read), MSG (written), RET (written)
# Returns:  the dialog command's exit status.
final_confirm() {
  local tempfile=/tmp/n.finalcheck.tmp

  echo_dialog_v "7. final_confirm : start =0 , back =1 "

  # printf '%s\n' keeps the backslashes literal whatever echo's settings are.
  {
    printf '%s\n' " \n 1. 你所選擇要爬取的網址為 : \n "
    cat /tmp/n.urls.tmp
    printf '%s\n' " \n\n 2. 對於這個爬網機器人,你取名為 : \n"
    cat /tmp/n.robot.tmp
    printf '%s\n' " \n\n 3. 爬網的深度,你設定為 : \n "
    cat /tmp/n.crawler.tmp
    printf '%s\n' " \n\n 4. NutchEz將會把你的搜尋結果呈現在這個Port : \n "
    cat /tmp/n.tomcat.tmp
  } > "$tempfile"

  if [[ $NOCONTINUE -eq 0 ]]; then
    printf '%s\n' " \n\n 5. 是否要清除上一次的收尋結果 : \n " >> "$tempfile"
    echo_dialog_v " 7continue = $NOCONTINUE"
    printf '%s\n' "NO" >> "$tempfile"
  elif [[ $NOCONTINUE -eq 1 ]]; then
    printf '%s\n' " \n\n 5. 是否要清除上一次的收尋結果繼續搜尋 : \n " >> "$tempfile"
    echo_dialog_v " 7continue = $NOCONTINUE"
    printf '%s\n' "YES" >> "$tempfile"
  else
    echo_dialog_v " 無資料可匯入 "
  fi

  MSG=$(cat "$tempfile")
  echo_dialog_v "7.1 final message :\n $MSG"

  $DIALOG --title "請檢查你的選擇 ! \n\n 若所有的設定都是正確的,你可以按 \"ok\",\n 若你按了 \"reset\" 則會重頭開始設定, \n 若你選擇 \"exit\" 則會跳出NutchEz的設定選單 \n ps: reset 與 exit都不會把資料記成預設值,請放心使用 " --clear \
    --extra-button --extra-label "reset" --ok-label "ok" --cancel-label "exit" \
    --yesno "$MSG" 26 51
  RET=$?

  echo_dialog_v "final return = $RET"
  return "$RET"
}
196
[67]197# define parameters
198
# Load the saved settings from ~/.nutchez/sav into the global parameters.
# Globals:  ROBOT, URLS, DEPTH, PORT (written) - the content of
#           n.robot.txt, n.urls.txt, n.crawler.txt and n.tomcat.txt.
# Returns:  status of the last read.
set_nutchez_p() {
  local sav_dir
  sav_dir=~/.nutchez/sav

  ROBOT=$(cat "$sav_dir/n.robot.txt")
  URLS=$(cat "$sav_dir/n.urls.txt")
  DEPTH=$(cat "$sav_dir/n.crawler.txt")
  PORT=$(cat "$sav_dir/n.tomcat.txt")
}
# Prepare the environment and run the Nutch crawl.
# $DEPTH is checked first so that an empty or non-numeric depth stops the run
# before setup_nutchez and install_tomcat change anything.
# Globals:  DEPTH (read), NUTCH_CONF_DIR (read, debug output only)
# Returns:  1 when $DEPTH is not a number, otherwise the crawl's exit status.
start_crawl() {
  echo_dialog_v "7. start_crawl"

  if [[ ! "$DEPTH" =~ ^[0-9]+$ ]]; then
    printf 'start_crawl: crawl depth must be a number, got "%s"\n' "$DEPTH" >&2
    return 1
  fi

  setup_nutchez
  install_tomcat

  echo_dialog_v "/opt/nutchez/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH"
  echo_dialog_v "nutch conf dir = $NUTCH_CONF_DIR"
  /opt/nutchez/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search \
    -depth "$DEPTH" -topN 5000 -threads 1000
}
214
# (Re)start the user's Tomcat.
# Every process whose ps line mentions "catalina", "java" and "start" is
# killed with SIGKILL first, then ~/.nutchez/tomcat/bin/startup.sh is run and
# the function waits three seconds.
# Returns:  status of the final sleep.
start_tomcat() {
  local pid_tc startup
  startup=~/.nutchez/tomcat/bin/startup.sh

  echo_dialog_v "8. start_tomcat "
  echo_dialog_v "$startup"

  pid_tc=$(ps axw -eo pid,command \
    | grep "catalina" | grep "java" \
    | grep "start" | awk '{print $1}')

  if [[ -z "$pid_tc" ]]; then
    echo_dialog_v "no another tomcat is running"
  else
    echo_dialog_v "tomcat had been started and the pid is $pid_tc"
    echo_dialog_v "stop it first"
    # Test kill's status directly; "$?" is never an empty string.
    # $pid_tc is left unquoted on purpose: it may hold several PIDs.
    # shellcheck disable=SC2086
    if kill -9 $pid_tc; then
      echo_dialog_v " tomcat ($pid_tc) is  killed ..."
    else
      echo_dialog_v "kill error ..."
    fi
  fi

  echo "Starting Tomcat ...."
  "$startup"
  sleep 3
}
242
# Show the search page once the crawl is done.
# Runs firefox on http://localhost:$PORT when a firefox command is found.
# When there is none, or firefox exits with a non-zero status, a message box
# with the address to visit is shown instead.
# Globals:  PORT (read), DIALOG (read), FIREFOX (written), RET (written)
# Returns:  0, or the message box's status when it is shown.
show_report() {
  echo_dialog_v "9. show_report "

  # command -v is a shell builtin, so no external "which" is needed.
  FIREFOX=$(command -v firefox)
  RET=$?

  if [[ $RET -eq 0 ]]; then
    "$FIREFOX" -D 0.0 "http://localhost:$PORT"
    RET=$?
  fi

  if [[ $RET -ne 0 ]]; then
    $DIALOG --msgbox "恭喜你已經完成了! \n 你可以用瀏覽器瀏覽: \n  http://host_ip:$PORT" 0 0
  fi
}
Note: See TracBrowser for help on using the repository browser.