#!/bin/bash
# Author: WeiYu Chen
# License: GPL
# Description: Easy-to-use helper functions for Nutch
#
# The functions below drive a dialog-based wizard: each setup_* step stores
# the user's answer in /tmp/n.NAME.tmp, promote_tempfile moves the answers to
# ~/.nutchez/sav/n.NAME.txt, and set_nutchez_p loads them into shell globals.
#
# NOTE(review): $DIALOG is expanded unquoted on purpose so that it may carry
# extra options (e.g. DIALOG="dialog --ascii-lines") -- confirm with callers.

: "${DIALOG=dialog}"

# set 1 to display more for debug,
VERB=0

# Create ~/.nutchez from /etc/nutch on first use.
init_nutchez() {
  if ! [ -e ~/.nutchez ]; then
    # copy from /etc/nutch
    mkdir ~/.nutchez
    cp -rf /etc/nutch/* ~/.nutchez
    # -p: /etc/nutch may already ship a log directory
    mkdir -p ~/.nutchez/log
    chown -R "$LOGNAME:$LOGNAME" ~/.nutchez
  fi
  # export NUTCH_CONF_DIR=~/.nutchez
  # export HADOOP_CONF_DIR=~/.nutchez
  # export HADOOP_LOG_DIR=~/.nutchez/log
  # . ~/.nutchez/hadoop-env.sh || . /etc/nutch/hadoop-env.sh
}

# Show a debug message box, but only when VERB is 1.
# Arguments: $1 - message text (dialog interprets the literal "\n" sequences)
echo_vb() {
  if [ "${VERB:-0}" -eq 1 ]; then
    $DIALOG --msgbox "$1" 16 51
  fi
}

# Make sure the file $1 exists, creating it (with a single newline) if needed.
# Arguments: $1 - path of the file
test_file() {
  if ! test -e "$1"; then
    echo_vb "test_file: \n can not find $1"
    # The parent directory (e.g. ~/.nutchez/sav) is not created anywhere in
    # the visible code, so create it here before writing.
    mkdir -p -- "$(dirname -- "$1")"
    echo "" > "$1"
  else
    echo_vb "test_file: \n Touch $1 ! \n Its content is \n $(cat "$1")"
  fi
}

# Exit with status 1 unless the script runs as root.
check_if_root() {
  if [ ! "$UID" = "0" ]; then
    echo_vb "Hi [$LOGNAME] !! "
    echo_vb "You need to run this script \"$(basename "$0")\" as root."
    # echo_vb is silent when VERB=0; do not exit without any explanation.
    printf 'You need to run this script "%s" as root.\n' "$(basename "$0")" >&2
    exit 1
  fi
}

# Move the answers collected in /tmp/n.*.tmp to ~/.nutchez/sav/n.*.txt.
promote_tempfile() {
  local item
  echo_vb "7. chang tmp as txt"
  mkdir -p ~/.nutchez/sav
  # -f: on a first run there are no saved answers to remove
  rm -f ~/.nutchez/sav/n.*.txt
  for item in urls robot crawler tomcat; do
    mv "/tmp/n.${item}.tmp" ~/.nutchez/sav/"n.${item}.txt"
  done
}

# Discard the answers collected in /tmp/n.*.tmp.
clean_tempfile() {
  echo_vb "7. delete tmp"
  # -f: do not complain when no temporary file exists
  rm -f /tmp/n.*.tmp
}

# Install the saved URL list as ~/.nutchez/urls/urls.txt.
setup_nutchez() {
  # make url list dir
  mkdir -p ~/.nutchez/urls
  rm -f ~/.nutchez/urls/urls.txt
  cp ~/.nutchez/sav/n.urls.txt ~/.nutchez/urls/urls.txt

  if [ -e ~/.nutchez/nutch-site.xml ]; then
    # set nutch-site.xml
    # FIXME(review): the source is truncated at this point. Only the start of
    # the command survives; it edits line 4 and involves $ROBOT:
    #   sed -i -e "4s/[a-zA-Z0-9]*$ROBOT
    # Restore the full sed expression (and anything that followed it in this
    # function) from the upstream script.
    :
  fi
}

# FIXME(review): the beginning of the next function (wizard step 2, which
# writes /tmp/n.urls.tmp) is lost together with everything between it and
# setup_nutchez. Only its tail survives, reproduced here verbatim:
#     ... /tmp/n.urls.tmp
#     RET=$?
#     echo_vb "2.1 cat url: `cat /tmp/n.urls.tmp`"
#     return $RET
#   }
# Restore it from the upstream script; its name is not visible here.

# Ask for the agent name; the answer goes to /tmp/n.robot.tmp.
setup_robot() {
  test_file ~/.nutchez/sav/n.robot.txt
  echo_vb "3. setup_robot"
  # dialog
  $DIALOG --nocancel --inputbox " This agent name \n" 16 51 \
    "$(cat ~/.nutchez/sav/n.robot.txt)" 2>/tmp/n.robot.tmp
  echo_vb "3.1 cat robot : $(cat /tmp/n.robot.tmp)"
}

# Ask for the crawl depth; the answer goes to /tmp/n.crawler.tmp.
setup_crawler() {
  echo_vb "4. setup_crawler"
  test_file ~/.nutchez/sav/n.crawler.txt
  $DIALOG --nocancel --inputbox " Depth \n " 16 51 \
    "$(cat ~/.nutchez/sav/n.crawler.txt)" 2>/tmp/n.crawler.tmp
  # Report the crawler answer (the message used to print the robot file).
  echo_vb "4.1 cat crawler : $(cat /tmp/n.crawler.tmp)"
}

# Ask for the port; the answer goes to /tmp/n.tomcat.tmp.
setup_tomcat() {
  echo_vb "5. setup_tomcat"
  test_file ~/.nutchez/sav/n.tomcat.txt
  $DIALOG --nocancel --inputbox " explorer port \n " 16 51 \
    "$(cat ~/.nutchez/sav/n.tomcat.txt)" 2>/tmp/n.tomcat.tmp
  echo_vb "5.1 cat tomcat : $(cat /tmp/n.tomcat.tmp)"
}

# Show a summary of all collected answers and ask for confirmation.
# Globals: tempfile, MSG, RET (written; left global as in the original)
# Returns: the exit status of the dialog call (buttons: ok / exit / reset)
final_confirm() {
  echo_vb "6. final_confirm : start =0 , back =1 "
  tempfile=/tmp/n.finalcheck.tmp
  {
    echo " \n 1. The url list is : \n "
    cat /tmp/n.urls.tmp
    echo " \n 2. The robot name is : \n"
    cat /tmp/n.robot.tmp
    echo " \n 3. The crawled depth is : \n "
    cat /tmp/n.crawler.tmp
    echo " \n 4. The explorer port is : \n "
    cat /tmp/n.tomcat.tmp
  } > "$tempfile"
  MSG=$(cat "$tempfile")
  echo_vb "6.1 final message :\n $MSG"
  #read READ
  $DIALOG --title "Check It !!" --clear \
    --extra-button --extra-label "reset" --ok-label "ok" --cancel-label "exit" \
    --yesno "$MSG" 26 51
  RET=$?
  echo_vb "final return = $RET"
  return $RET
}

# define paramaters
# Load the saved answers into the globals ROBOT, URLS, DEPTH and PORT.
set_nutchez_p() {
  ROBOT=$(cat ~/.nutchez/sav/n.robot.txt)
  URLS=$(cat ~/.nutchez/sav/n.urls.txt)
  DEPTH=$(cat ~/.nutchez/sav/n.crawler.txt)
  PORT=$(cat ~/.nutchez/sav/n.tomcat.txt)
}

# Prepare the configuration and run the Nutch crawl.
# Globals: DEPTH (read) -- presumably set by set_nutchez_p beforehand; verify
#          against the caller.
start_crawl() {
  echo_vb "7. start_crawl"
  setup_nutchez
  # NOTE(review): install_tomcat is not defined in the visible part of this
  # file; it is probably part of the truncated span.
  install_tomcat
  echo_vb "/opt/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH"
  echo_vb "nutch conf dir = $NUTCH_CONF_DIR"
  /opt/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth "$DEPTH"
}

# Link the crawl result into /tmp and restart the Tomcat under ~/.nutchez.
start_tomcat() {
  echo_vb "8. start_tomcat "
  echo_vb "/opt/nutch/tomcat/bin/startup.sh"
  if [ -e /tmp/search ]; then
    rm -rf /tmp/search
  fi
  ln -sf ~/.nutchez/search/ /tmp/
  ~/.nutchez/tomcat/bin/shutdown.sh
  ~/.nutchez/tomcat/bin/startup.sh
  sleep 3
}

# Open the result page in firefox; if firefox is missing or fails, show the
# URL in a message box instead.
# Globals: PORT (read); FIREFOX, RET (written; left global as in the original)
show_report() {
  echo_vb "9. show_report "
  FIREFOX=$(command -v firefox)
  RET=$?
  if [ "$RET" -eq 0 ]; then
    "$FIREFOX" -D 0.0 "http://localhost:$PORT"
    RET=$?
  fi
  if [ "$RET" -ne 0 ]; then
    $DIALOG --msgbox "Congratulations! \n you can explore the url: \n http://localhost:$PORT" 0 0
  fi
}