#!/bin/bash
# Author: WeiYu Chen
# License: GPL
# Description: Dialog-driven helper functions that make Nutch easy to use.
#
# Settings entered by the user are first written to n.<key>.tmp files in
# NUTCHEZ_TMP_DIR and, once confirmed, promoted to ~/.nutchez/sav/n.<key>.txt,
# where they serve as the defaults for the next run.
# Keys used below: url, robot, crawler (crawl depth), tomcat (port).

# Load the Hadoop environment: system-wide copy first, then the copy under
# $NUTCHEZ_SCRIPT_PATH.
# NOTE(review): the flattened original read
#   "# . . /etc/nutch/hadoop-env.sh || . $NUTCHEZ_SCRIPT_PATH/conf/hadoop-env.sh"
# so it is unclear whether this line was active or commented out. It is
# sourced here only when readable -- confirm against version control.
if [[ -r /etc/nutch/hadoop-env.sh ]]; then
  . /etc/nutch/hadoop-env.sh
elif [[ -r "${NUTCHEZ_SCRIPT_PATH}/conf/hadoop-env.sh" ]]; then
  . "${NUTCHEZ_SCRIPT_PATH}/conf/hadoop-env.sh"
fi

# Dialog program to use; callers may pre-set DIALOG to override it.
: "${DIALOG=dialog}"

# Directory holding the n.<key>.tmp scratch files (historically /tmp).
: "${NUTCHEZ_TMP_DIR:=/tmp}"

# display more for debug
VERB=0

#######################################
# Show a debug message box, but only when VERB is 1.
# Globals:   VERB (read), DIALOG (read)
# Arguments: $1 - message text ("\n" is passed through literally to dialog)
#######################################
echo_vb() {
  if [[ "$VERB" -eq 1 ]]; then
    # $DIALOG is intentionally unquoted so it may carry extra options.
    $DIALOG --msgbox "$1" 16 51
  fi
}

#######################################
# Make sure a settings file exists, creating it (containing a single empty
# line) when it is missing. The parent directory is created as needed.
# Arguments: $1 - path of the file
#######################################
test_file() {
  local path=$1
  if [[ ! -e "$path" ]]; then
    echo_vb "test_file: \n can not find $path"
    mkdir -p -- "$(dirname -- "$path")"
    echo "" > "$path"
  else
    echo_vb "test_file: \n Touch $path ! \n Its content is \n $(cat -- "$path")"
  fi
}

#######################################
# Exit the script with status 1 unless it is running as root (UID 0).
# The explanation is shown through echo_vb, i.e. only in debug mode.
#######################################
check_if_root() {
  if [[ "$UID" != "0" ]]; then
    echo_vb "Hi [$LOGNAME] !! "
    echo_vb "You need to run this script \"$(basename -- "$0")\" as root."
    exit 1
  fi
}

#######################################
# Promote the confirmed scratch values to the saved settings:
# NUTCHEZ_TMP_DIR/n.<key>.tmp becomes ~/.nutchez/sav/n.<key>.txt.
# The original moved the files without renaming them, so the .txt files
# read back by the setup_* prompts never received the new values.
# Globals:   NUTCHEZ_TMP_DIR (read)
#######################################
promote_tempfile() {
  local sav key
  sav=~/.nutchez/sav
  echo_vb "7. chang tmp as txt"
  mkdir -p -- "$sav"
  rm -f -- "$sav"/n.*.txt
  for key in url robot crawler tomcat; do
    mv -- "$NUTCHEZ_TMP_DIR/n.$key.tmp" "$sav/n.$key.txt"
  done
}

#######################################
# Discard all n.*.tmp scratch files.
# Globals:   NUTCHEZ_TMP_DIR (read)
#######################################
clean_tempfile() {
  echo_vb "7. delete tmp"
  # -f: not an error when there is nothing to delete.
  rm -f -- "$NUTCHEZ_TMP_DIR"/n.*.tmp
}

#######################################
# Create ~/.nutchez from /etc/nutch on first use and hand it to $LOGNAME.
# Does nothing when ~/.nutchez already exists.
#######################################
init_nutchez() {
  if [[ ! -e ~/.nutchez ]]; then
    # The target must exist before several sources can be copied into it.
    mkdir -p ~/.nutchez
    # copy from /etc/nutch
    cp -rf /etc/nutch/* ~/.nutchez
    chown -R "$LOGNAME:$LOGNAME" ~/.nutchez
  fi
}

#######################################
# Install the saved URL list as ~/.nutchez/urls/urls.txt.
# Reads sav/n.url.txt (the name every other function uses); falls back to
# the legacy sav/n.urls.txt the original referenced, if only that exists.
#######################################
setup_nutchez() {
  local base src
  base=~/.nutchez
  src="$base/sav/n.url.txt"
  if [[ ! -e "$src" && -e "$base/sav/n.urls.txt" ]]; then
    src="$base/sav/n.urls.txt"
  fi

  # make url list dir
  mkdir -p -- "$base/urls"
  rm -f -- "$base/urls/urls.txt"
  cp -- "$src" "$base/urls/urls.txt"

  if [[ -e "$base/nutch-site.xml" ]]; then
    # set nutch-site.xml
    # NOTE(review): the sed command that belonged here is corrupted in the
    # source (see the block below) and has NOT been reconstructed.
    :
  fi
}

# ---------------------------------------------------------------------------
# NOTE(review): SOURCE TEXT LOST HERE.
# From this point the original file is corrupted (text between angle brackets
# appears to have been stripped). The surviving fragment, verbatim, is:
#
#   sed -ie "s/>user$ROBOT$PORT/tmp/n.url.tmp
#   RET=$?
#   echo_vb "2.1 cat url: `cat /tmp/n.url.tmp`"
#   return $RET
#   }
#
# The gap covers the rest of setup_nutchez's sed expression, presumably the
# whole of install_tomcat (called by start_crawl, defined nowhere else in this
# file), and the head of the function that prompts for the URL list and writes
# /tmp/n.url.tmp. Restore these definitions from version control; they are
# deliberately not guessed at here.
# ---------------------------------------------------------------------------

#######################################
# Ask for one setting in an input box, pre-filled with the saved value.
# Globals:   DIALOG, NUTCHEZ_TMP_DIR (read)
# Arguments: $1 - setting key (robot, crawler, tomcat, ...)
#            $2 - prompt text
# Outputs:   the entered text lands in NUTCHEZ_TMP_DIR/n.<key>.tmp, because
#            dialog reports input-box results on stderr
#######################################
_nutchez_prompt() {
  local key=$1 prompt=$2 saved
  saved=~/.nutchez/sav/n.$key.txt
  test_file "$saved"
  $DIALOG --nocancel --inputbox "$prompt" 16 51 "$(cat -- "$saved")" \
    2> "$NUTCHEZ_TMP_DIR/n.$key.tmp"
}

#######################################
# Prompt for the crawler's agent (robot) name.
#######################################
setup_robot() {
  echo_vb "3. setup_robot"
  _nutchez_prompt robot " This agent name \n"
  echo_vb "3.1 cat robot : $(cat -- "$NUTCHEZ_TMP_DIR/n.robot.tmp")"
}

#######################################
# Prompt for the crawl depth.
#######################################
setup_crawler() {
  echo_vb "4. setup_crawler"
  _nutchez_prompt crawler " Depth \n "
  # The original echoed the robot file here; show the value just entered.
  echo_vb "4.1 cat crawler : $(cat -- "$NUTCHEZ_TMP_DIR/n.crawler.tmp")"
}

#######################################
# Prompt for the port of the search web interface.
#######################################
setup_tomcat() {
  echo_vb "5. setup_tomcat"
  _nutchez_prompt tomcat " explorer port \n "
  echo_vb "5.1 cat tomcat : $(cat -- "$NUTCHEZ_TMP_DIR/n.tomcat.tmp")"
}

#######################################
# Show every pending setting and ask the user to confirm.
# Globals:   MSG, RET (written); DIALOG, NUTCHEZ_TMP_DIR (read)
# Returns:   exit status of the yes/no dialog (start = 0, back = 1)
#######################################
final_confirm() {
  echo_vb "6. final_confirm : start =0 , back =1 "
  # Build the summary directly in a variable; the original went through a
  # scratch file that was never removed. "\n" stays literal for dialog.
  MSG=$(
    printf '%s\n' ' \n 1. The url list is : \n '
    cat -- "$NUTCHEZ_TMP_DIR/n.url.tmp"
    printf '%s\n' ' \n 2. The robot name is : \n'
    cat -- "$NUTCHEZ_TMP_DIR/n.robot.tmp"
    printf '%s\n' ' \n 3. The crawled depth is : \n '
    cat -- "$NUTCHEZ_TMP_DIR/n.crawler.tmp"
    printf '%s\n' ' \n 4. The explorer port is : \n '
    cat -- "$NUTCHEZ_TMP_DIR/n.tomcat.tmp"
  )
  echo_vb "6.1 final message :\n $MSG"
  $DIALOG --title "Check It !!" --clear \
    --yesno "$MSG" 16 51
  RET=$?
  echo_vb "final return = $RET"
  return "$RET"
}

#######################################
# Load the saved settings into globals, apply them, and print the crawl
# command (the actual crawl invocation is still disabled).
# Globals:   ROBOT, URLS, DEPTH, PORT (written)
# Outputs:   the nutch crawl command line on stdout
#######################################
start_crawl() {
  local sav
  sav=~/.nutchez/sav
  # define paramaters -- each from its own file; the original ran cat on the
  # sav directory itself, which cannot work.
  ROBOT=$(cat -- "$sav/n.robot.txt")
  URLS=$(cat -- "$sav/n.url.txt")
  DEPTH=$(cat -- "$sav/n.crawler.txt")
  PORT=$(cat -- "$sav/n.tomcat.txt")
  echo_vb "7. start_crawl"
  # The original called setup_nutch, which this file does not define; use it
  # if some other file provides it, otherwise the local setup_nutchez.
  if declare -F setup_nutch > /dev/null; then
    setup_nutch
  else
    setup_nutchez
  fi
  install_tomcat
  # /opt/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH
  echo "/opt/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH"
}

#######################################
# Start the Tomcat instance shipped under /opt/nutch.
#######################################
start_tomcat() {
  echo_vb "8. start_tomcat "
  /opt/nutch/tomcat/bin/startup.sh
}

#######################################
# Open the search page in Firefox when available; otherwise show its URL
# in a message box.
# Globals:   PORT (read; 8080 when empty), FIREFOX, RET (written)
#######################################
show_report() {
  local url="http://localhost:${PORT:-8080}"
  echo_vb "9. show_report "
  FIREFOX=$(command -v firefox)
  RET=$?
  # The original tested the literal word "RET", so this branch never ran.
  if [[ "$RET" -eq 0 ]]; then
    "$FIREFOX" -D 0.0 "$url"
  else
    $DIALOG --msgbox "Congratulations! \n you can explore the url: \n $url" 0 0
  fi
}