#!/bin/bash
# Author: WeiYu Chen <waue _at_ nchc org tw>
# License: GPL
# Description: Easy-to-use setup helpers for Nutch
# .


# Command the helpers below run (as $DIALOG) to show dialog boxes.
# Disabled variant that would also pass a --backtitle banner:
#DIALOG='dialog --backtitle "     NutchEz Setup Menu           -- powered by NCHC "'
DIALOG=dialog

# Verbosity switch: echo_dialog_v only shows its debug message boxes
# when this is set to 1.
VERB=0

init_nutchez () {
  # First-run bootstrap: create the per-user ~/.nutchez tree (with a log
  # directory) and seed it with the conf and sav directories from
  # /etc/nutch. Nothing is touched when ~/.nutchez already exists.
  local nutchez_home seed
  nutchez_home=~/.nutchez

  [ -e "$nutchez_home" ] && return 0

  mkdir "$nutchez_home" "$nutchez_home/log"
  for seed in conf sav; do
    cp -rf "/etc/nutch/$seed" "$nutchez_home/"
  done
  # Hand the freshly created tree to the invoking user.
  chown -R $LOGNAME:$LOGNAME "$nutchez_home"
}

echo_dialog_v () {
  # Debug helper: show message $1 in a 16x51 message box, but only in
  # verbose mode (VERB equal to 1); otherwise do nothing and succeed.
  [[ $VERB -eq 1 ]] || return 0
  $DIALOG --msgbox "$1" 16 51
}

test_file () {
  # Make sure the settings file $1 exists so later reads have something
  # to work on.
  #   $1 - path of the file to check
  # A missing file is created holding a single newline; an existing file
  # is left untouched. Either way a debug box is shown in verbose mode.
  # The path is quoted everywhere so a name containing spaces or glob
  # characters is treated as one file.
  if ! test -e "$1" ; then
    echo_dialog_v "test_file: \n can not find $1"
    echo "" > "$1"
  else
    echo_dialog_v "test_file: \n Touch  $1 ! \n Its content is \n $(cat "$1")"
  fi
}

check_if_root() {
   # Abort the whole script with exit status 1 unless it runs as root
   # (UID 0). In verbose mode the two debug boxes are shown first; the
   # explanation is also written to stderr so it is visible when the
   # debug boxes are disabled.
   if (( UID != 0 )); then
     echo_dialog_v "Hi [$LOGNAME] !! "
     echo_dialog_v "You need to run this script \"${0##*/}\" as root."
     printf 'You need to run this script "%s" as root.\n' "${0##*/}" >&2
     exit 1
   fi
}

promote_tempfile () {
  # Accept the pending answers: drop every saved n.*.txt, move the four
  # /tmp/n.<item>.tmp files into ~/.nutchez/sav as n.<item>.txt, then
  # remove whatever n.*.tmp files are still left in /tmp.
  local item

  echo_dialog_v "7. chang tmp as txt"
  rm -f ~/.nutchez/sav/n.*.txt
  for item in urls robot crawler tomcat; do
    mv "/tmp/n.${item}.tmp" ~/.nutchez/sav/"n.${item}.txt"
  done
  rm -f /tmp/n.*.tmp
}

clean_tempfile () {
  # Discard the pending answers: remove every /tmp/n.*.tmp file.
  echo_dialog_v "7. delete tmp"
  rm -f -- /tmp/n.*.tmp
}

setup_nutchez () {
  # Prepare ~/.nutchez for a crawl from the saved settings.
  # Globals read:    NOCONTINUE, ROBOT, DIALOG
  # Globals written: DATE (timestamp used to name the archived search dir)
  # Steps:
  #   1. refresh ~/.nutchez/urls/urls.txt from ~/.nutchez/sav/n.urls.txt
  #   2. when NOCONTINUE is 1, move an existing ~/.nutchez/search aside to
  #      ~/.nutchez/search-<timestamp> and tell the user where it went
  #   3. write $ROBOT into the <value> element on line 4 of
  #      ~/.nutchez/conf/nutch-site.xml
  local robot_escaped

  if ! [ -e ~/.nutchez/urls ] ; then
    # make url list dir
    mkdir ~/.nutchez/urls
  fi

  if  [ -e ~/.nutchez/urls/urls.txt ] ; then
    rm ~/.nutchez/urls/urls.txt
  fi
  cp ~/.nutchez/sav/n.urls.txt ~/.nutchez/urls/urls.txt

  if [[ -n "$NOCONTINUE" && "$NOCONTINUE" -eq 1 ]]; then
    echo_dialog_v " delete the ~/.nutchez/search/*"
    DATE=$(date +%Y%m%d%H%M%S)
    # Only archive (and announce it) when there is a previous result;
    # otherwise mv would fail and the message box would be misleading.
    if [ -e ~/.nutchez/search ]; then
      mv ~/.nutchez/search ~/.nutchez/search-"$DATE"
      $DIALOG --msgbox "上次搜尋的結果改放到 ~/.nutchez/search-$DATE " 0 0
    fi
  fi

  if [ -e ~/.nutchez/conf/nutch-site.xml ] ; then
    # Escape the characters that are special in a sed replacement
    # (backslash, the "/" delimiter and "&") so any robot name is written
    # literally. Matching [^<]* instead of only alphanumerics lets a
    # previously stored name with other characters be replaced as well.
    robot_escaped=$(printf '%s' "$ROBOT" | sed -e 's|[\\/&]|\\&|g')
    sed -i -e "4s/<value>[^<]*</<value>${robot_escaped}</" ~/.nutchez/conf/nutch-site.xml
  fi
}

install_tomcat (){
  # Give the user a private Tomcat under ~/.nutchez/tomcat (copied from
  # /opt/nutchez/tomcat when it is not there yet) and set its connector
  # port to $PORT.
  # Globals read: LOGNAME, PORT. Globals written: HOMEDIR.
  # Side effect: the first-time branch changes the working directory to ~.
  if [ ! -e ~/.nutchez/tomcat ]; then
    # first run: copy the bundled tomcat and hand it to the user
    cp -rf /opt/nutchez/tomcat ~/.nutchez/
    chown -R $LOGNAME:$LOGNAME ~/.nutchez/tomcat/

    # make sure the search directory exists
    [ -e ~/.nutchez/search ] || mkdir ~/.nutchez/search

    # store the absolute search dir in line 8 of the webapp nutch-site.xml
    cd ~
    HOMEDIR=$(pwd)/.nutchez/search
    sed -i -e "8s|<value>[a-zA-Z0-9/]*<|<value>$HOMEDIR<|" \
      ~/.nutchez/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml
  fi

  # every run: rewrite the connector port on line 67 of server.xml
  sed -i -e "67s/<Connector port=\"[0-9]*\"/<Connector port=\"$PORT\"/" \
    ~/.nutchez/tomcat/conf/server.xml
}


show_urls (){
  # Let the user edit the saved URL list (one URL per line) in a dialog
  # edit box; what dialog writes to stderr is captured in /tmp/n.urls.tmp.
  # Returns the dialog exit status (ok = 0, cancel = 1), also left in RET.
  local saved_urls edited_urls
  saved_urls=~/.nutchez/sav/n.urls.txt
  edited_urls=/tmp/n.urls.tmp

  echo_dialog_v "2. show_urls !"
  test_file  "$saved_urls"
  echo_dialog_v "2.1 test_file ~/.nutchez/sav return : $?"

  # dialog begin
  dialog --title "請輸入你要抓取的網址（一行一個網址）" \
    --editbox "$saved_urls" 16 51 2>"$edited_urls"
  RET=$?
  echo_dialog_v "2.1 cat url: $(cat "$edited_urls")"
  return $RET
}

setup_robot () {
  # Ask for the crawler robot's name in an input box pre-filled with the
  # saved value from ~/.nutchez/sav/n.robot.txt; what dialog writes to
  # stderr is captured in /tmp/n.robot.tmp.
  local saved_name answer_file current_name
  saved_name=~/.nutchez/sav/n.robot.txt
  answer_file=/tmp/n.robot.tmp

  test_file "$saved_name"
  echo_dialog_v "3. setup_robot"
  # dialog
  current_name=$(cat "$saved_name")
  dialog --title "設定機器人名稱" --nocancel \
    --inputbox " 這個爬網機器人，你要將他取名為：\n\n (ps: 這個設定只是禮貌性宣告，並不會對結果造成影響) \n" \
    16 55 "$current_name" 2>"$answer_file"
  echo_dialog_v "3.1 cat robot : $(cat "$answer_file")"
}

setup_crawler () {
  # Ask for the crawl depth in an input box pre-filled with the saved
  # value from ~/.nutchez/sav/n.crawler.txt; what dialog writes to stderr
  # is captured in /tmp/n.crawler.tmp.
  echo_dialog_v "4. setup_crawler"
  test_file ~/.nutchez/sav/n.crawler.txt
  dialog --title "設定抓取深度" --nocancel --inputbox " 對於每個網址，你需要NutchEz爬多深呢？\n\n (ps: 初次體驗建議將深度設為1來感受需要多久)  \n " 16 51 "$(cat ~/.nutchez/sav/n.crawler.txt)" 2>/tmp/n.crawler.tmp
  # Debug box reports the depth captured by this step, i.e. the crawler
  # temp file (not the robot one).
  echo_dialog_v "4.1 cat crawler : $(cat /tmp/n.crawler.tmp)"
}

setup_tomcat () {
  # Ask which port the web server should listen on, in an input box
  # pre-filled with the saved value from ~/.nutchez/sav/n.tomcat.txt;
  # what dialog writes to stderr is captured in /tmp/n.tomcat.tmp.
  local saved_port answer_file current_port
  saved_port=~/.nutchez/sav/n.tomcat.txt
  answer_file=/tmp/n.tomcat.tmp

  echo_dialog_v "5. setup_tomcat"
  test_file "$saved_port"
  current_port=$(cat "$saved_port")
  dialog --title "設定網頁伺服器" --nocancel \
    --inputbox " 你希望NutchEz將網頁伺服器開在哪個port \n\n (ps: 請選擇一個沒用到的port以免造成衝突 \n 也請盡量不要設成80以免造成你誤以為是apache的混淆) \n " \
    16 51 "$current_port" 2>"$answer_file"
  echo_dialog_v "5.1 cat tomcat : $(cat "$answer_file")"
}

continue_previous () {
  # Ask (yes/no box) whether the result of the previous crawl should be
  # cleared, and record the answer in the global NOCONTINUE:
  #   dialog status 0   (yes) -> NOCONTINUE=1
  #   dialog status 1   (no)  -> NOCONTINUE=0
  #   dialog status 255 (ESC) -> NOCONTINUE left unchanged, a note is printed
  echo_dialog_v "6. continue_previous"
  $DIALOG --title "清除上次搜尋" --clear \
          --yesno "你是否要清除上一次爬網所得的結果，\n否則將加入到URL列裡增加搜尋負擔 \n\n ps: 選no的話，會跑相當相當久，\n 請慎重考慮之\n" 16 51

  case $? in
    0)
      NOCONTINUE=1;;
    1)
      NOCONTINUE=0;;
    255)
      echo "ESC pressed.";;
  esac
  # Report the variable that was actually set above.
  echo_dialog_v " 6continue = $NOCONTINUE"
}

final_confirm () {
  # Build a summary of the pending answers (the /tmp/n.*.tmp files plus
  # NOCONTINUE) in /tmp/n.finalcheck.tmp and show it in a yes/no box with
  # an extra "reset" button.
  # Globals read:    NOCONTINUE, DIALOG
  # Globals written: tempfile, MSG, RET
  # Returns the exit status of the dialog command (also stored in RET).
  echo_dialog_v "7. final_confirm : start =0 , back =1 "
  tempfile=/tmp/n.finalcheck.tmp

  # Plain echo is used on purpose: the literal "\n" sequences are written
  # to the file as-is (presumably for dialog to render as line breaks).
  echo " \n 1. 你所選擇要爬取的網址為 : \n " > "$tempfile"
  cat /tmp/n.urls.tmp >> "$tempfile"
  echo " \n\n 2. 對於這個爬網機器人，你取名為 : \n" >> "$tempfile"
  cat /tmp/n.robot.tmp >> "$tempfile"
  echo " \n\n 3. 爬網的深度，你設定為 : \n " >> "$tempfile"
  cat /tmp/n.crawler.tmp >> "$tempfile"
  echo " \n\n 4. NutchEz將會把你的搜尋結果呈現在這個Port : \n " >> "$tempfile"
  cat /tmp/n.tomcat.tmp >> "$tempfile"
  if [[ $NOCONTINUE -eq 0 ]];then
    echo " \n\n 5. 是否要清除上一次的收尋結果 : \n " >> "$tempfile"
    echo_dialog_v " 7continue = $NOCONTINUE"
    echo "NO" >> "$tempfile"
  elif [[ $NOCONTINUE -eq 1 ]];then
    echo " \n\n 5. 是否要清除上一次的收尋結果繼續搜尋 : \n " >> "$tempfile"
    echo_dialog_v " 7continue = $NOCONTINUE"
    echo "YES" >> "$tempfile"
  else
    echo_dialog_v " 無資料可匯入 "
  fi
  MSG=$(cat "$tempfile")
  echo_dialog_v "7.1 final message :\n $MSG"
  #read READ
  $DIALOG --title "請檢查你的選擇 ! \n\n 若所有的設定都是正確的，你可以按 \"ok\",\n 若你按了 \"reset\" 則會重頭開始設定, \n 若你選擇 \"exit\" 則會跳出NutchEz的設定選單 \n ps: reset 與 exit都不會把資料記成預設值，請放心使用 " --clear \
        --extra-button --extra-label "reset" --ok-label "ok" --cancel-label "exit" \
        --yesno "$MSG" 26 51
  RET=$?
  echo_dialog_v "final return = $RET"  
  return $RET
}

# define parameters

set_nutchez_p () {
  # Load the four saved answers from ~/.nutchez/sav into the globals
  # ROBOT, URLS, DEPTH and PORT.
  local sav_dir
  sav_dir=~/.nutchez/sav

  ROBOT=$(cat "$sav_dir/n.robot.txt")
  URLS=$(cat "$sav_dir/n.urls.txt")
  DEPTH=$(cat "$sav_dir/n.crawler.txt")
  PORT=$(cat "$sav_dir/n.tomcat.txt")
}
start_crawl () {
  # Run the crawl: prepare ~/.nutchez (setup_nutchez), make sure the
  # private Tomcat is in place (install_tomcat), then call the nutch
  # "crawl" command on ~/.nutchez/urls, writing into ~/.nutchez/search
  # with depth $DEPTH, -topN 5000 and -threads 1000.
  # Returns the exit status of the nutch command.
  local -a crawl_cmd

  echo_dialog_v "7. start_crawl"
  setup_nutchez
  install_tomcat
  echo_dialog_v "/opt/nutchez/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH"
  echo_dialog_v "nutch conf dir = $NUTCH_CONF_DIR"

  crawl_cmd=(
    /opt/nutchez/nutch/bin/nutch crawl ~/.nutchez/urls
    -dir ~/.nutchez/search
    -depth $DEPTH
    -topN 5000
    -threads 1000
  )
  "${crawl_cmd[@]}"
}

start_tomcat () {
  # (Re)start the user's private Tomcat.
  # Every process whose ps line contains "catalina", "java" and "start"
  # is killed with SIGKILL first; then ~/.nutchez/tomcat/bin/startup.sh is
  # run and the function sleeps 3 seconds.
  # Globals written: pid_tc (PIDs found, empty when none).
  echo_dialog_v "8. start_tomcat "
  echo_dialog_v "~/.nutchez/tomcat/bin/startup.sh"
  #if [ -e /tmp/search ];then
  #  rm -rf /tmp/search
  #fi
  #ln -sf ~/.nutchez/search/ /tmp/

  pid_tc=$(ps axw -eo pid,command |\
  grep "catalina" | grep "java" |\
  grep "start" | awk '{print $1}')
  if [ -z "$pid_tc" ]; then
      echo_dialog_v "no another tomcat is running"
  else
      echo_dialog_v "tomcat had been started and the pid is $pid_tc"
      echo_dialog_v "stop it first"
      # $pid_tc is left unquoted on purpose: it may hold several PIDs.
      # Branch on kill's own exit status ("$?" is never an empty string,
      # so testing it with -z always reported a failure).
      if kill -9 $pid_tc; then
	  echo_dialog_v " tomcat ($pid_tc) is  killed ..."
      else
	  echo_dialog_v "kill error ..." 
      fi
  fi
  echo "Starting Tomcat ...."
  ~/.nutchez/tomcat/bin/startup.sh
  sleep 3
}

show_report () {
  # Present the result: open http://localhost:$PORT in firefox when
  # `which` finds it; if firefox is missing or exits non-zero, show a
  # message box with the address to visit instead.
  # Globals read: PORT, DIALOG. Globals written: FIREFOX, RET.
  echo_dialog_v "9. show_report "

  FIREFOX=$(which firefox)
  RET=$?
  if [[ $RET -eq 0 ]]; then
    $FIREFOX -D 0.0 http://localhost:$PORT
    RET=$?
  fi

  # fall back to the message box unless the browser ran successfully
  [[ $RET -eq 0 ]] || $DIALOG --msgbox "恭喜你已經完成了! \n 你可以用瀏覽器瀏覽: \n  http://host_ip:$PORT" 0 0 
}
