Changeset 94
- Timestamp:
- Jul 8, 2009, 2:20:41 PM (15 years ago)
- Location:
- nutchez-0.1
- Files:
-
- 9 edited
Legend:
- Unmodified
- Added
- Removed
-
nutchez-0.1/Makefile
r90 r94 13 13 @echo "make deb - Build Debian Package." 14 14 @echo "make clean - Clean up Debian Package temparate files." 15 @echo "make source - download source tarball from hadoop mirror site." 16 15 @echo "make help - show Makefile options." 17 16 @echo " " -
nutchez-0.1/README
r89 r94 3 3 ************************************************ 4 4 5 NutchEz is developed by N CHCin Taiwan5 NutchEz is developed by National Center for High-Performance Computing (NCHC) in Taiwan 6 6 7 * How to install : 8 You should download the deb file and type the instruction at the same dir. 9 ============================ 10 sudo dpkg -i nutchez-*.deb 11 ============================ 7 * How to lunch nutchez (Type following Instruction) 8 ======================================== 9 nutchez 10 ======================================== 12 11 13 14 * How to lunch nutchez 15 ============ 16 nutchez 17 ============ 18 19 * How to remove nutchez 20 ===================== 12 * How to remove nutchez (Type following Instruction) 13 ======================================== 21 14 sudo dpkg -P nutchez 22 ===================== 15 ======================================== 23 16 24 17 * Where are the program and configuration files … … 27 20 2. Nutch conf dir is set on ~/.nutchez/conf 28 21 3. Tomcat home is installed on ~/.nutchez/tomcat 29 30 22 4. The final crawl results is located on ~/.nutchez/search 23 5. The nutchez log will be recorded on ~/.nutchez/log 24 6. The full path of this README is /etc/nutch/README -
nutchez-0.1/bin/nutchez
r91 r94 12 12 export JAVA_HOME=/usr/lib/jvm/java-6-sun 13 13 export NUTCH_HOME=/opt/nutch 14 # export HADOOP_HOME=/opt/nutch15 14 export NUTCH_CONF_DIR=~/.nutchez/conf 16 # export HADOOP_CONF_DIR=~/.nutchez/conf17 15 export NUTCH_LOG_DIR=~/.nutchez/log 18 16 17 . /opt/nutch/bin/nutchez-func.sh 18 19 init_nutchez 19 20 20 21 22 tfile=`tempfile 2>/dev/null` || tfile=/tmp/test$$ 23 trap "rm -f $tfile" 0 1 2 5 15 21 24 22 . /opt/nutch/bin/nutchez-func.sh 23 #. ~/.nutchez/hadoop-env.sh 25 $DIALOG --backtitle "Developed By NCHC" --clear --item-help --title "NutchEz 雛型版" \ 26 --menu " 你好,歡迎使用NutchEz! \n\ 27 這套軟體是用來打造專屬於你的搜尋引擎 \n\ 28 你有網頁不希望被公開的搜尋引擎找到, \n\ 29 卻又希望能有個搜尋介面的困擾嗎? \n\ 30 用NutchEz就對了!因為他操作簡單, \n\ 31 除了基本的網頁以外,還支援多種格式(ppt,doc,txt...) \n\ 32 並且是開源碼軟體,完全免費,安全無虞\n\ 33 趕快來使用看看吧!\n\n\ 34 選擇你要的模式:" 20 60 4 \ 35 "1" "開始建構搜尋內容" "透過NutchEz來建構專屬於你自己所需的內容的搜尋引擎" \ 36 "2" "開啟或關閉NutchEz的網頁伺服器" "若您之前已經執行完1之後才需做網頁伺服器的管理" 2> $tfile 24 37 38 rev=$? 25 39 40 MAIN_CHOISE=`cat $tfile` 26 41 27 init_nutchez 42 case $rev in 43 0) 44 echo_dialog_v "OK '$MAIN_CHOISE' chosen.";; 45 1) 46 echo_dialog_v "Cancel pressed." 47 exit 0 ;; 48 2) 49 echo_dialog_v "HELP '$MAIN_CHOISE' chosen.";; 50 255) 51 echo_dialog_v "ESC pressed.";; 52 *) 53 echo_dialog_v "Unexpected code $MAIN_CHOISE";; 54 esac 55 56 if [ $MAIN_CHOISE -eq 2 ];then 57 58 pid_tc=$(ps axw -eo pid,command |\ 59 grep "catalina" | grep "java" |\ 60 grep "start" | awk '{print $1}') 61 if [ -z "$pid_tc" ]; then 62 echo_dialog_v "0. pid = $pid_tc ! no another toddmcat is running" 63 64 $DIALOG --title "你的NutchEz網頁伺服器沒打開.." --clear \ 65 --yesno "\n 要開啟NutchEz的網頁伺服器嗎? \n" 15 61 66 if [ $? -eq 0 ];then 67 ~/.nutchez/tomcat/bin/startup.sh 68 PORT=`cat ~/.nutchez/sav/n.tomcat.txt` 69 $DIALOG --msgbox "已經試著開啟瀏覽伺服器,你可以瀏覽這個網址看看: \n http://localhost:$PORT" 0 0 70 else 71 $DIALOG --msgbox "你選擇不要打開瀏覽伺服器!" 0 0 72 fi 73 74 else 75 echo_dialog_v "0. tomcat had been started and the pid is $pid_tc" 76 77 $DIALOG --title "偵測到NutchEz的網頁伺服器正在運作.." 
--clear \ 78 --yesno "\n 你要關閉他嗎? \n" 15 61 79 if [ $? -eq 0 ];then 80 ~/.nutchez/tomcat/bin/shutdown.sh 81 kill -9 $pid_tc 82 $DIALOG --msgbox "已經試著關閉NutchEz網頁伺服器" 0 0 83 else 84 $DIALOG --msgbox "你選擇不要關閉瀏覽伺服器!" 0 0 85 fi 86 87 fi 88 exit 0 89 90 elif [ $MAIN_CHOISE -eq 1 ];then 28 91 29 92 CHECK=0 … … 35 98 show_urls 36 99 URL=$? 37 echo_ vb"2.2 show_urls return $URL"100 echo_dialog_v "2.2 show_urls return $URL" 38 101 39 102 # add or delete url: ok , exit … … 63 126 setup_tomcat 64 127 128 # continue last search 129 if [ -e ~/.nutchez/search ];then 130 continue_previous 131 fi 65 132 # show the final checklist 66 133 final_confirm 67 134 FC=$? 68 echo_ vb"6.3 final confirm return = $FC "135 echo_dialog_v "6.3 final confirm return = $FC " 69 136 # START , back, cancel 70 137 # start =0 , back =1 … … 90 157 ;; 91 158 255) 92 echo_ vb"ESC pressed !!"159 echo_dialog_v "ESC pressed !!" 93 160 ;; 94 161 esac … … 108 175 # Done 109 176 177 178 179 180 else 181 182 echo_dialog_v "Main Choise is $MAIN_CHOISE ! Error !!" 183 184 fi -
nutchez-0.1/bin/nutchez-func.sh
r91 r94 6 6 7 7 8 : ${DIALOG=dialog} 8 #DIALOG='dialog --backtitle " NutchEz Setup Menu -- powered by NCHC "' 9 DIALOG=dialog 9 10 10 11 # set 1 to display more for debug, … … 39 40 40 41 check_if_root() { 41 if [ ! "$UID" ="0" ]; then42 if [ ! "$UID" -eq "0" ]; then 42 43 echo_dialog_v "Hi [$LOGNAME] !! " 43 44 echo_dialog_v "You need to run this script \"`basename $0`\" as root." … … 71 72 72 73 cp ~/.nutchez/sav/n.urls.txt ~/.nutchez/urls/urls.txt 74 75 if [ $NOCONTINUE -eq 1 ]; then 76 echo_dialog_v " delete the ~/.nutchez/search/*" 77 DATE=`date +%y%m%d%H%M%S` 78 mv ~/.nutchez/search ~/.nutchez/search-$DATE 79 $DIALOG --msgbox "上次搜尋的結果改放到 ~/.nutchez/search-$DATE " 0 0 80 rm -f /tmp/search 81 # rm -rf ~/.nutchez/search/* 82 fi 73 83 74 84 if [ -e ~/.nutchez/conf/nutch-site.xml ] ; then … … 100 110 echo_dialog_v "2.1 test_file ~/.nutchez/sav return : $?" 101 111 # dialog begin 102 dialog --title " The URLS that you want" --editbox ~/.nutchez/sav/n.urls.txt 16 51 2>/tmp/n.urls.tmp112 dialog --title "請輸入你要抓取的網址(一行一個網址)" --editbox ~/.nutchez/sav/n.urls.txt 16 51 2>/tmp/n.urls.tmp 103 113 RET=$? 104 114 echo_dialog_v "2.1 cat url: `cat /tmp/n.urls.tmp`" … … 110 120 echo_dialog_v "3. setup_robot" 111 121 # dialog 112 dialog -- nocancel --inputbox " This agent name \n" 16 51"`cat ~/.nutchez/sav/n.robot.txt`" 2>/tmp/n.robot.tmp122 dialog --title "設定機器人名稱" --nocancel --inputbox " 這個爬網機器人,你要將他取名為:\n\n (ps: 這個設定只是禮貌性宣告,並不會對結果造成影響) \n" 16 55 "`cat ~/.nutchez/sav/n.robot.txt`" 2>/tmp/n.robot.tmp 113 123 echo_dialog_v "3.1 cat robot : `cat /tmp/n.robot.tmp`" 114 124 } … … 117 127 echo_dialog_v "4. 
setup_crawler" 118 128 test_file ~/.nutchez/sav/n.crawler.txt 119 dialog -- nocancel --inputbox " Depth\n " 16 51 "`cat ~/.nutchez/sav/n.crawler.txt`" 2>/tmp/n.crawler.tmp129 dialog --title "設定抓取深度" --nocancel --inputbox " 對於每個網址,你需要NutchEz爬多深呢?\n\n (ps: 初次體驗建議將深度設為1來感受需要多久) \n " 16 51 "`cat ~/.nutchez/sav/n.crawler.txt`" 2>/tmp/n.crawler.tmp 120 130 echo_dialog_v "4.1 cat robot : `cat /tmp/n.robot.tmp`" 121 131 } … … 124 134 echo_dialog_v "5. setup_tomcat" 125 135 test_file ~/.nutchez/sav/n.tomcat.txt 126 dialog -- nocancel --inputbox " explorer port\n " 16 51 "`cat ~/.nutchez/sav/n.tomcat.txt`" 2>/tmp/n.tomcat.tmp136 dialog --title "設定網頁伺服器" --nocancel --inputbox " 你希望NutchEz將網頁伺服器開在哪個port \n\n (ps: 請選擇一個沒用到的port以免造成衝突 \n 也請盡量不要設成80以免造成你誤以為是apache的混淆) \n " 16 51 "`cat ~/.nutchez/sav/n.tomcat.txt`" 2>/tmp/n.tomcat.tmp 127 137 echo_dialog_v "5.1 cat tomcat : `cat /tmp/n.tomcat.tmp`" 128 138 } 129 139 140 continue_previous () { 141 echo_dialog_v "6. setup_tomcat" 142 $DIALOG --title "清除上次搜尋" --clear \ 143 --yesno "你是否要清除上一次爬網所得的結果,\n否則將加入到URL列裡增加搜尋負擔 \n\n ps: 選no的話,會跑相當相當久,\n 請慎重考慮之\n" 16 51 144 145 case $? in 146 0) 147 NOCONTINUE=1;; 148 1) 149 NOCONTINUE=0;; 150 255) 151 echo "ESC pressed.";; 152 esac 153 echo_dialog_v " 6continue = $CONTINUE" 154 } 155 130 156 final_confirm () { 131 echo_dialog_v " 6. final_confirm : start =0 , back =1 "157 echo_dialog_v "7. final_confirm : start =0 , back =1 " 132 158 tempfile=/tmp/n.finalcheck.tmp 133 159 134 echo " \n 1. The url list is: \n " > $tempfile160 echo " \n 1. 你所選擇要爬取的網址為 : \n " > $tempfile 135 161 cat /tmp/n.urls.tmp >> $tempfile 136 echo " \n 2. The robot name is: \n" >> $tempfile162 echo " \n\n 2. 對於這個爬網機器人,你取名為 : \n" >> $tempfile 137 163 cat /tmp/n.robot.tmp >> $tempfile 138 echo " \n 3. The crawled depth is: \n " >> $tempfile164 echo " \n\n 3. 爬網的深度,你設定為 : \n " >> $tempfile 139 165 cat /tmp/n.crawler.tmp >> $tempfile 140 echo " \n 4. The explorer port is: \n " >> $tempfile166 echo " \n\n 4. 
NutchEz將會把你的搜尋結果呈現在這個Port : \n " >> $tempfile 141 167 cat /tmp/n.tomcat.tmp >> $tempfile 142 168 if [ $NOCONTINUE -eq 0 ];then 169 echo " \n\n 5. 是否要清除上一次的收尋結果 : \n " >> $tempfile 170 echo_dialog_v " 7continue = $CONTINUE" 171 echo "NO" >> $tempfile 172 elif [ $NOCONTINUE -eq 1 ];then 173 echo " \n\n 5. 是否要清除上一次的收尋結果繼續搜尋 : \n " >> $tempfile 174 echo_dialog_v " 7continue = $CONTINUE" 175 echo "YES" >> $tempfile 176 else 177 echo_dialog_v " 無資料可匯入 " 178 fi 143 179 MSG=`cat $tempfile` 144 echo_dialog_v " 6.1 final message :\n $MSG"180 echo_dialog_v "7.1 final message :\n $MSG" 145 181 #read READ 146 $DIALOG --title " Check It !!" --clear \182 $DIALOG --title "請檢查你的選擇 ! \n\n 若所有的設定都是正確的,你可以按 \"ok\",\n 若你按了 \"reset\" 則會重頭開始設定, \n 若你選擇 \"exit\" 則會跳出NutchEz的設定選單 \n ps: reset 與 exit都不會把資料記成預設值,請放心使用 " --clear \ 147 183 --extra-button --extra-label "reset" --ok-label "ok" --cancel-label "exit" \ 148 184 --yesno "$MSG" 26 51 … … 179 215 180 216 pid_tc=$(ps axw -eo pid,command |\ 181 grep "catalina.startup.Bootstrap" |\182 grep "start" | awk '{print $1}')217 grep "catalina" | grep "java" |\ 218 grep "start" | awk '{print $1}') 183 219 if [ -z "$pid_tc" ]; then 184 echo "starting tomcat"220 echo_dialog_v "no another tomcat is running" 185 221 else 186 echo "tomcat had been started and the pid is $pid_tc"187 echo "stop it first"222 echo_dialog_v "tomcat had been started and the pid is $pid_tc" 223 echo_dialog_v "stop it first" 188 224 kill -9 $pid_tc 189 225 if [ -z $? ];then 190 echo " tomcat ($pid_tc) is killed ..."226 echo_dialog_v " tomcat ($pid_tc) is killed ..." 191 227 else 192 echo "kill error ..."228 echo_dialog_v "kill error ..." 193 229 fi 194 230 fi 231 echo "Starting Tomcat ...." 195 232 ~/.nutchez/tomcat/bin/startup.sh 196 233 sleep 3 … … 201 238 FIREFOX=`which firefox` 202 239 RET=$? 203 if [ $RET ==0 ];then240 if [ $RET -eq 0 ];then 204 241 $FIREFOX -D 0.0 http://localhost:$PORT 205 242 RET=$? 206 243 fi 207 if ! 
[ $RET ==0 ];then208 $DIALOG --msgbox " Congratulations! \n you can explore the url: \n http://localhost:$PORT" 0 0209 fi 210 } 244 if ! [ $RET -eq 0 ];then 245 $DIALOG --msgbox "恭喜你已經完成了! \n 你可以用瀏覽器瀏覽: \n http://host_ip:$PORT" 0 0 246 fi 247 } -
nutchez-0.1/debian/changelog
r66 r94 1 nutchez (0.1-1) unstable; urgency=low 1 nutchez (0.1-2) unstable; urgency=low 2 2 3 3 * Initial release (Closes: #nnnn) <nnnn is the bug number of your ITP> 4 * 4 5 5 -- Wei-Yu Chen <waue0920@gmail.com> Tue, 12 May 2009 11:15:51 +0800 6 6 -- Wei-Yu Chen <waue0920@gmail.com> Tue, 07 Jul 2009 11:22:46 +0800 -
nutchez-0.1/debian/nutchez.install
r90 r94 3 3 README* etc/nutch 4 4 bin opt/nutch 5 bin/nutchez* usr/local/bin 6 5 lib opt/nutch 7 webapps opt/nutch 8 6 tomcat opt/nutch 9 7 plugins opt/nutch -
nutchez-0.1/debian/nutchez.postinst
r69 r94 8 8 fi 9 9 10 ln -sf /opt/nutch/bin/nutchez /usr/ local/sbin/11 ln -sf /opt/nutch/bin/nutchez-func.sh /usr/ local/sbin/10 ln -sf /opt/nutch/bin/nutchez /usr/sbin/ 11 ln -sf /opt/nutch/bin/nutchez-func.sh /usr/sbin/ 12 12 13 setup_hdfsadm_user() { 14 if ! getent passwd hdfsadm >/dev/null; then 15 useradd hdfsadm 16 mkdir -p /home/hdfsadm/.ssh 17 mkdir -p /var/log/nutch 18 ssh-keygen -t rsa -q -f /home/hdfsadm/.ssh/id_rsa -N "" 19 cp /home/hdfsadm/.ssh/id_rsa.pub /home/hdfsadm/.ssh/authorized_keys 20 chown hdfsadm:hdfsadm /var/log/nutch 21 chown -R hdfsadm:hdfsadm /opt/nutch 22 chown -R hdfsadm:hdfsadm /home/hdfsadm 23 fi 13 show_message(){ 14 cat /etc/nutch/README 24 15 } 25 26 check_root() { 27 if ! test -e /root/.ssh/id_rsa ; then 28 ssh-keygen -t rsa -q -f /root/.ssh/id_rsa -N "" 29 fi 30 if test -e /root/.ssh/id_rsa.pub ; then 31 if ! test -e /root/.ssh/authorized_keys ; then 32 cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys 33 fi 34 else 35 ssh-keygen -t rsa -q -f /root/.ssh/id_rsa -N "" 36 cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys 37 fi 38 39 } 40 41 start_hadoop() { 42 if getent passwd hdfsadm >/dev/null; then 43 su -c "/opt/nutch/bin/hadoop namenode -format" hdfsadm - 44 su -c "/opt/nutch/bin/start-all.sh" hdfsadm - 45 else 46 /opt/nutch/conf/hadoop-env.sh 47 /opt/nutch/bin/hadoop namenode -format 48 /opt/nutch/bin/start-all.sh 49 fi 50 } 51 show_message() { 16 show_old_message() { 52 17 echo "You can use the instruction : \" nutchez\" to easyly use nutch" 53 18 echo "Enjoy" 54 19 } 55 show_old_message() {56 echo "You can quickly start by following ways [in /opt/nutch/ with root privilege]:"57 echo "(1) Modify the urls/urls.txt file with indicate urls, one site one line."58 echo "(2) Use this instruction \"bin/nutch crawl urls -dir search -depth 4 -topN 50\" to crawl web"59 echo "(3) Type \" tomcat/bin/startup.sh \" and use browser to check the result in http://localhost:8080/"60 echo "Enjoy !"61 }62 #setup_hdfsadm_user63 #check_root64 
#start_hadoop65 20 show_message -
nutchez-0.1/debian/nutchez.postrm
r80 r94 18 18 done 19 19 20 rm -f /usr/local/sbin/nutchez* 20 rm -f /usr/sbin/nutchez* 21 21 22 22 rm -rf /tmp/search -
nutchez-0.1/debian/rules
r66 r94 13 13 dh_compress 14 14 dh_fixperms 15 find ./debian/nutchez/ -name ".svn" -type d > svnfolders 16 rm -rf `cat svnfolders` 15 17 dh_installdeb 16 18 dh_link
Note: See TracChangeset
for help on using the changeset viewer.