source: nutchez-0.1/bin/nutch @ 156

Last change on this file since 156 was 100, checked in by waue, 15 years ago

0.3 v

fix some bug

  • Property svn:executable set to *
File size: 7.8 KB
Line 
1#!/bin/bash
2#
3# The Nutch command script
4#
5# Environment Variables
6#
7#   NUTCH_JAVA_HOME The java implementation to use.  Overrides JAVA_HOME.
8#
9#   NUTCH_HEAPSIZE  The maximum amount of heap to use, in MB.
10#                   Default is 1000.
11#
12#   NUTCH_OPTS      Extra Java runtime options.
13#
14
# Fixed NutchEZ locations: the installed Nutch tree, plus the configuration
# and log directories kept under the invoking user's home directory.
NUTCH_HOME=/opt/nutchez/nutch
NUTCH_CONF_DIR=~/.nutchez/conf
NUTCH_LOG_DIR=~/.nutchez/log

# Record whether this is a Cygwin environment; the flag is consulted later
# wherever a path has to be converted with cygpath.
if [[ "$(uname)" == CYGWIN* ]]; then
  cygwin=true
else
  cygwin=false
fi
23
# Follow symbolic links until THIS names a real file - $0 may be a softlink.
THIS="$0"
while [ -h "$THIS" ]; do
  # Take the link target from the "name -> target" tail of the long listing.
  listing=$(ls -ld "$THIS")
  target=$(expr "$listing" : '.*-> \(.*\)$')
  case "$target" in
    */*)
      # The target contains a slash: use it as given.
      THIS="$target"
      ;;
    *)
      # Bare file name: it lives in the same directory as the link itself.
      THIS=$(dirname "$THIS")/"$target"
      ;;
  esac
done
35
# Without any arguments there is nothing to run: print the usage text on
# standard output and exit with status 1.
if [ "$#" -eq 0 ]; then
  # One argument per output line; two of the "Expert" lines end in a space.
  printf '%s\n' \
    "Usage: nutch [-core] COMMAND" \
    "where COMMAND is one of:" \
    "  crawl             one-step crawler for intranets" \
    "  readdb            read / dump crawl db" \
    "  convdb            convert crawl db from pre-0.9 format" \
    "  mergedb           merge crawldb-s, with optional filtering" \
    "  readlinkdb        read / dump link db" \
    "  inject            inject new urls into the database" \
    "  generate          generate new segments to fetch from crawl db" \
    "  freegen           generate new segments to fetch from text files" \
    "  fetch             fetch a segment's pages" \
    "  parse             parse a segment's pages" \
    "  readseg           read / dump segment data" \
    "  mergesegs         merge several segments, with optional filtering and slicing" \
    "  updatedb          update crawl db from segments after fetching" \
    "  invertlinks       create a linkdb from parsed segments" \
    "  mergelinkdb       merge linkdb-s, with optional filtering" \
    "  index             run the indexer on parsed segments and linkdb" \
    "  solrindex         run the solr indexer on parsed segments and linkdb" \
    "  merge             merge several segment indexes" \
    "  dedup             remove duplicates from a set of segment indexes" \
    "  solrdedup         remove duplicates from solr" \
    "  plugin            load a plugin and run one of its classes main()" \
    "  server            run a search server" \
    " or" \
    "  CLASSNAME         run the class named CLASSNAME" \
    "Most commands print help when invoked w/o parameters." \
    "" \
    "Expert: -core option is for developers only. It avoids building the job jar, " \
    "        instead it simply includes classes compiled with ant compile-core. " \
    "        NOTE: this works only for jobs executed in 'local' mode"
  exit 1
fi
71
# A leading -core switch is recorded in IS_CORE and removed from the
# argument list.
IS_CORE=0
case "$1" in
  -core)
    IS_CORE=1
    shift
    ;;
esac

# The next argument is the command (or class name) to run; whatever follows
# it stays in "$@" and is passed through to that command.
COMMAND=$1
shift
82
83# some directories
84
85
86
# some Java parameters
# A non-empty NUTCH_JAVA_HOME takes precedence over JAVA_HOME.
JAVA_HOME=${NUTCH_JAVA_HOME:-$JAVA_HOME}

# Without a Java home there is no JVM to launch.
[ -n "$JAVA_HOME" ] || {
  echo "Error: JAVA_HOME is not set."
  exit 1
}

JAVA=$JAVA_HOME/bin/java

# Maximum heap in MB: NUTCH_HEAPSIZE when it is non-empty, otherwise 1000.
JAVA_HEAP_MAX=-Xmx${NUTCH_HEAPSIZE:-1000}m
107
# Build the Java CLASSPATH.  It starts with the configuration directory
# ($NUTCH_CONF_DIR, or $NUTCH_HOME/conf when that is unset or empty),
# followed by the JDK's tools.jar.
CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}
CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar

# Disable word splitting so that paths containing spaces survive unquoted
# expansions; IFS stays empty until the script unsets it further down.
IFS=

# for developers, add plugins, job & test code to CLASSPATH
if [ -d "$NUTCH_HOME/build/plugins" ]; then
  CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build
fi
if [ -d "$NUTCH_HOME/build/test/classes" ]; then
  CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes
fi

if [ "$IS_CORE" = 0 ]; then
  # Job files from a developer build first, then the job file of a release.
  # The shell leaves a pattern that matches nothing as literal text; skip it
  # instead of putting a bogus "nutch-*.job" entry on the CLASSPATH.
  for f in "$NUTCH_HOME"/build/nutch-*.job "$NUTCH_HOME"/nutch-*.job; do
    [ -e "$f" ] || continue
    CLASSPATH=${CLASSPATH}:$f
  done
else
  # -core was given: use the compiled classes directly instead of a job file.
  CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes
fi

# add plugins to classpath: when a plugins directory exists, NUTCH_HOME
# itself is placed at the front of the CLASSPATH
if [ -d "$NUTCH_HOME/plugins" ]; then
  CLASSPATH=${NUTCH_HOME}:${CLASSPATH}
fi

# add libs to CLASSPATH (same unmatched-pattern guard as above)
for f in "$NUTCH_HOME"/lib/*.jar "$NUTCH_HOME"/lib/jetty-ext/*.jar; do
  [ -e "$f" ] || continue
  CLASSPATH=${CLASSPATH}:$f
done

# cygwin path translation
if [ "$cygwin" = true ]; then
  CLASSPATH=$(cygpath -p -w "$CLASSPATH")
fi
156
#######################################
# Print the java.library.path value for the native-hadoop libraries that
# exist under an installation directory.
# Arguments: $1 - installation directory (normally $NUTCH_HOME)
#            $2 - platform name used as the per-platform subdirectory
# Outputs:   <home>/build/native/<platform>/lib and/or
#            <home>/lib/native/<platform>, joined with ':'; nothing at all
#            when neither native directory exists
#######################################
nutch_native_library_path() {
  local home=$1 platform=$2 path=''

  if [ -d "$home/build/native" ]; then
    # Both the existence check and the resulting path use the same root.
    path=$home/build/native/$platform/lib
  fi

  if [ -d "$home/lib/native" ]; then
    # ${path:+...} adds the ':' separator only when there is a first entry.
    path=${path:+$path:}$home/lib/native/$platform
  fi

  printf '%s' "$path"
}

# setup 'java.library.path' for native-hadoop code if necessary
JAVA_LIBRARY_PATH=''
if [ -d "${NUTCH_HOME}/build/native" ] || [ -d "${NUTCH_HOME}/lib/native" ]; then
  # Run the JVM once to obtain the platform name; spaces become underscores.
  JAVA_PLATFORM=$(CLASSPATH="${CLASSPATH}" "${JAVA}" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g')
  JAVA_LIBRARY_PATH=$(nutch_native_library_path "$NUTCH_HOME" "$JAVA_PLATFORM")
fi

# Under Cygwin the JVM needs the path list in Windows form.
if [ "$cygwin" = true ] && [ -n "$JAVA_LIBRARY_PATH" ]; then
  JAVA_LIBRARY_PATH=$(cygpath -p -w "$JAVA_LIBRARY_PATH")
fi
178
# Word splitting goes back to the shell default from here on.
unset IFS

# Fall back to the default log directory and log file name when the
# corresponding variable is unset or empty.
: "${NUTCH_LOG_DIR:=$NUTCH_HOME/logs}"
: "${NUTCH_LOGFILE:=hadoop.log}"

# Fix log path under cygwin
if $cygwin; then
  NUTCH_LOG_DIR=$(cygpath -p -w "$NUTCH_LOG_DIR")
fi

# Pass the log location to the JVM as system properties, plus the native
# library path when one was determined.
NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.dir=$NUTCH_LOG_DIR -Dhadoop.log.file=$NUTCH_LOGFILE"
[ -z "$JAVA_LIBRARY_PATH" ] || NUTCH_OPTS="$NUTCH_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
201
# Map the command name onto the Java class to run.  Anything that is not a
# known command is taken to be a class name already.
case "$COMMAND" in
  crawl)       CLASS=org.apache.nutch.crawl.Crawl ;;
  inject)      CLASS=org.apache.nutch.crawl.Injector ;;
  generate)    CLASS=org.apache.nutch.crawl.Generator ;;
  freegen)     CLASS=org.apache.nutch.tools.FreeGenerator ;;
  fetch)       CLASS=org.apache.nutch.fetcher.Fetcher ;;
  fetch2)      CLASS=org.apache.nutch.fetcher.Fetcher2 ;;
  parse)       CLASS=org.apache.nutch.parse.ParseSegment ;;
  readdb)      CLASS=org.apache.nutch.crawl.CrawlDbReader ;;
  convdb)      CLASS=org.apache.nutch.tools.compat.CrawlDbConverter ;;
  mergedb)     CLASS=org.apache.nutch.crawl.CrawlDbMerger ;;
  readlinkdb)  CLASS=org.apache.nutch.crawl.LinkDbReader ;;
  readseg)     CLASS=org.apache.nutch.segment.SegmentReader ;;
  segread)
    # Old spelling of readseg: still accepted, but announced as deprecated.
    echo "[DEPRECATED] Command 'segread' is deprecated, use 'readseg' instead."
    CLASS=org.apache.nutch.segment.SegmentReader
    ;;
  mergesegs)   CLASS=org.apache.nutch.segment.SegmentMerger ;;
  updatedb)    CLASS=org.apache.nutch.crawl.CrawlDb ;;
  invertlinks) CLASS=org.apache.nutch.crawl.LinkDb ;;
  mergelinkdb) CLASS=org.apache.nutch.crawl.LinkDbMerger ;;
  index)       CLASS=org.apache.nutch.indexer.Indexer ;;
  solrindex)   CLASS=org.apache.nutch.indexer.solr.SolrIndexer ;;
  dedup)       CLASS=org.apache.nutch.indexer.DeleteDuplicates ;;
  solrdedup)   CLASS=org.apache.nutch.indexer.solr.SolrDeleteDuplicates ;;
  merge)       CLASS=org.apache.nutch.indexer.IndexMerger ;;
  plugin)      CLASS=org.apache.nutch.plugin.PluginRepository ;;
  # Single quotes keep the '$' of the nested class name literal.
  server)      CLASS='org.apache.nutch.searcher.DistributedSearch$Server' ;;
  *)           CLASS=$COMMAND ;;
esac
255
# Replace this shell with the JVM running the selected class; the remaining
# command-line arguments are passed through unchanged.  JAVA_HEAP_MAX,
# NUTCH_OPTS and CLASS are expanded unquoted, so NUTCH_OPTS is split on
# whitespace into individual JVM options.
exec "$JAVA" \
  $JAVA_HEAP_MAX \
  $NUTCH_OPTS \
  -classpath "$CLASSPATH" \
  $CLASS "$@"
258
Note: See TracBrowser for help on using the repository browser.