Changeset 80
- Timestamp:
- Jun 9, 2009, 6:08:55 PM (16 years ago)
- Files:
-
- 2 added
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
nutchez-0.1/bin/nutchez-func.sh
r77 r80
169 169   echo_vb "/opt/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH"
170 170   echo_vb "nutch conf dir = $NUTCH_CONF_DIR"
171     - /opt/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH
    171 + /opt/nutch/bin/nutch crawl ~/.nutchez/urls -dir ~/.nutchez/search -depth $DEPTH -topN 5000 -threads 1000
172 172   }
173 173
-
nutchez-0.1/conf/nutch-site.xml
r74 r80
66 66   </description>
67 67   </property>
68 68   <property>
   69 + <name>db.ignore.external.links</name>
   70 + <value>false</value>
   71 + <description>If true, outlinks leading from a page to external hosts
   72 + will be ignored. This is an effective way to limit the crawl to include
   73 + only initially injected hosts, without creating complex URLFilters.
   74 + </description>
   75 + </property>
   76 + <property>
   77 + <name>file.content.limit</name>
   78 + <value>1000000</value>
   79 + <description>The length limit for downloaded content, in bytes.
   80 + If this value is nonnegative (>=0), content longer than it will be truncated;
   81 + otherwise, no truncation at all.
   82 + </description>
   83 + </property>
69 84   </configuration>
70 85
-
nutchez-0.1/conf/sav/n.urls.txt
r76 r80
1   - http://www.nchc.org.tw
  1 + http://www.nchc.org.tw/tw/
2 2   http://www.hadoop.tw
-
nutchez-0.1/debian/nutchez.install
r73 r80
1 1   conf/* etc/nutch
2 2   bin opt/nutch
3   - bin/nutchez* usr/local/sbin
  3 + bin/nutchez* usr/local/bin
4 4   lib opt/nutch
5 5   webapps opt/nutch
-
nutchez-0.1/debian/nutchez.postrm
r75 r80
14 14   if [ -d $i ];then
15 15   echo "delete this dir : $i"
16    - rm -ir $i
   16 + rm -r $i
17 17   fi
18 18   done
Note: See TracChangeset
for help on using the changeset viewer.