[66] | 1 | <?xml version="1.0"?> |
---|
| 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
---|
| 3 | <!-- |
---|
| 4 | Licensed to the Apache Software Foundation (ASF) under one or more |
---|
| 5 | contributor license agreements. See the NOTICE file distributed with |
---|
| 6 | this work for additional information regarding copyright ownership. |
---|
| 7 | The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
| 8 | (the "License"); you may not use this file except in compliance with |
---|
| 9 | the License. You may obtain a copy of the License at |
---|
| 10 | |
---|
| 11 | http://www.apache.org/licenses/LICENSE-2.0 |
---|
| 12 | |
---|
| 13 | Unless required by applicable law or agreed to in writing, software |
---|
| 14 | distributed under the License is distributed on an "AS IS" BASIS, |
---|
| 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
| 16 | See the License for the specific language governing permissions and |
---|
| 17 | limitations under the License. |
---|
| 18 | --> |
---|
| 19 | <!-- Do not modify this file directly. Instead, copy entries that you --> |
---|
| 20 | <!-- wish to modify from this file into nutch-site.xml and change them --> |
---|
| 21 | <!-- there. If nutch-site.xml does not already exist, create it. --> |
---|
| 22 | |
---|
| 23 | <configuration> |
---|
| 24 | |
---|
| 25 | <!-- file properties --> |
---|
| 26 | |
---|
| 27 | <property> |
---|
| 28 | <name>file.content.limit</name> |
---|
| 29 | <value>65536</value> |
---|
| 30 | <description>The length limit for downloaded content, in bytes. |
---|
| 31 | If this value is nonnegative (>=0), content longer than it will be truncated; |
---|
| 32 | otherwise, no truncation at all. |
---|
| 33 | </description> |
---|
| 34 | </property> |
---|
| 35 | |
---|
| 36 | <property> |
---|
| 37 | <name>file.content.ignored</name> |
---|
| 38 | <value>true</value> |
---|
| 39 | <description>If true, no file content will be saved during fetch. |
---|
| 40 | And it is probably what we want to set most of the time, since file:// URLs |
---|
| 41 | are meant to be local and we can always use them directly at parsing |
---|
| 42 | and indexing stages. Otherwise file contents will be saved. |
---|
| 43 | !! NOT IMPLEMENTED YET !! |
---|
| 44 | </description> |
---|
| 45 | </property> |
---|
| 46 | |
---|
| 47 | <!-- HTTP properties --> |
---|
| 48 | |
---|
| 49 | <property> |
---|
| 50 | <name>http.agent.name</name> |
---|
| 51 | <value></value> |
---|
| 52 | <description>HTTP 'User-Agent' request header. MUST NOT be empty - |
---|
| 53 | please set this to a single word uniquely related to your organization. |
---|
| 54 | |
---|
| 55 | NOTE: You should also check other related properties: |
---|
| 56 | |
---|
| 57 | http.robots.agents |
---|
| 58 | http.agent.description |
---|
| 59 | http.agent.url |
---|
| 60 | http.agent.email |
---|
| 61 | http.agent.version |
---|
| 62 | |
---|
| 63 | and set their values appropriately. |
---|
| 64 | |
---|
| 65 | </description> |
---|
| 66 | </property> |
---|
| 67 | |
---|
| 68 | <property> |
---|
| 69 | <name>http.robots.agents</name> |
---|
| 70 | <value>*</value> |
---|
| 71 | <description>The agent strings we'll look for in robots.txt files, |
---|
| 72 | comma-separated, in decreasing order of precedence. You should |
---|
| 73 | put the value of http.agent.name as the first agent name, and keep the |
---|
| 74 | default * at the end of the list. E.g.: BlurflDev,Blurfl,* |
---|
| 75 | </description> |
---|
| 76 | </property> |
---|
| 77 | |
---|
| 78 | <property> |
---|
| 79 | <name>http.robots.403.allow</name> |
---|
| 80 | <value>true</value> |
---|
| 81 | <description>Some servers return HTTP status 403 (Forbidden) if |
---|
| 82 | /robots.txt doesn't exist. This should probably mean that we are |
---|
| 83 | allowed to crawl the site nonetheless. If this is set to false, |
---|
| 84 | then such sites will be treated as forbidden.</description> |
---|
| 85 | </property> |
---|
| 86 | |
---|
| 87 | <property> |
---|
| 88 | <name>http.agent.description</name> |
---|
| 89 | <value></value> |
---|
| 90 | <description>Further description of our bot - this text is used in |
---|
| 91 | the User-Agent header. It appears in parentheses after the agent name. |
---|
| 92 | </description> |
---|
| 93 | </property> |
---|
| 94 | |
---|
| 95 | <property> |
---|
| 96 | <name>http.agent.url</name> |
---|
| 97 | <value></value> |
---|
| 98 | <description>A URL to advertise in the User-Agent header. This will |
---|
| 99 | appear in parentheses after the agent name. Custom dictates that this |
---|
| 100 | should be a URL of a page explaining the purpose and behavior of this |
---|
| 101 | crawler. |
---|
| 102 | </description> |
---|
| 103 | </property> |
---|
| 104 | |
---|
| 105 | <property> |
---|
| 106 | <name>http.agent.email</name> |
---|
| 107 | <value></value> |
---|
| 108 | <description>An email address to advertise in the HTTP 'From' request |
---|
| 109 | header and User-Agent header. A good practice is to mangle this |
---|
| 110 | address (e.g. 'info at example dot com') to avoid spamming. |
---|
| 111 | </description> |
---|
| 112 | </property> |
---|
| 113 | |
---|
| 114 | <property> |
---|
| 115 | <name>http.agent.version</name> |
---|
| 116 | <value>Nutch-1.0</value> |
---|
| 117 | <description>A version string to advertise in the User-Agent |
---|
| 118 | header.</description> |
---|
| 119 | </property> |
---|
| 120 | |
---|
| 121 | <property> |
---|
| 122 | <name>http.agent.host</name> |
---|
| 123 | <value></value> |
---|
| 124 | <description>Name or IP address of the host on which the Nutch crawler |
---|
| 125 | would be running. Currently this is used by 'protocol-httpclient' |
---|
| 126 | plugin. |
---|
| 127 | </description> |
---|
| 128 | </property> |
---|
| 129 | |
---|
| 130 | <property> |
---|
| 131 | <name>http.timeout</name> |
---|
| 132 | <value>10000</value> |
---|
| 133 | <description>The default network timeout, in milliseconds.</description> |
---|
| 134 | </property> |
---|
| 135 | |
---|
| 136 | <property> |
---|
| 137 | <name>http.max.delays</name> |
---|
| 138 | <value>100</value> |
---|
| 139 | <description>The number of times a thread will delay when trying to |
---|
| 140 | fetch a page. Each time it finds that a host is busy, it will wait |
---|
| 141 | fetcher.server.delay. After http.max.delays attempts, it will give |
---|
| 142 | up on the page for now.</description> |
---|
| 143 | </property> |
---|
| 144 | |
---|
| 145 | <property> |
---|
| 146 | <name>http.content.limit</name> |
---|
| 147 | <value>65536</value> |
---|
| 148 | <description>The length limit for downloaded content, in bytes. |
---|
| 149 | If this value is nonnegative (>=0), content longer than it will be truncated; |
---|
| 150 | otherwise, no truncation at all. |
---|
| 151 | </description> |
---|
| 152 | </property> |
---|
| 153 | |
---|
| 154 | <property> |
---|
| 155 | <name>http.proxy.host</name> |
---|
| 156 | <value></value> |
---|
| 157 | <description>The proxy hostname. If empty, no proxy is used.</description> |
---|
| 158 | </property> |
---|
| 159 | |
---|
| 160 | <property> |
---|
| 161 | <name>http.proxy.port</name> |
---|
| 162 | <value></value> |
---|
| 163 | <description>The proxy port.</description> |
---|
| 164 | </property> |
---|
| 165 | |
---|
| 166 | <property> |
---|
| 167 | <name>http.proxy.username</name> |
---|
| 168 | <value></value> |
---|
| 169 | <description>Username for proxy. This will be used by |
---|
| 170 | 'protocol-httpclient', if the proxy server requests basic, digest |
---|
| 171 | and/or NTLM authentication. To use this, 'protocol-httpclient' must |
---|
| 172 | be present in the value of 'plugin.includes' property. |
---|
| 173 | NOTE: For NTLM authentication, do not prefix the username with the |
---|
| 174 | domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect. |
---|
| 175 | </description> |
---|
| 176 | </property> |
---|
| 177 | |
---|
| 178 | <property> |
---|
| 179 | <name>http.proxy.password</name> |
---|
| 180 | <value></value> |
---|
| 181 | <description>Password for proxy. This will be used by |
---|
| 182 | 'protocol-httpclient', if the proxy server requests basic, digest |
---|
| 183 | and/or NTLM authentication. To use this, 'protocol-httpclient' must |
---|
| 184 | be present in the value of 'plugin.includes' property. |
---|
| 185 | </description> |
---|
| 186 | </property> |
---|
| 187 | |
---|
| 188 | <property> |
---|
| 189 | <name>http.proxy.realm</name> |
---|
| 190 | <value></value> |
---|
| 191 | <description>Authentication realm for proxy. Do not define a value |
---|
| 192 | if realm is not required or authentication should take place for any |
---|
| 193 | realm. NTLM does not use the notion of realms. Specify the domain name |
---|
| 194 | of NTLM authentication as the value for this property. To use this, |
---|
| 195 | 'protocol-httpclient' must be present in the value of |
---|
| 196 | 'plugin.includes' property. |
---|
| 197 | </description> |
---|
| 198 | </property> |
---|
| 199 | |
---|
| 200 | <property> |
---|
| 201 | <name>http.auth.file</name> |
---|
| 202 | <value>httpclient-auth.xml</value> |
---|
| 203 | <description>Authentication configuration file for |
---|
| 204 | 'protocol-httpclient' plugin. |
---|
| 205 | </description> |
---|
| 206 | </property> |
---|
| 207 | |
---|
| 208 | <property> |
---|
| 209 | <name>http.verbose</name> |
---|
| 210 | <value>false</value> |
---|
| 211 | <description>If true, HTTP will log more verbosely.</description> |
---|
| 212 | </property> |
---|
| 213 | |
---|
| 214 | <property> |
---|
| 215 | <name>http.redirect.max</name> |
---|
| 216 | <value>0</value> |
---|
| 217 | <description>The maximum number of redirects the fetcher will follow when |
---|
| 218 | trying to fetch a page. If set to negative or 0, fetcher won't immediately |
---|
| 219 | follow redirected URLs, instead it will record them for later fetching. |
---|
| 220 | </description> |
---|
| 221 | </property> |
---|
| 222 | |
---|
| 223 | <property> |
---|
| 224 | <name>http.useHttp11</name> |
---|
| 225 | <value>false</value> |
---|
| 226 | <description>NOTE: at the moment this works only for protocol-httpclient. |
---|
| 227 | If true, use HTTP 1.1, if false use HTTP 1.0. |
---|
| 228 | </description> |
---|
| 229 | </property> |
---|
| 230 | |
---|
| 231 | <!-- FTP properties --> |
---|
| 232 | |
---|
| 233 | <property> |
---|
| 234 | <name>ftp.username</name> |
---|
| 235 | <value>anonymous</value> |
---|
| 236 | <description>ftp login username.</description> |
---|
| 237 | </property> |
---|
| 238 | |
---|
| 239 | <property> |
---|
| 240 | <name>ftp.password</name> |
---|
| 241 | <value>anonymous@example.com</value> |
---|
| 242 | <description>ftp login password.</description> |
---|
| 243 | </property> |
---|
| 244 | |
---|
| 245 | <property> |
---|
| 246 | <name>ftp.content.limit</name> |
---|
| 247 | <value>65536</value> |
---|
| 248 | <description>The length limit for downloaded content, in bytes. |
---|
| 249 | If this value is nonnegative (>=0), content longer than it will be truncated; |
---|
| 250 | otherwise, no truncation at all. |
---|
| 251 | Caution: classical ftp RFCs never define partial transfer and, in fact, |
---|
| 252 | some ftp servers out there do not handle client side forced close-down very |
---|
| 253 | well. Our implementation tries its best to handle such situations smoothly. |
---|
| 254 | </description> |
---|
| 255 | </property> |
---|
| 256 | |
---|
| 257 | <property> |
---|
| 258 | <name>ftp.timeout</name> |
---|
| 259 | <value>60000</value> |
---|
| 260 | <description>Default timeout for ftp client socket, in millisec. |
---|
| 261 | Please also see ftp.keep.connection below.</description> |
---|
| 262 | </property> |
---|
| 263 | |
---|
| 264 | <property> |
---|
| 265 | <name>ftp.server.timeout</name> |
---|
| 266 | <value>100000</value> |
---|
| 267 | <description>An estimation of ftp server idle time, in millisec. |
---|
| 268 | Typically it is 120000 millisec for many ftp servers out there. |
---|
| 269 | Better be conservative here. Together with ftp.timeout, it is used to |
---|
| 270 | decide if we need to delete (annihilate) current ftp.client instance and |
---|
| 271 | force to start another ftp.client instance anew. This is necessary because |
---|
| 272 | a fetcher thread may not be able to obtain next request from queue in time |
---|
| 273 | (due to idleness) before our ftp client times out or remote server |
---|
| 274 | disconnects. Used only when ftp.keep.connection is true (please see below). |
---|
| 275 | </description> |
---|
| 276 | </property> |
---|
| 277 | |
---|
| 278 | <property> |
---|
| 279 | <name>ftp.keep.connection</name> |
---|
| 280 | <value>false</value> |
---|
| 281 | <description>Whether to keep ftp connection. Useful if crawling same host |
---|
| 282 | again and again. When set to true, it avoids connection, login and dir list |
---|
| 283 | parser setup for subsequent urls. If it is set to true, however, you must |
---|
| 284 | make sure (roughly): |
---|
| 285 | (1) ftp.timeout is less than ftp.server.timeout |
---|
| 286 | (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay) |
---|
| 287 | Otherwise there will be too many "delete client because idled too long" |
---|
| 288 | messages in thread logs.</description> |
---|
| 289 | </property> |
---|
| 290 | |
---|
| 291 | <property> |
---|
| 292 | <name>ftp.follow.talk</name> |
---|
| 293 | <value>false</value> |
---|
| 294 | <description>Whether to log dialogue between our client and remote |
---|
| 295 | server. Useful for debugging.</description> |
---|
| 296 | </property> |
---|
| 297 | |
---|
| 298 | <!-- web db properties --> |
---|
| 299 | |
---|
| 300 | <property> |
---|
| 301 | <name>db.default.fetch.interval</name> |
---|
| 302 | <value>30</value> |
---|
| 303 | <description>(DEPRECATED) The default number of days between re-fetches of a page. |
---|
| 304 | </description> |
---|
| 305 | </property> |
---|
| 306 | |
---|
| 307 | <property> |
---|
| 308 | <name>db.fetch.interval.default</name> |
---|
| 309 | <value>2592000</value> |
---|
| 310 | <description>The default number of seconds between re-fetches of a page (30 days). |
---|
| 311 | </description> |
---|
| 312 | </property> |
---|
| 313 | |
---|
| 314 | <property> |
---|
| 315 | <name>db.fetch.interval.max</name> |
---|
| 316 | <value>7776000</value> |
---|
| 317 | <description>The maximum number of seconds between re-fetches of a page |
---|
| 318 | (90 days). After this period every page in the db will be re-tried, no |
---|
| 319 | matter what its status is. |
---|
| 320 | </description> |
---|
| 321 | </property> |
---|
| 322 | |
---|
| 323 | <property> |
---|
| 324 | <name>db.fetch.schedule.class</name> |
---|
| 325 | <value>org.apache.nutch.crawl.DefaultFetchSchedule</value> |
---|
| 326 | <description>The implementation of fetch schedule. DefaultFetchSchedule simply |
---|
| 327 | adds the original fetchInterval to the last fetch time, regardless of |
---|
| 328 | page changes.</description> |
---|
| 329 | </property> |
---|
| 330 | |
---|
| 331 | <property> |
---|
| 332 | <name>db.fetch.schedule.adaptive.inc_rate</name> |
---|
| 333 | <value>0.4</value> |
---|
| 334 | <description>If a page is unmodified, its fetchInterval will be |
---|
| 335 | increased by this rate. This value should not |
---|
| 336 | exceed 0.5, otherwise the algorithm becomes unstable.</description> |
---|
| 337 | </property> |
---|
| 338 | |
---|
| 339 | <property> |
---|
| 340 | <name>db.fetch.schedule.adaptive.dec_rate</name> |
---|
| 341 | <value>0.2</value> |
---|
| 342 | <description>If a page is modified, its fetchInterval will be |
---|
| 343 | decreased by this rate. This value should not |
---|
| 344 | exceed 0.5, otherwise the algorithm becomes unstable.</description> |
---|
| 345 | </property> |
---|
| 346 | |
---|
| 347 | <property> |
---|
| 348 | <name>db.fetch.schedule.adaptive.min_interval</name> |
---|
| 349 | <value>60.0</value> |
---|
| 350 | <description>Minimum fetchInterval, in seconds.</description> |
---|
| 351 | </property> |
---|
| 352 | |
---|
| 353 | <property> |
---|
| 354 | <name>db.fetch.schedule.adaptive.max_interval</name> |
---|
| 355 | <value>31536000.0</value> |
---|
| 356 | <description>Maximum fetchInterval, in seconds (365 days). |
---|
| 357 | NOTE: this is limited by db.fetch.interval.max. Pages with |
---|
| 358 | fetchInterval larger than db.fetch.interval.max |
---|
| 359 | will be fetched anyway.</description> |
---|
| 360 | </property> |
---|
| 361 | |
---|
| 362 | <property> |
---|
| 363 | <name>db.fetch.schedule.adaptive.sync_delta</name> |
---|
| 364 | <value>true</value> |
---|
| 365 | <description>If true, try to synchronize with the time of page change |
---|
| 366 | by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference |
---|
| 367 | between the last modification time, and the last fetch time.</description> |
---|
| 368 | </property> |
---|
| 369 | |
---|
| 370 | <property> |
---|
| 371 | <name>db.fetch.schedule.adaptive.sync_delta_rate</name> |
---|
| 372 | <value>0.3</value> |
---|
| 373 | <description>See sync_delta for description. This value should not |
---|
| 374 | exceed 0.5, otherwise the algorithm becomes unstable.</description> |
---|
| 375 | </property> |
---|
| 376 | |
---|
| 377 | <property> |
---|
| 378 | <name>db.update.additions.allowed</name> |
---|
| 379 | <value>true</value> |
---|
| 380 | <description>If true, updatedb will add newly discovered URLs, if false |
---|
| 381 | only already existing URLs in the CrawlDb will be updated and no new |
---|
| 382 | URLs will be added. |
---|
| 383 | </description> |
---|
| 384 | </property> |
---|
| 385 | |
---|
| 386 | <property> |
---|
| 387 | <name>db.ignore.internal.links</name> |
---|
| 388 | <value>true</value> |
---|
| 389 | <description>If true, when adding new links to a page, links from |
---|
| 390 | the same host are ignored. This is an effective way to limit the |
---|
| 391 | size of the link database, keeping only the highest quality |
---|
| 392 | links. |
---|
| 393 | </description> |
---|
| 394 | </property> |
---|
| 395 | |
---|
| 396 | <property> |
---|
| 397 | <name>db.ignore.external.links</name> |
---|
| 398 | <value>false</value> |
---|
| 399 | <description>If true, outlinks leading from a page to external hosts |
---|
| 400 | will be ignored. This is an effective way to limit the crawl to include |
---|
| 401 | only initially injected hosts, without creating complex URLFilters. |
---|
| 402 | </description> |
---|
| 403 | </property> |
---|
| 404 | |
---|
| 405 | <property> |
---|
| 406 | <name>db.score.injected</name> |
---|
| 407 | <value>1.0</value> |
---|
| 408 | <description>The score of new pages added by the injector. |
---|
| 409 | </description> |
---|
| 410 | </property> |
---|
| 411 | |
---|
| 412 | <property> |
---|
| 413 | <name>db.score.link.external</name> |
---|
| 414 | <value>1.0</value> |
---|
| 415 | <description>The score factor for new pages added due to a link from |
---|
| 416 | another host relative to the referencing page's score. Scoring plugins |
---|
| 417 | may use this value to affect initial scores of external links. |
---|
| 418 | </description> |
---|
| 419 | </property> |
---|
| 420 | |
---|
| 421 | <property> |
---|
| 422 | <name>db.score.link.internal</name> |
---|
| 423 | <value>1.0</value> |
---|
| 424 | <description>The score factor for pages added due to a link from the |
---|
| 425 | same host, relative to the referencing page's score. Scoring plugins |
---|
| 426 | may use this value to affect initial scores of internal links. |
---|
| 427 | </description> |
---|
| 428 | </property> |
---|
| 429 | |
---|
| 430 | <property> |
---|
| 431 | <name>db.score.count.filtered</name> |
---|
| 432 | <value>false</value> |
---|
| 433 | <description>The score value passed to newly discovered pages is |
---|
| 434 | calculated as a fraction of the original page score divided by the |
---|
| 435 | number of outlinks. If this option is false, only the outlinks that passed |
---|
| 436 | URLFilters will count, if it's true then all outlinks will count. |
---|
| 437 | </description> |
---|
| 438 | </property> |
---|
| 439 | |
---|
| 440 | <property> |
---|
| 441 | <name>db.max.inlinks</name> |
---|
| 442 | <value>10000</value> |
---|
| 443 | <description>Maximum number of Inlinks per URL to be kept in LinkDb. |
---|
| 444 | If "invertlinks" finds more inlinks than this number, only the first |
---|
| 445 | N inlinks will be stored, and the rest will be discarded. |
---|
| 446 | </description> |
---|
| 447 | </property> |
---|
| 448 | |
---|
| 449 | <property> |
---|
| 450 | <name>db.max.outlinks.per.page</name> |
---|
| 451 | <value>100</value> |
---|
| 452 | <description>The maximum number of outlinks that we'll process for a page. |
---|
| 453 | If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks |
---|
| 454 | will be processed for a page; otherwise, all outlinks will be processed. |
---|
| 455 | </description> |
---|
| 456 | </property> |
---|
| 457 | |
---|
| 458 | <property> |
---|
| 459 | <name>db.max.anchor.length</name> |
---|
| 460 | <value>100</value> |
---|
| 461 | <description>The maximum number of characters permitted in an anchor. |
---|
| 462 | </description> |
---|
| 463 | </property> |
---|
| 464 | |
---|
| 465 | <property> |
---|
| 466 | <name>db.fetch.retry.max</name> |
---|
| 467 | <value>3</value> |
---|
| 468 | <description>The maximum number of times a url that has encountered |
---|
| 469 | recoverable errors is generated for fetch.</description> |
---|
| 470 | </property> |
---|
| 471 | |
---|
| 472 | <property> |
---|
| 473 | <name>db.signature.class</name> |
---|
| 474 | <value>org.apache.nutch.crawl.MD5Signature</value> |
---|
| 475 | <description>The default implementation of a page signature. Signatures |
---|
| 476 | created with this implementation will be used for duplicate detection |
---|
| 477 | and removal.</description> |
---|
| 478 | </property> |
---|
| 479 | |
---|
| 480 | <property> |
---|
| 481 | <name>db.signature.text_profile.min_token_len</name> |
---|
| 482 | <value>2</value> |
---|
| 483 | <description>Minimum token length to be included in the signature. |
---|
| 484 | </description> |
---|
| 485 | </property> |
---|
| 486 | |
---|
| 487 | <property> |
---|
| 488 | <name>db.signature.text_profile.quant_rate</name> |
---|
| 489 | <value>0.01</value> |
---|
| 490 | <description>Profile frequencies will be rounded down to a multiple of |
---|
| 491 | QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token |
---|
| 492 | frequency. If maxFreq > 1 then QUANT will be at least 2, which means that |
---|
| 493 | for longer texts tokens with frequency 1 will always be discarded. |
---|
| 494 | </description> |
---|
| 495 | </property> |
---|
| 496 | |
---|
| 497 | <!-- generate properties --> |
---|
| 498 | |
---|
| 499 | <property> |
---|
| 500 | <name>generate.max.per.host</name> |
---|
| 501 | <value>-1</value> |
---|
| 502 | <description>The maximum number of urls per host in a single |
---|
| 503 | fetchlist. -1 if unlimited.</description> |
---|
| 504 | </property> |
---|
| 505 | |
---|
| 506 | <property> |
---|
| 507 | <name>generate.max.per.host.by.ip</name> |
---|
| 508 | <value>false</value> |
---|
| 509 | <description>If false, same host names are counted. If true, |
---|
| 510 | hosts' IP addresses are resolved and the same IP-s are counted. |
---|
| 511 | |
---|
| 512 | -+-+-+- WARNING !!! -+-+-+- |
---|
| 513 | When set to true, Generator will create a lot of DNS lookup |
---|
| 514 | requests, rapidly. This may cause a DOS attack on |
---|
| 515 | remote DNS servers, not to mention increased external traffic |
---|
| 516 | and latency. For these reasons when using this option it is |
---|
| 517 | required that a local caching DNS be used.</description> |
---|
| 518 | </property> |
---|
| 519 | |
---|
| 520 | <property> |
---|
| 521 | <name>generate.update.crawldb</name> |
---|
| 522 | <value>false</value> |
---|
| 523 | <description>For highly-concurrent environments, where several |
---|
| 524 | generate/fetch/update cycles may overlap, setting this to true ensures |
---|
| 525 | that generate will create different fetchlists even without intervening |
---|
| 526 | updatedb-s, at the cost of running an additional job to update CrawlDB. |
---|
| 527 | If false, running generate twice without intervening |
---|
| 528 | updatedb will generate identical fetchlists.</description> |
---|
| 529 | </property> |
---|
| 530 | |
---|
| 531 | <!-- fetcher properties --> |
---|
| 532 | |
---|
| 533 | <property> |
---|
| 534 | <name>fetcher.server.delay</name> |
---|
| 535 | <value>5.0</value> |
---|
| 536 | <description>The number of seconds the fetcher will delay between |
---|
| 537 | successive requests to the same server.</description> |
---|
| 538 | </property> |
---|
| 539 | |
---|
| 540 | <property> |
---|
| 541 | <name>fetcher.server.min.delay</name> |
---|
| 542 | <value>0.0</value> |
---|
| 543 | <description>The minimum number of seconds the fetcher will delay between |
---|
| 544 | successive requests to the same server. This value is applicable ONLY |
---|
| 545 | if fetcher.threads.per.host is greater than 1 (i.e. the host blocking |
---|
| 546 | is turned off).</description> |
---|
| 547 | </property> |
---|
| 548 | |
---|
| 549 | <property> |
---|
| 550 | <name>fetcher.max.crawl.delay</name> |
---|
| 551 | <value>30</value> |
---|
| 552 | <description> |
---|
| 553 | If the Crawl-Delay in robots.txt is set to greater than this value (in |
---|
| 554 | seconds) then the fetcher will skip this page, generating an error report. |
---|
| 555 | If set to -1 the fetcher will never skip such pages and will wait the |
---|
| 556 | amount of time retrieved from robots.txt Crawl-Delay, however long that |
---|
| 557 | might be. |
---|
| 558 | </description> |
---|
| 559 | </property> |
---|
| 560 | |
---|
| 561 | <property> |
---|
| 562 | <name>fetcher.threads.fetch</name> |
---|
| 563 | <value>10</value> |
---|
| 564 | <description>The number of FetcherThreads the fetcher should use. |
---|
| 565 | This also determines the maximum number of requests that are |
---|
| 566 | made at once (each FetcherThread handles one connection).</description> |
---|
| 567 | </property> |
---|
| 568 | |
---|
| 569 | <property> |
---|
| 570 | <name>fetcher.threads.per.host</name> |
---|
| 571 | <value>1</value> |
---|
| 572 | <description>This number is the maximum number of threads that |
---|
| 573 | should be allowed to access a host at one time.</description> |
---|
| 574 | </property> |
---|
| 575 | |
---|
| 576 | <property> |
---|
| 577 | <name>fetcher.threads.per.host.by.ip</name> |
---|
| 578 | <value>true</value> |
---|
| 579 | <description>If true, then fetcher will count threads by IP address, |
---|
| 580 | to which the URL's host name resolves. If false, only host name will be |
---|
| 581 | used. NOTE: this should be set to the same value as |
---|
| 582 | "generate.max.per.host.by.ip" - default settings are different only for |
---|
| 583 | reasons of backward-compatibility.</description> |
---|
| 584 | </property> |
---|
| 585 | |
---|
| 586 | <property> |
---|
| 587 | <name>fetcher.verbose</name> |
---|
| 588 | <value>false</value> |
---|
| 589 | <description>If true, fetcher will log more verbosely.</description> |
---|
| 590 | </property> |
---|
| 591 | |
---|
| 592 | <property> |
---|
| 593 | <name>fetcher.parse</name> |
---|
| 594 | <value>true</value> |
---|
| 595 | <description>If true, fetcher will parse content.</description> |
---|
| 596 | </property> |
---|
| 597 | |
---|
| 598 | <property> |
---|
| 599 | <name>fetcher.store.content</name> |
---|
| 600 | <value>true</value> |
---|
| 601 | <description>If true, fetcher will store content.</description> |
---|
| 602 | </property> |
---|
| 603 | |
---|
| 604 | <!-- indexer properties --> |
---|
| 605 | |
---|
| 606 | <property> |
---|
| 607 | <name>indexer.score.power</name> |
---|
| 608 | <value>0.5</value> |
---|
| 609 | <description>Determines the power of link analysis scores. Each |
---|
| 610 | page's boost is set to <i>score<sup>scorePower</sup></i> where |
---|
| 611 | <i>score</i> is its link analysis score and <i>scorePower</i> is the |
---|
| 612 | value of this parameter. This is compiled into indexes, so, when |
---|
| 613 | this is changed, pages must be re-indexed for it to take |
---|
| 614 | effect.</description> |
---|
| 615 | </property> |
---|
| 616 | |
---|
| 617 | <property> |
---|
| 618 | <name>indexer.max.title.length</name> |
---|
| 619 | <value>100</value> |
---|
| 620 | <description>The maximum number of characters of a title that are indexed. |
---|
| 621 | </description> |
---|
| 622 | </property> |
---|
| 623 | |
---|
| 624 | <property> |
---|
| 625 | <name>indexer.max.tokens</name> |
---|
| 626 | <value>10000</value> |
---|
| 627 | <description> |
---|
| 628 | The maximum number of tokens that will be indexed for a single field |
---|
| 629 | in a document. This limits the amount of memory required for |
---|
| 630 | indexing, so that collections with very large files will not crash |
---|
| 631 | the indexing process by running out of memory. |
---|
| 632 | |
---|
| 633 | Note that this effectively truncates large documents, excluding |
---|
| 634 | from the index tokens that occur further in the document. If you |
---|
| 635 | know your source documents are large, be sure to set this value |
---|
| 636 | high enough to accommodate the expected size. If you set it to |
---|
| 637 | -1, then the only limit is your memory, but you should anticipate |
---|
| 638 | an OutOfMemoryError. |
---|
| 639 | </description> |
---|
| 640 | </property> |
---|
| 641 | |
---|
| 642 | <property> |
---|
| 643 | <name>indexer.mergeFactor</name> |
---|
| 644 | <value>50</value> |
---|
| 645 | <description>The factor that determines the frequency of Lucene segment |
---|
| 646 | merges. This must not be less than 2, higher values increase indexing |
---|
| 647 | speed but lead to increased RAM usage, and increase the number of |
---|
| 648 | open file handles (which may lead to "Too many open files" errors). |
---|
| 649 | NOTE: the "segments" here have nothing to do with Nutch segments, they |
---|
| 650 | are a low-level data unit used by Lucene. |
---|
| 651 | </description> |
---|
| 652 | </property> |
---|
| 653 | |
---|
| 654 | <property> |
---|
| 655 | <name>indexer.minMergeDocs</name> |
---|
| 656 | <value>50</value> |
---|
| 657 | <description>This number determines the minimum number of Lucene |
---|
| 658 | Documents buffered in memory between Lucene segment merges. Larger |
---|
| 659 | values increase indexing speed and increase RAM usage. |
---|
| 660 | </description> |
---|
| 661 | </property> |
---|
| 662 | |
---|
| 663 | <property> |
---|
| 664 | <name>indexer.maxMergeDocs</name> |
---|
| 665 | <value>2147483647</value> |
---|
| 666 | <description>This number determines the maximum number of Lucene |
---|
| 667 | Documents to be merged into a new Lucene segment. Larger values |
---|
| 668 | increase batch indexing speed and reduce the number of Lucene segments, |
---|
| 669 | which reduces the number of open file handles; however, this also |
---|
| 670 | decreases incremental indexing performance. |
---|
| 671 | </description> |
---|
| 672 | </property> |
---|
| 673 | |
---|
| 674 | <property> |
---|
| 675 | <name>indexer.termIndexInterval</name> |
---|
| 676 | <value>128</value> |
---|
| 677 | <description>Determines the fraction of terms which Lucene keeps in |
---|
| 678 | RAM when searching, to facilitate random-access. Smaller values use |
---|
| 679 | more memory but make searches somewhat faster. Larger values use |
---|
| 680 | less memory but make searches somewhat slower. |
---|
| 681 | </description> |
---|
| 682 | </property> |
---|
| 683 | |
---|
| 684 | <!-- indexingfilter plugin properties --> |
---|
| 685 | |
---|
| 686 | <property> |
---|
| 687 | <name>indexingfilter.order</name> |
---|
| 688 | <value></value> |
---|
| 689 | <description>The order by which index filters are applied. |
---|
| 690 | If empty, all available index filters (as dictated by properties |
---|
| 691 | plugin.includes and plugin.excludes below) are loaded and applied in system |
---|
| 692 | defined order. If not empty, only named filters are loaded and applied |
---|
| 693 | in given order. For example, if this property has value: |
---|
| 694 | org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter |
---|
| 695 | then BasicIndexingFilter is applied first, and MoreIndexingFilter second. |
---|
| 696 | |
---|
| 697 | Filter ordering may affect the result if one filter depends on the output of |
---|
| 698 | another filter. |
---|
| 699 | </description> |
---|
| 700 | </property> |
---|
| 701 | |
---|
| 702 | |
---|
| 703 | <!-- analysis properties --> |
---|
| 704 | |
---|
| 705 | <property> |
---|
| 706 | <name>analysis.common.terms.file</name> |
---|
| 707 | <value>common-terms.utf8</value> |
---|
| 708 | <description>The name of a file containing a list of common terms |
---|
| 709 | that should be indexed in n-grams.</description> |
---|
| 710 | </property> |
---|
| 711 | |
---|
| 712 | <!-- searcher properties --> |
---|
| 713 | |
---|
| 714 | <property> |
---|
| 715 | <name>searcher.dir</name> |
---|
| 716 | <value>crawl</value> |
---|
| 717 | <description> |
---|
| 718 | Path to root of crawl. This directory is searched (in |
---|
| 719 | order) for either the file search-servers.txt, containing a list of |
---|
| 720 | distributed search servers, or the directory "index" containing |
---|
| 721 | merged indexes, or the directory "segments" containing segment |
---|
| 722 | indexes. |
---|
| 723 | </description> |
---|
| 724 | </property> |
---|
| 725 | |
---|
| 726 | <property> |
---|
| 727 | <name>searcher.filter.cache.size</name> |
---|
| 728 | <value>16</value> |
---|
| 729 | <description> |
---|
| 730 | Maximum number of filters to cache. Filters can accelerate certain |
---|
| 731 | field-based queries, like language, document format, etc. Each |
---|
| 732 | filter requires one bit of RAM per page. So, with a 10 million page |
---|
| 733 | index, a cache size of 16 consumes two bytes per page, or 20MB. |
---|
| 734 | </description> |
---|
| 735 | </property> |
---|
| 736 | |
---|
| 737 | <property> |
---|
| 738 | <name>searcher.filter.cache.threshold</name> |
---|
| 739 | <value>0.05</value> |
---|
| 740 | <description> |
---|
| 741 | Filters are cached when their term is matched by more than this |
---|
| 742 | fraction of pages. For example, with a threshold of 0.05, and 10 |
---|
| 743 | million pages, the term must match more than 1/20, or 500,000 pages. |
---|
| 744 | So, if out of 10 million pages, 50% of pages are in English, and 2% |
---|
| 745 | are in Finnish, then, with a threshold of 0.05, searches for |
---|
| 746 | "lang:en" will use a cached filter, while searches for "lang:fi" |
---|
| 747 | will score all 200,000 Finnish documents. |
---|
| 748 | </description> |
---|
| 749 | </property> |
---|
| 750 | |
---|
| 751 | <property> |
---|
| 752 | <name>searcher.hostgrouping.rawhits.factor</name> |
---|
| 753 | <value>2.0</value> |
---|
| 754 | <description> |
---|
| 755 | A factor that is used to determine the number of raw hits |
---|
| 756 | initially fetched, before host grouping is done. |
---|
| 757 | </description> |
---|
| 758 | </property> |
---|
| 759 | |
---|
| 760 | <property> |
---|
| 761 | <name>searcher.summary.context</name> |
---|
| 762 | <value>5</value> |
---|
| 763 | <description> |
---|
| 764 | The number of context terms to display preceding and following |
---|
| 765 | matching terms in a hit summary. |
---|
| 766 | </description> |
---|
| 767 | </property> |
---|
| 768 | |
---|
| 769 | <property> |
---|
| 770 | <name>searcher.summary.length</name> |
---|
| 771 | <value>20</value> |
---|
| 772 | <description> |
---|
| 773 | The total number of terms to display in a hit summary. |
---|
| 774 | </description> |
---|
| 775 | </property> |
---|
| 776 | |
---|
| 777 | <property> |
---|
| 778 | <name>searcher.max.hits</name> |
---|
| 779 | <value>-1</value> |
---|
| 780 | <description>If positive, search stops after this many hits are |
---|
| 781 | found. Setting this to small, positive values (e.g., 1000) can make |
---|
| 782 | searches much faster. With a sorted index, the quality of the hits |
---|
| 783 | suffers little.</description> |
---|
| 784 | </property> |
---|
| 785 | |
---|
| 786 | <property> |
---|
| 787 | <name>searcher.max.time.tick_count</name> |
---|
| 788 | <value>-1</value> |
---|
| 789 | <description>If a positive value is defined here, limit search time for |
---|
| 790 | every request to this number of elapsed ticks (see the tick_length |
---|
| 791 | property below). The total maximum time for any search request will be |
---|
| 792 | then limited to tick_count * tick_length milliseconds. When search time |
---|
| 793 | is exceeded, partial results will be returned, and the total number of |
---|
| 794 | hits will be estimated. |
---|
| 795 | </description> |
---|
| 796 | </property> |
---|
| 797 | |
---|
| 798 | <property> |
---|
| 799 | <name>searcher.max.time.tick_length</name> |
---|
| 800 | <value>200</value> |
---|
| 801 | <description>The number of milliseconds between ticks. Larger values |
---|
| 802 | reduce the timer granularity (precision). Smaller values bring more |
---|
| 803 | overhead. |
---|
| 804 | </description> |
---|
| 805 | </property> |
---|
| 806 | |
---|
| 807 | <property> |
---|
| 808 | <name>searcher.num.handlers</name> |
---|
| 809 | <value>10</value> |
---|
| 810 | <description>The number of handlers for the distributed search server. |
---|
| 811 | </description> |
---|
| 812 | </property> |
---|
| 813 | |
---|
| 814 | <property> |
---|
| 815 | <name>searcher.max.hits.per.page</name> |
---|
| 816 | <value>1000</value> |
---|
| 817 | <description> The maximum number of hits to show per page. -1 if |
---|
| 818 | unlimited. If the number of hits requested by the user (via |
---|
| 819 | hitsPerPage parameter in the query string) is more than the value |
---|
| 820 | specified in this property, then this value is assumed as the number |
---|
| 821 | of hits per page. |
---|
| 822 | </description> |
---|
| 823 | </property> |
---|
| 824 | |
---|
| 825 | <!-- URL normalizer properties --> |
---|
| 826 | |
---|
| 827 | <property> |
---|
| 828 | <name>urlnormalizer.order</name> |
---|
| 829 | <value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value> |
---|
| 830 | <description>Order in which normalizers will run. If any of these isn't |
---|
| 831 | activated it will be silently skipped. If other normalizers not on the |
---|
| 832 | list are activated, they will run in random order after the ones |
---|
| 833 | specified here are run. |
---|
| 834 | </description> |
---|
| 835 | </property> |
---|
| 836 | |
---|
| 837 | <property> |
---|
| 838 | <name>urlnormalizer.regex.file</name> |
---|
| 839 | <value>regex-normalize.xml</value> |
---|
| 840 | <description>Name of the config file used by the RegexURLNormalizer class. |
---|
| 841 | </description> |
---|
| 842 | </property> |
---|
| 843 | |
---|
| 844 | <property> |
---|
| 845 | <name>urlnormalizer.loop.count</name> |
---|
| 846 | <value>1</value> |
---|
| 847 | <description>Optionally loop through normalizers several times, to make |
---|
| 848 | sure that all transformations have been performed. |
---|
| 849 | </description> |
---|
| 850 | </property> |
---|
| 851 | |
---|
| 852 | <!-- mime properties --> |
---|
| 853 | |
---|
| 854 | <property> |
---|
| 855 | <name>mime.types.file</name> |
---|
| 856 | <value>tika-mimetypes.xml</value> |
---|
| 857 | <description>Name of file in CLASSPATH containing filename extension and |
---|
| 858 | magic sequence to mime types mapping information</description> |
---|
| 859 | </property> |
---|
| 860 | |
---|
| 861 | <property> |
---|
| 862 | <name>mime.type.magic</name> |
---|
| 863 | <value>true</value> |
---|
| 864 | <description>Defines if the mime content type detector uses magic resolution. |
---|
| 865 | </description> |
---|
| 866 | </property> |
---|
| 867 | |
---|
| 868 | <!-- plugin properties --> |
---|
| 869 | |
---|
| 870 | <property> |
---|
| 871 | <name>plugin.folders</name> |
---|
| 872 | <value>plugins</value> |
---|
| 873 | <description>Directories where nutch plugins are located. Each |
---|
| 874 | element may be a relative or absolute path. If absolute, it is used |
---|
| 875 | as is. If relative, it is searched for on the classpath.</description> |
---|
| 876 | </property> |
---|
| 877 | |
---|
| 878 | <property> |
---|
| 879 | <name>plugin.auto-activation</name> |
---|
| 880 | <value>true</value> |
---|
| 881 | <description>Defines if some plugins that are not activated regarding |
---|
| 882 | the plugin.includes and plugin.excludes properties must be automatically |
---|
| 883 | activated if they are needed by some activated plugins. |
---|
| 884 | </description> |
---|
| 885 | </property> |
---|
| 886 | |
---|
| 887 | <property> |
---|
| 888 | <name>plugin.includes</name> |
---|
| 889 | <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> |
---|
| 890 | <description>Regular expression naming plugin directory names to |
---|
| 891 | include. Any plugin not matching this expression is excluded. |
---|
| 892 | In any case you need to include at least the nutch-extensionpoints plugin. By |
---|
| 893 | default Nutch includes crawling just HTML and plain text via HTTP, |
---|
| 894 | and basic indexing and search plugins. In order to use HTTPS please enable |
---|
| 895 | protocol-httpclient, but be aware of possible intermittent problems with the |
---|
| 896 | underlying commons-httpclient library. |
---|
| 897 | </description> |
---|
| 898 | </property> |
---|
| 899 | |
---|
| 900 | <property> |
---|
| 901 | <name>plugin.excludes</name> |
---|
| 902 | <value></value> |
---|
| 903 | <description>Regular expression naming plugin directory names to exclude. |
---|
| 904 | </description> |
---|
| 905 | </property> |
---|
| 906 | |
---|
| 907 | <!-- parser properties --> |
---|
| 908 | |
---|
| 909 | <property> |
---|
| 910 | <name>parse.plugin.file</name> |
---|
| 911 | <value>parse-plugins.xml</value> |
---|
| 912 | <description>The name of the file that defines the associations between |
---|
| 913 | content-types and parsers.</description> |
---|
| 914 | </property> |
---|
| 915 | |
---|
| 916 | <property> |
---|
| 917 | <name>parser.character.encoding.default</name> |
---|
| 918 | <value>windows-1252</value> |
---|
| 919 | <description>The character encoding to fall back to when no other information |
---|
| 920 | is available</description> |
---|
| 921 | </property> |
---|
| 922 | |
---|
| 923 | <property> |
---|
| 924 | <name>encodingdetector.charset.min.confidence</name> |
---|
| 925 | <value>-1</value> |
---|
| 926 | <description>An integer between 0 and 100 indicating the minimum confidence value |
---|
| 927 | for charset auto-detection. Any negative value disables auto-detection. |
---|
| 928 | </description> |
---|
| 929 | </property> |
---|
| 930 | |
---|
| 931 | <property> |
---|
| 932 | <name>parser.caching.forbidden.policy</name> |
---|
| 933 | <value>content</value> |
---|
| 934 | <description>If a site (or a page) requests through its robot metatags |
---|
| 935 | that it should not be shown as cached content, apply this policy. Currently |
---|
| 936 | three keywords are recognized: "none" ignores any "noarchive" directives. |
---|
| 937 | "content" doesn't show the content, but shows summaries (snippets). |
---|
| 938 | "all" doesn't show either content or summaries.</description> |
---|
| 939 | </property> |
---|
| 940 | |
---|
| 941 | |
---|
| 942 | <property> |
---|
| 943 | <name>parser.html.impl</name> |
---|
| 944 | <value>neko</value> |
---|
| 945 | <description>HTML Parser implementation. Currently the following keywords |
---|
| 946 | are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup. |
---|
| 947 | </description> |
---|
| 948 | </property> |
---|
| 949 | |
---|
| 950 | <property> |
---|
| 951 | <name>parser.html.form.use_action</name> |
---|
| 952 | <value>false</value> |
---|
| 953 | <description>If true, HTML parser will collect URLs from form action |
---|
| 954 | attributes. This may lead to undesirable behavior (submitting empty |
---|
| 955 | forms during next fetch cycle). If false, form action attribute will |
---|
| 956 | be ignored.</description> |
---|
| 957 | </property> |
---|
| 958 | |
---|
| 959 | <property> |
---|
| 960 | <name>parser.html.outlinks.ignore_tags</name> |
---|
| 961 | <value></value> |
---|
| 962 | <description>Comma separated list of HTML tags, from which outlinks |
---|
| 963 | shouldn't be extracted. Nutch takes links from: a, area, form, frame, |
---|
| 964 | iframe, script, link, img. If you add any of those tags here, it |
---|
| 965 | won't be taken. Default is empty list. Probably reasonable value |
---|
| 966 | for most people would be "img,script,link".</description> |
---|
| 967 | </property> |
---|
| 968 | |
---|
| 969 | |
---|
| 970 | <!-- urlfilter plugin properties --> |
---|
| 971 | |
---|
| 972 | <property> |
---|
| 973 | <name>urlfilter.domain.file</name> |
---|
| 974 | <value>domain-urlfilter.txt</value> |
---|
| 975 | <description>Name of file on CLASSPATH containing either top level domains or |
---|
| 976 | hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description> |
---|
| 977 | </property> |
---|
| 978 | |
---|
| 979 | <property> |
---|
| 980 | <name>urlfilter.regex.file</name> |
---|
| 981 | <value>regex-urlfilter.txt</value> |
---|
| 982 | <description>Name of file on CLASSPATH containing regular expressions |
---|
| 983 | used by urlfilter-regex (RegexURLFilter) plugin.</description> |
---|
| 984 | </property> |
---|
| 985 | |
---|
| 986 | <property> |
---|
| 987 | <name>urlfilter.automaton.file</name> |
---|
| 988 | <value>automaton-urlfilter.txt</value> |
---|
| 989 | <description>Name of file on CLASSPATH containing regular expressions |
---|
| 990 | used by urlfilter-automaton (AutomatonURLFilter) plugin.</description> |
---|
| 991 | </property> |
---|
| 992 | |
---|
| 993 | <property> |
---|
| 994 | <name>urlfilter.prefix.file</name> |
---|
| 995 | <value>prefix-urlfilter.txt</value> |
---|
| 996 | <description>Name of file on CLASSPATH containing url prefixes |
---|
| 997 | used by urlfilter-prefix (PrefixURLFilter) plugin.</description> |
---|
| 998 | </property> |
---|
| 999 | |
---|
| 1000 | <property> |
---|
| 1001 | <name>urlfilter.suffix.file</name> |
---|
| 1002 | <value>suffix-urlfilter.txt</value> |
---|
| 1003 | <description>Name of file on CLASSPATH containing url suffixes |
---|
| 1004 | used by urlfilter-suffix (SuffixURLFilter) plugin.</description> |
---|
| 1005 | </property> |
---|
| 1006 | |
---|
| 1007 | <property> |
---|
| 1008 | <name>urlfilter.order</name> |
---|
| 1009 | <value></value> |
---|
| 1010 | <description>The order by which url filters are applied. |
---|
| 1011 | If empty, all available url filters (as dictated by properties |
---|
| 1012 | plugin.includes and plugin.excludes above) are loaded and applied in system |
---|
| 1013 | defined order. If not empty, only named filters are loaded and applied |
---|
| 1014 | in given order. For example, if this property has value: |
---|
| 1015 | org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter |
---|
| 1016 | then RegexURLFilter is applied first, and PrefixURLFilter second. |
---|
| 1017 | Since all filters are AND'ed, filter ordering does not affect the |
---|
| 1018 | end result, but it may have performance implications, depending |
---|
| 1019 | on the relative cost of the filters. |
---|
| 1020 | </description> |
---|
| 1021 | </property> |
---|
| 1022 | |
---|
| 1023 | <!-- scoring filters properties --> |
---|
| 1024 | |
---|
| 1025 | <property> |
---|
| 1026 | <name>scoring.filter.order</name> |
---|
| 1027 | <value></value> |
---|
| 1028 | <description>The order in which scoring filters are applied. |
---|
| 1029 | This may be left empty (in which case all available scoring |
---|
| 1030 | filters will be applied in the order defined in plugin.includes |
---|
| 1031 | and plugin.excludes), or a space separated list of implementation |
---|
| 1032 | classes. |
---|
| 1033 | </description> |
---|
| 1034 | </property> |
---|
| 1035 | |
---|
| 1036 | <!-- clustering extension properties --> |
---|
| 1037 | |
---|
| 1038 | <property> |
---|
| 1039 | <name>extension.clustering.hits-to-cluster</name> |
---|
| 1040 | <value>100</value> |
---|
| 1041 | <description>Number of snippets retrieved for the clustering extension |
---|
| 1042 | if clustering extension is available and user requested results |
---|
| 1043 | to be clustered.</description> |
---|
| 1044 | </property> |
---|
| 1045 | |
---|
| 1046 | <property> |
---|
| 1047 | <name>extension.clustering.extension-name</name> |
---|
| 1048 | <value></value> |
---|
| 1049 | <description>Use the specified online clustering extension. If empty, |
---|
| 1050 | the first available extension will be used. The "name" here refers to an 'id' |
---|
| 1051 | attribute of the 'implementation' element in the plugin descriptor XML |
---|
| 1052 | file.</description> |
---|
| 1053 | </property> |
---|
| 1054 | |
---|
| 1055 | <!-- ontology extension properties --> |
---|
| 1056 | |
---|
| 1057 | <property> |
---|
| 1058 | <name>extension.ontology.extension-name</name> |
---|
| 1059 | <value></value> |
---|
| 1060 | <description>Use the specified online ontology extension. If empty, |
---|
| 1061 | the first available extension will be used. The "name" here refers to an 'id' |
---|
| 1062 | attribute of the 'implementation' element in the plugin descriptor XML |
---|
| 1063 | file.</description> |
---|
| 1064 | </property> |
---|
| 1065 | |
---|
| 1066 | <property> |
---|
| 1067 | <name>extension.ontology.urls</name> |
---|
| 1068 | <value> |
---|
| 1069 | </value> |
---|
| 1070 | <description>Urls of owl files, separated by spaces, such as |
---|
| 1071 | http://www.example.com/ontology/time.owl |
---|
| 1072 | http://www.example.com/ontology/space.owl |
---|
| 1073 | http://www.example.com/ontology/wine.owl |
---|
| 1074 | Or |
---|
| 1075 | file:/ontology/time.owl |
---|
| 1076 | file:/ontology/space.owl |
---|
| 1077 | file:/ontology/wine.owl |
---|
| 1078 | You have to make sure each url is valid. |
---|
| 1079 | By default, there is no owl file, so query refinement based on ontology |
---|
| 1080 | is silently ignored. |
---|
| 1081 | </description> |
---|
| 1082 | </property> |
---|
| 1083 | |
---|
| 1084 | <!-- query-basic plugin properties --> |
---|
| 1085 | |
---|
| 1086 | <property> |
---|
| 1087 | <name>query.url.boost</name> |
---|
| 1088 | <value>4.0</value> |
---|
| 1089 | <description> Used as a boost for url field in Lucene query. |
---|
| 1090 | </description> |
---|
| 1091 | </property> |
---|
| 1092 | |
---|
| 1093 | <property> |
---|
| 1094 | <name>query.anchor.boost</name> |
---|
| 1095 | <value>2.0</value> |
---|
| 1096 | <description> Used as a boost for anchor field in Lucene query. |
---|
| 1097 | </description> |
---|
| 1098 | </property> |
---|
| 1099 | |
---|
| 1100 | <property> |
---|
| 1101 | <name>query.title.boost</name> |
---|
| 1102 | <value>1.5</value> |
---|
| 1103 | <description> Used as a boost for title field in Lucene query. |
---|
| 1104 | </description> |
---|
| 1105 | </property> |
---|
| 1106 | |
---|
| 1107 | <property> |
---|
| 1108 | <name>query.host.boost</name> |
---|
| 1109 | <value>2.0</value> |
---|
| 1110 | <description> Used as a boost for host field in Lucene query. |
---|
| 1111 | </description> |
---|
| 1112 | </property> |
---|
| 1113 | |
---|
| 1114 | <property> |
---|
| 1115 | <name>query.phrase.boost</name> |
---|
| 1116 | <value>1.0</value> |
---|
| 1117 | <description> Used as a boost for phrase in Lucene query. |
---|
| 1118 | Multiplied by the boost of the field in which the phrase is matched. |
---|
| 1119 | </description> |
---|
| 1120 | </property> |
---|
| 1121 | |
---|
| 1122 | <!-- |
---|
| 1123 | <property> |
---|
| 1124 | <name>query.basic.description.boost</name> |
---|
| 1125 | <value>1.0</value> |
---|
| 1126 | <description> Declares a custom field and its boost to be added to the default fields of the Lucene query. |
---|
| 1127 | </description> |
---|
| 1128 | </property> |
---|
| 1129 | --> |
---|
| 1130 | |
---|
| 1131 | <!-- creative-commons plugin properties --> |
---|
| 1132 | |
---|
| 1133 | <property> |
---|
| 1134 | <name>query.cc.boost</name> |
---|
| 1135 | <value>0.0</value> |
---|
| 1136 | <description> Used as a boost for cc field in Lucene query. |
---|
| 1137 | </description> |
---|
| 1138 | </property> |
---|
| 1139 | |
---|
| 1140 | <!-- query-more plugin properties --> |
---|
| 1141 | |
---|
| 1142 | <property> |
---|
| 1143 | <name>query.type.boost</name> |
---|
| 1144 | <value>0.0</value> |
---|
| 1145 | <description> Used as a boost for type field in Lucene query. |
---|
| 1146 | </description> |
---|
| 1147 | </property> |
---|
| 1148 | |
---|
| 1149 | <!-- query-site plugin properties --> |
---|
| 1150 | |
---|
| 1151 | <property> |
---|
| 1152 | <name>query.site.boost</name> |
---|
| 1153 | <value>0.0</value> |
---|
| 1154 | <description> Used as a boost for site field in Lucene query. |
---|
| 1155 | </description> |
---|
| 1156 | </property> |
---|
| 1157 | |
---|
| 1158 | <!-- microformats-reltag plugin properties --> |
---|
| 1159 | |
---|
| 1160 | <property> |
---|
| 1161 | <name>query.tag.boost</name> |
---|
| 1162 | <value>1.0</value> |
---|
| 1163 | <description> Used as a boost for tag field in Lucene query. |
---|
| 1164 | </description> |
---|
| 1165 | </property> |
---|
| 1166 | |
---|
| 1167 | <!-- language-identifier plugin properties --> |
---|
| 1168 | |
---|
| 1169 | <property> |
---|
| 1170 | <name>lang.ngram.min.length</name> |
---|
| 1171 | <value>1</value> |
---|
| 1172 | <description> The minimum size of ngrams to use to identify |
---|
| 1173 | language (must be between 1 and lang.ngram.max.length). |
---|
| 1174 | The larger the range between lang.ngram.min.length and |
---|
| 1175 | lang.ngram.max.length, the better the identification, but |
---|
| 1176 | the slower it is. |
---|
| 1177 | </description> |
---|
| 1178 | </property> |
---|
| 1179 | |
---|
| 1180 | <property> |
---|
| 1181 | <name>lang.ngram.max.length</name> |
---|
| 1182 | <value>4</value> |
---|
| 1183 | <description> The maximum size of ngrams to use to identify |
---|
| 1184 | language (must be between lang.ngram.min.length and 4). |
---|
| 1185 | The larger the range between lang.ngram.min.length and |
---|
| 1186 | lang.ngram.max.length, the better the identification, but |
---|
| 1187 | the slower it is. |
---|
| 1188 | </description> |
---|
| 1189 | </property> |
---|
| 1190 | |
---|
| 1191 | <property> |
---|
| 1192 | <name>lang.analyze.max.length</name> |
---|
| 1193 | <value>2048</value> |
---|
| 1194 | <description> The maximum number of bytes of data to use to identify |
---|
| 1195 | the language (0 means full content analysis). |
---|
| 1196 | The larger this value, the better the analysis, but the |
---|
| 1197 | slower it is. |
---|
| 1198 | </description> |
---|
| 1199 | </property> |
---|
| 1200 | |
---|
| 1201 | <property> |
---|
| 1202 | <name>query.lang.boost</name> |
---|
| 1203 | <value>0.0</value> |
---|
| 1204 | <description> Used as a boost for lang field in Lucene query. |
---|
| 1205 | </description> |
---|
| 1206 | </property> |
---|
| 1207 | |
---|
| 1208 | <!-- Temporary Hadoop 0.17.x workaround. --> |
---|
| 1209 | |
---|
| 1210 | <property> |
---|
| 1211 | <name>hadoop.job.history.user.location</name> |
---|
| 1212 | <value>${hadoop.log.dir}/history/user</value> |
---|
| 1213 | <description>Hadoop 0.17.x comes with a default setting to create |
---|
| 1214 | user logs inside the output path of the job. This breaks some |
---|
| 1215 | Hadoop classes, which expect the output to contain only |
---|
| 1216 | part-XXXXX files. This setting changes the output to a |
---|
| 1217 | subdirectory of the regular log directory. |
---|
| 1218 | </description> |
---|
| 1219 | </property> |
---|
| 1220 | |
---|
| 1221 | <!-- response writer properties --> |
---|
| 1222 | |
---|
| 1223 | <property> |
---|
| 1224 | <name>search.response.default.type</name> |
---|
| 1225 | <value>xml</value> |
---|
| 1226 | <description> |
---|
| 1227 | The default response type returned if none is specified. |
---|
| 1228 | </description> |
---|
| 1229 | </property> |
---|
| 1230 | |
---|
| 1231 | <property> |
---|
| 1232 | <name>search.response.default.lang</name> |
---|
| 1233 | <value>en</value> |
---|
| 1234 | <description> |
---|
| 1235 | The default response language if none is specified. |
---|
| 1236 | </description> |
---|
| 1237 | </property> |
---|
| 1238 | |
---|
| 1239 | <property> |
---|
| 1240 | <name>search.response.default.numrows</name> |
---|
| 1241 | <value>10</value> |
---|
| 1242 | <description> |
---|
| 1243 | The default number of rows to return if none is specified. |
---|
| 1244 | </description> |
---|
| 1245 | </property> |
---|
| 1246 | |
---|
| 1247 | <property> |
---|
| 1248 | <name>search.response.default.dedupfield</name> |
---|
| 1249 | <value>site</value> |
---|
| 1250 | <description> |
---|
| 1251 | The default dedup field if none is specified. |
---|
| 1252 | </description> |
---|
| 1253 | </property> |
---|
| 1254 | |
---|
| 1255 | <property> |
---|
| 1256 | <name>search.response.default.numdupes</name> |
---|
| 1257 | <value>1</value> |
---|
| 1258 | <description> |
---|
| 1259 | The default number of duplicates returned if none is specified. |
---|
| 1260 | </description> |
---|
| 1261 | </property> |
---|
| 1262 | |
---|
| 1263 | <property> |
---|
| 1264 | <name>searcher.response.maxage</name> |
---|
| 1265 | <value>86400</value> |
---|
| 1266 | <description> |
---|
| 1267 | The maxage of a response in seconds. Used in caching headers. |
---|
| 1268 | </description> |
---|
| 1269 | </property> |
---|
| 1270 | |
---|
| 1271 | <property> |
---|
| 1272 | <name>searcher.response.prettyprint</name> |
---|
| 1273 | <value>true</value> |
---|
| 1274 | <description> |
---|
| 1275 | Should the response output be pretty printed. Setting to true enables better |
---|
| 1276 | debugging, false removes unneeded spaces and gives better throughput. |
---|
| 1277 | </description> |
---|
| 1278 | </property> |
---|
| 1279 | |
---|
| 1280 | </configuration> |
---|