[66] | 1 | <?xml version="1.0" encoding="UTF-8" ?> |
---|
| 2 | <!-- |
---|
| 3 | Licensed to the Apache Software Foundation (ASF) under one or |
---|
| 4 | more contributor license agreements. See the NOTICE file |
---|
| 5 | distributed with this work for additional information regarding |
---|
| 6 | copyright ownership. The ASF licenses this file to You under the |
---|
| 7 | Apache License, Version 2.0 (the "License"); you may not use |
---|
| 8 | this file except in compliance with the License. You may obtain |
---|
| 9 | a copy of the License at |
---|
| 10 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by |
---|
| 11 | applicable law or agreed to in writing, software distributed |
---|
| 12 | under the License is distributed on an "AS IS" BASIS, WITHOUT |
---|
| 13 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
| 14 | See the License for the specific language governing permissions |
---|
| 15 | and limitations under the License. |
---|
| 16 | --> |
---|
| 17 | <!-- |
---|
| 18 | Description: This document contains the Solr schema definition to be |
---|
| 19 | used with the Solr integration currently built into Nutch. See |
---|
| 20 | https://issues.apache.org/jira/browse/NUTCH-442 and |
---|
| 21 | https://issues.apache.org/jira/browse/NUTCH-699 for more info. |
---|
| 22 | --> |
---|
| 23 | <schema name="nutch" version="1.1"> |
---|
| 24 | <types> <!-- Field types referenced by the type attribute of the field declarations. --> |
---|
| 25 | <fieldType name="string" class="solr.StrField" |
---|
| 26 | sortMissingLast="true" omitNorms="true"/> <!-- plain string type: no analyzer chain is configured --> |
---|
| 27 | <fieldType name="long" class="solr.LongField" |
---|
| 28 | omitNorms="true"/> |
---|
| 29 | <fieldType name="float" class="solr.FloatField" |
---|
| 30 | omitNorms="true"/> |
---|
| 31 | <fieldType name="text" class="solr.TextField" |
---|
| 32 | positionIncrementGap="100"> <!-- analyzed type used by the content and title fields --> |
---|
| 33 | <analyzer> <!-- single analyzer with no type attribute; presumably applied at both index and query time (confirm) --> |
---|
| 34 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
| 35 | <filter class="solr.StopFilterFactory" |
---|
| 36 | ignoreCase="true" words="stopwords.txt"/> <!-- stop word list is read from stopwords.txt --> |
---|
| 37 | <filter class="solr.WordDelimiterFilterFactory" |
---|
| 38 | generateWordParts="1" generateNumberParts="1" |
---|
| 39 | catenateWords="1" catenateNumbers="1" catenateAll="0" |
---|
| 40 | splitOnCaseChange="1"/> |
---|
| 41 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
| 42 | <filter class="solr.EnglishPorterFilterFactory" |
---|
| 43 | protected="protwords.txt"/> <!-- presumably words listed in protwords.txt are exempt from stemming; confirm against the Solr filter documentation --> |
---|
| 44 | <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
---|
| 45 | </analyzer> |
---|
| 46 | </fieldType> |
---|
| 47 | <fieldType name="url" class="solr.TextField" |
---|
| 48 | positionIncrementGap="100"> <!-- analyzed type used by the host and url fields; no stop word or stemming filter is configured --> |
---|
| 49 | <analyzer> |
---|
| 50 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
| 51 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
| 52 | <filter class="solr.WordDelimiterFilterFactory" |
---|
| 53 | generateWordParts="1" generateNumberParts="1"/> |
---|
| 54 | <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
---|
| 55 | </analyzer> |
---|
| 56 | </fieldType> |
---|
| 57 | </types> |
---|
| 58 | <fields> |
---|
| 59 | <field name="id" type="string" stored="true" indexed="true"/> <!-- declared as the uniqueKey and populated from url by the copyField rule at the end of this schema --> |
---|
| 60 | |
---|
| 61 | <!-- core fields: each is stored but not indexed, so retrievable but not searchable --> |
---|
| 62 | <field name="segment" type="string" stored="true" indexed="false"/> |
---|
| 63 | <field name="digest" type="string" stored="true" indexed="false"/> |
---|
| 64 | <field name="boost" type="float" stored="true" indexed="false"/> |
---|
| 65 | |
---|
| 66 | <!-- fields for index-basic plugin: host, site and content are indexed only; cache and tstamp are stored only; url is the one required field --> |
---|
| 67 | <field name="host" type="url" stored="false" indexed="true"/> |
---|
| 68 | <field name="site" type="string" stored="false" indexed="true"/> |
---|
| 69 | <field name="url" type="url" stored="true" indexed="true" |
---|
| 70 | required="true"/> |
---|
| 71 | <field name="content" type="text" stored="false" indexed="true"/> |
---|
| 72 | <field name="title" type="text" stored="true" indexed="true"/> |
---|
| 73 | <field name="cache" type="string" stored="true" indexed="false"/> |
---|
| 74 | <field name="tstamp" type="long" stored="true" indexed="false"/> |
---|
| 75 | |
---|
| 76 | <!-- fields for index-anchor plugin: multi-valued, using the string type (no analyzer chain) --> |
---|
| 77 | <field name="anchor" type="string" stored="true" indexed="true" |
---|
| 78 | multiValued="true"/> |
---|
| 79 | |
---|
| 80 | <!-- fields for index-more plugin --> |
---|
| 81 | <field name="type" type="string" stored="true" indexed="true" |
---|
| 82 | multiValued="true"/> |
---|
| 83 | <field name="contentLength" type="long" stored="true" |
---|
| 84 | indexed="false"/> |
---|
| 85 | <field name="lastModified" type="long" stored="true" |
---|
| 86 | indexed="false"/> |
---|
| 87 | <field name="date" type="string" stored="true" indexed="true"/> <!-- NOTE(review): date, publishedDate and updatedDate use the string type, so ordering is presumably lexicographic; confirm the indexed format sorts chronologically --> |
---|
| 88 | |
---|
| 89 | <!-- fields for languageidentifier plugin --> |
---|
| 90 | <field name="lang" type="string" stored="true" indexed="true"/> |
---|
| 91 | |
---|
| 92 | <!-- fields for subcollection plugin --> |
---|
| 93 | <field name="subcollection" type="string" stored="true" |
---|
| 94 | indexed="true"/> |
---|
| 95 | |
---|
| 96 | <!-- fields for feed plugin: all are single-valued strings, both stored and indexed --> |
---|
| 97 | <field name="author" type="string" stored="true" indexed="true"/> |
---|
| 98 | <field name="tag" type="string" stored="true" indexed="true"/> |
---|
| 99 | <field name="feed" type="string" stored="true" indexed="true"/> |
---|
| 100 | <field name="publishedDate" type="string" stored="true" |
---|
| 101 | indexed="true"/> |
---|
| 102 | <field name="updatedDate" type="string" stored="true" |
---|
| 103 | indexed="true"/> |
---|
| 104 | </fields> |
---|
| 105 | <uniqueKey>id</uniqueKey> <!-- id is the document key; its value comes from the copyField rule below --> |
---|
| 106 | <defaultSearchField>content</defaultSearchField> <!-- field searched when a query names no field --> |
---|
| 107 | <solrQueryParser defaultOperator="OR"/> <!-- query terms are combined with OR unless an operator is given --> |
---|
| 108 | <copyField source="url" dest="id"/> <!-- copies each document's url value into id. NOTE(review): id is single-valued, so a client that also sends an explicit id would presumably produce two values; confirm the indexer omits id --> |
---|
| 109 | </schema> |
---|