<?xml version="1.0" encoding="UTF-8" ?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or
 more contributor license agreements. See the NOTICE file
 distributed with this work for additional information regarding
 copyright ownership. The ASF licenses this file to You under the
 Apache License, Version 2.0 (the "License"); you may not use
 this file except in compliance with the License. You may obtain
 a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions
 and limitations under the License.
-->
<!--
 Description: This document contains the Solr schema definition to be
 used with the Solr integration currently built into Nutch. See
 https://issues.apache.org/jira/browse/NUTCH-442 and
 https://issues.apache.org/jira/browse/NUTCH-699 for more info.
-->
<!--
 Solr schema named "nutch". It declares, in order: the field types,
 the document fields that use those types, and the schema-level
 settings (unique key, default search field, default query operator
 and one copy rule).
-->
<schema name="nutch" version="1.1">
    <types>
        <!-- Plain string values; documents missing the field sort last. -->
        <fieldType name="string" class="solr.StrField"
            sortMissingLast="true" omitNorms="true"/>

        <!-- Numeric types; norms are omitted for both. -->
        <fieldType name="long" class="solr.LongField"
            omitNorms="true"/>
        <fieldType name="float" class="solr.FloatField"
            omitNorms="true"/>

        <!--
         Analysis chain used by the "content" and "title" fields.
         The filters are applied in the order they are listed, so the
         order below is significant.
        -->
        <fieldType name="text" class="solr.TextField"
            positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <!-- Stop words are read from stopwords.txt, ignoring case. -->
                <filter class="solr.StopFilterFactory"
                    ignoreCase="true" words="stopwords.txt"/>
                <filter class="solr.WordDelimiterFilterFactory"
                    generateWordParts="1" generateNumberParts="1"
                    catenateWords="1" catenateNumbers="1" catenateAll="0"
                    splitOnCaseChange="1"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <!-- Words listed in protwords.txt are protected from stemming. -->
                <filter class="solr.EnglishPorterFilterFactory"
                    protected="protwords.txt"/>
                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
            </analyzer>
        </fieldType>

        <!-- Analysis chain used by the "host" and "url" fields. -->
        <fieldType name="url" class="solr.TextField"
            positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.StandardTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
                    generateWordParts="1" generateNumberParts="1"/>
                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
            </analyzer>
        </fieldType>
    </types>

    <fields>
        <!--
         Unique document key (see the uniqueKey element below). It is
         filled from "url" by the copyField rule at the end of this schema.
        -->
        <field name="id" type="string" stored="true" indexed="true"/>

        <!-- core fields -->
        <field name="segment" type="string" stored="true" indexed="false"/>
        <field name="digest" type="string" stored="true" indexed="false"/>
        <field name="boost" type="float" stored="true" indexed="false"/>

        <!-- fields for index-basic plugin -->
        <field name="host" type="url" stored="false" indexed="true"/>
        <field name="site" type="string" stored="false" indexed="true"/>
        <!-- "url" is the only field marked required; it is also the source of "id". -->
        <field name="url" type="url" stored="true" indexed="true"
            required="true"/>
        <!-- "content" is searchable but not stored; it is the default search field. -->
        <field name="content" type="text" stored="false" indexed="true"/>
        <field name="title" type="text" stored="true" indexed="true"/>
        <field name="cache" type="string" stored="true" indexed="false"/>
        <field name="tstamp" type="long" stored="true" indexed="false"/>

        <!-- fields for index-anchor plugin -->
        <field name="anchor" type="string" stored="true" indexed="true"
            multiValued="true"/>

        <!-- fields for index-more plugin -->
        <field name="type" type="string" stored="true" indexed="true"
            multiValued="true"/>
        <field name="contentLength" type="long" stored="true"
            indexed="false"/>
        <field name="lastModified" type="long" stored="true"
            indexed="false"/>
        <field name="date" type="string" stored="true" indexed="true"/>

        <!-- fields for languageidentifier plugin -->
        <field name="lang" type="string" stored="true" indexed="true"/>

        <!-- fields for subcollection plugin -->
        <field name="subcollection" type="string" stored="true"
            indexed="true"/>

        <!-- fields for feed plugin -->
        <field name="author" type="string" stored="true" indexed="true"/>
        <field name="tag" type="string" stored="true" indexed="true"/>
        <field name="feed" type="string" stored="true" indexed="true"/>
        <field name="publishedDate" type="string" stored="true"
            indexed="true"/>
        <field name="updatedDate" type="string" stored="true"
            indexed="true"/>
    </fields>

    <!-- Schema-level settings. -->
    <uniqueKey>id</uniqueKey>
    <defaultSearchField>content</defaultSearchField>
    <solrQueryParser defaultOperator="OR"/>

    <!--
     Copies every "url" value into "id", which is the unique key.
     NOTE(review): newer Solr reference guides state that copyField
     cannot be used to populate the uniqueKey field. Confirm this rule
     against the Solr version this schema is deployed on before upgrading.
    -->
    <copyField source="url" dest="id"/>
</schema>