source: nutchez-0.1/tomcat/webapps/ROOT/WEB-INF/classes/parse-plugins.xml @ 95

Last change on this file since 95 was 66, checked in by waue, 16 years ago

NutchEz - an easy way to nutch

File size: 6.1 KB
Line 
1<?xml version="1.0" encoding="UTF-8"?>
2<!--
3  Licensed to the Apache Software Foundation (ASF) under one or more
4  contributor license agreements.  See the NOTICE file distributed with
5  this work for additional information regarding copyright ownership.
6  The ASF licenses this file to You under the Apache License, Version 2.0
7  (the "License"); you may not use this file except in compliance with
8  the License.  You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12  Unless required by applicable law or agreed to in writing, software
13  distributed under the License is distributed on an "AS IS" BASIS,
14  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  See the License for the specific language governing permissions and
16  limitations under the License.
17 
18  Author     : mattmann
19  Description: This XML file defines the order in which parsing plugins
20  should get called for a particular mimeType.
21-->
22
23<parse-plugins>
24
25  <mimeType name="application/msword">
26    <plugin id="parse-msword" />
27  </mimeType>
28
29  <mimeType name="application/pdf">
30    <plugin id="parse-pdf" />
31  </mimeType>
32
33  <mimeType name="application/postscript">
34    <plugin id="parse-pdf" />
35  </mimeType>
36
37  <mimeType name="application/rss+xml">
38    <plugin id="parse-rss" />
39    <plugin id="feed" />
40  </mimeType>
41
42  <mimeType name="application/vnd.ms-excel">
43    <plugin id="parse-msexcel" />
44  </mimeType>
45
46  <mimeType name="application/vnd.ms-powerpoint">
47    <plugin id="parse-mspowerpoint" />
48  </mimeType>
49
50  <mimeType name="application/vnd.oasis.opendocument.text">
51    <plugin id="parse-oo" />
52  </mimeType>
53
54  <mimeType name="application/vnd.oasis.opendocument.text-template">
55    <plugin id="parse-oo" />
56  </mimeType>
57
58  <mimeType name="application/vnd.oasis.opendocument.text-master">
59    <plugin id="parse-oo" />
60  </mimeType>
61
62  <mimeType name="application/vnd.oasis.opendocument.text-web">
63    <plugin id="parse-oo" />
64  </mimeType>
65
66  <mimeType name="application/vnd.oasis.opendocument.presentation">
67    <plugin id="parse-oo" />
68  </mimeType>
69
70  <mimeType name="application/vnd.oasis.opendocument.presentation-template">
71    <plugin id="parse-oo" />
72  </mimeType>
73
74  <mimeType name="application/vnd.oasis.opendocument.spreadsheet">
75    <plugin id="parse-oo" />
76  </mimeType>
77
78  <mimeType name="application/vnd.oasis.opendocument.spreadsheet-template">
79    <plugin id="parse-oo" />
80  </mimeType>
81
82  <mimeType name="application/vnd.sun.xml.calc">
83    <plugin id="parse-oo" />
84  </mimeType>
85
86  <mimeType name="application/vnd.sun.xml.calc.template">
87    <plugin id="parse-oo" />
88  </mimeType>
89
90  <mimeType name="application/vnd.sun.xml.impress">
91    <plugin id="parse-oo" />
92  </mimeType>
93
94  <mimeType name="application/vnd.sun.xml.impress.template">
95    <plugin id="parse-oo" />
96  </mimeType>
97
98  <mimeType name="application/vnd.sun.xml.writer">
99    <plugin id="parse-oo" />
100  </mimeType>
101
102  <mimeType name="application/vnd.sun.xml.writer.template">
103    <plugin id="parse-oo" />
104  </mimeType>
105
106  <mimeType name="application/xhtml+xml">
107    <plugin id="parse-html" />
108  </mimeType>
109
110  <mimeType name="application/x-bzip2">
111    <!-- try to parse it with the zip parser -->
112    <plugin id="parse-zip" />
113  </mimeType>
114
115  <mimeType name="application/x-csh">
116    <plugin id="parse-text" />
117  </mimeType>
118
119  <mimeType name="application/x-gzip">
120    <!-- try to parse it with the zip parser -->
121    <plugin id="parse-zip" />
122  </mimeType>
123
124  <mimeType name="application/x-javascript">
125    <plugin id="parse-js" />
126  </mimeType>
127
128  <mimeType name="application/x-kword">
129    <!-- try to parse it with the msword parser -->
130    <plugin id="parse-msword" />
131  </mimeType>
132
133  <mimeType name="application/x-kspread">
134    <!-- try to parse it with the msexcel parser -->
135    <plugin id="parse-msexcel" />
136  </mimeType>
137
138  <mimeType name="application/x-shockwave-flash">
139    <plugin id="parse-swf" />
140  </mimeType>
141
142  <mimeType name="application/zip">
143    <plugin id="parse-zip" />
144  </mimeType>
145
146  <mimeType name="text/html">
147    <plugin id="parse-html" />
148  </mimeType>
149
150  <mimeType name="text/plain">
151    <plugin id="parse-text" />
152  </mimeType>
153
154  <mimeType name="text/richtext">
155    <plugin id="parse-rtf" />
156    <plugin id="parse-msword" />
157  </mimeType>
158
159  <mimeType name="text/rtf">
160    <plugin id="parse-rtf" />
161    <plugin id="parse-msword" />
162  </mimeType>
163
164  <mimeType name="text/sgml">
165    <plugin id="parse-html" />
166  </mimeType>
167
168  <mimeType name="text/tab-separated-values">
169    <plugin id="parse-msexcel" />
170  </mimeType>
171
172  <mimeType name="text/xml">
173    <plugin id="parse-html" />
174    <plugin id="parse-rss" />
175    <plugin id="feed" />
176  </mimeType>
177
178  <!-- Types for parse-ext plugin: required for unit tests to pass. -->
179
180  <mimeType name="application/vnd.nutch.example.cat">
181    <plugin id="parse-ext" />
182  </mimeType>
183
184  <mimeType name="application/vnd.nutch.example.md5sum">
185    <plugin id="parse-ext" />
186  </mimeType>
187
188  <!-- Alias mappings from plugin names (parse-xxx, feed) to the actual extension
189  implementation ids described in each plugin's plugin.xml file. -->
190  <aliases>
191    <alias name="parse-ext" extension-id="ExtParser" />
192    <alias name="parse-html"
193      extension-id="org.apache.nutch.parse.html.HtmlParser" />
194    <alias name="parse-js" extension-id="JSParser" />
195    <alias name="parse-mp3"
196      extension-id="org.apache.nutch.parse.mp3.MP3Parser" />
197    <alias name="parse-msexcel"
198      extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" />
199    <alias name="parse-mspowerpoint"
200      extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" />
201    <alias name="parse-msword"
202      extension-id="org.apache.nutch.parse.msword.MSWordParser" />
203    <alias name="parse-oo"
204      extension-id="org.apache.nutch.parse.oo.OpenDocument.Text" />
205    <alias name="parse-pdf"
206      extension-id="org.apache.nutch.parse.pdf.PdfParser" />
207    <alias name="parse-rss"
208      extension-id="org.apache.nutch.parse.rss.RSSParser" />
209    <alias name="feed"
210      extension-id="org.apache.nutch.parse.feed.FeedParser" />
211    <alias name="parse-rtf"
212      extension-id="org.apache.nutch.parse.rtf.RTFParseFactory" />
213    <alias name="parse-swf"
214      extension-id="org.apache.nutch.parse.swf.SWFParser" />
215    <alias name="parse-text"
216      extension-id="org.apache.nutch.parse.text.TextParser" />
217    <alias name="parse-zip"
218      extension-id="org.apache.nutch.parse.zip.ZipParser" />
219  </aliases>
220
221</parse-plugins>
Note: See TracBrowser for help on using the repository browser.