source: nutchez-0.1/conf/tika-mimetypes.xml @ 202

Last change on this file since 202 was 66, checked in by waue, 16 years ago

NutchEz - an easy way to nutch

  • Property svn:executable set to *
File size: 10.3 KB
Line 
1<?xml version="1.0" encoding="UTF-8"?>
2<!--
3  Licensed to the Apache Software Foundation (ASF) under one or more
4  contributor license agreements.  See the NOTICE file distributed with
5  this work for additional information regarding copyright ownership.
6  The ASF licenses this file to You under the Apache License, Version 2.0
7  (the "License"); you may not use this file except in compliance with
8  the License.  You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12  Unless required by applicable law or agreed to in writing, software
13  distributed under the License is distributed on an "AS IS" BASIS,
14  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  See the License for the specific language governing permissions and
16  limitations under the License.
17 
18  Description: This xml file defines the valid mime types used by Tika.
19  The mime types within this file are based on the types in the mime-types.xml
20  file available in Apache Nutch.
21-->
22
23<mime-info>
24
25  <mime-type type="text/plain">
26    <magic priority="50">
27      <match value="This is TeX," type="string" offset="0" />
28      <match value="This is METAFONT," type="string" offset="0" />
29    </magic>
30    <glob pattern="*.txt" />
31    <glob pattern="*.asc" />
32  </mime-type>
33
34  <mime-type type="text/html"> <!-- HTML documents: detected by leading markup or by *.html / *.htm file name -->
35    <magic priority="50"> <!-- Each tag is listed in upper and lower case (presumably matching is case-sensitive; confirm) -->
36      <match value="&lt;!DOCTYPE HTML" type="string"
37        offset="0:64" />
38      <match value="&lt;!doctype html" type="string"
39        offset="0:64" />
40      <match value="&lt;HEAD" type="string" offset="0:64" />
41      <match value="&lt;head" type="string" offset="0:64" />
42      <match value="&lt;TITLE" type="string" offset="0:64" />
43      <match value="&lt;title" type="string" offset="0:64" />
44      <match value="&lt;html" type="string" offset="0:64" />
45      <match value="&lt;HTML" type="string" offset="0:64" />
46      <match value="&lt;BODY" type="string" offset="0" />
47      <match value="&lt;body" type="string" offset="0" />
48      <match value="&lt;TITLE" type="string" offset="0" /> <!-- NOTE(review): same value as the 0:64 rule above; redundant if 0:64 denotes an offset range starting at 0 -->
49      <match value="&lt;title" type="string" offset="0" /> <!-- NOTE(review): same value as the 0:64 rule above; see previous note -->
50      <match value="&lt;!--" type="string" offset="0" />
51      <match value="&lt;h1" type="string" offset="0" />
52      <match value="&lt;H1" type="string" offset="0" />
53      <match value="&lt;!doctype HTML" type="string" offset="0" /> <!-- mixed-case doctype spellings are only checked at offset 0, unlike the 0:64 rules above -->
54      <match value="&lt;!DOCTYPE html" type="string" offset="0" />
55    </magic>
56    <glob pattern="*.html" />
57    <glob pattern="*.htm" />
58  </mime-type>
59
60  <mime-type type="application/xhtml+xml">
61    <glob pattern="*.xhtml" />
62    <root-XML namespaceURI='http://www.w3.org/1999/xhtml'
63      localName='html' />
64  </mime-type>
65
66  <mime-type type="application/vnd.ms-powerpoint"> <!-- PowerPoint files: matched by file name or by the magic number below -->
67    <glob pattern="*.ppz" />
68    <glob pattern="*.ppt" />
69    <glob pattern="*.pps" />
70    <glob pattern="*.pot" />
71    <magic priority="50"> <!-- NOTE(review): presumably meant to match the OLE2 compound-file signature (bytes D0 CF 11 E0); confirm -->
72      <match value="0xcfd0e011" type="little32" offset="0" /> <!-- NOTE(review): read as a little-endian 32-bit value, bytes D0 CF 11 E0 would be 0xe011cfd0, so verify how the reader decodes this value before relying on it; that signature would also be shared by other OLE2 files such as .doc and .xls -->
73    </magic>
74  </mime-type>
75
76  <mime-type type="application/vnd.ms-excel">
77    <magic priority="50">
78      <match value="Microsoft Excel 5.0 Worksheet" type="string"
79        offset="2080" />
80    </magic>
81    <glob pattern="*.xls" />
82    <glob pattern="*.xlc" />
83    <glob pattern="*.xll" />
84    <glob pattern="*.xlm" />
85    <glob pattern="*.xlw" />
86    <glob pattern="*.xla" />
87    <glob pattern="*.xlt" />
88    <glob pattern="*.xld" />
89    <alias type="application/msexcel" />
90  </mime-type>
91
92  <mime-type type="application/vnd.oasis.opendocument.text">
93    <glob pattern="*.odt" />
94  </mime-type>
95
96
97  <mime-type type="application/zip">
98    <alias type="application/x-zip-compressed" />
99    <magic priority="40">
100      <match value="PK\003\004" type="string" offset="0" />
101    </magic>
102    <glob pattern="*.zip" />
103  </mime-type>
104
105  <mime-type type="application/vnd.oasis.opendocument.text-web"> <!-- distinct from application/vnd.oasis.opendocument.text, which owns *.odt -->
106    <glob pattern="*.oth" /> <!-- OpenDocument HTML document template -->
107  </mime-type>
108
109  <mime-type type="application/msword">
110    <magic priority="50">
111      <match value="\x31\xbe\x00\x00" type="string" offset="0" />
112      <match value="PO^Q`" type="string" offset="0" />
113      <match value="\376\067\0\043" type="string" offset="0" />
114      <match value="\333\245-\0\0\0" type="string" offset="0" />
115      <match value="Microsoft Word 6.0 Document" type="string"
116        offset="2080" />
117      <match value="Microsoft Word document data" type="string"
118        offset="2112" />
119    </magic>
120    <glob pattern="*.doc" />
121    <alias type="application/vnd.ms-word" />
122  </mime-type>
123
124  <mime-type type="application/octet-stream">
125    <magic priority="50">
126      <match value="\037\036" type="string" offset="0" />
127      <match value="017437" type="host16" offset="0" />
128      <match value="0x1fff" type="host16" offset="0" />
129      <match value="\377\037" type="string" offset="0" />
130      <match value="0145405" type="host16" offset="0" />
131    </magic>
132    <glob pattern="*.bin" />
133  </mime-type>
134
135  <mime-type type="application/pdf">
136    <magic priority="50">
137      <match value="%PDF-" type="string" offset="0" />
138    </magic>
139    <glob pattern="*.pdf" />
140    <alias type="application/x-pdf" />
141  </mime-type>
142
143  <mime-type type="application/atom+xml"> <!-- Atom feeds, detected by the namespace and local name of the root element -->
144    <root-XML localName="feed" namespaceURI="http://purl.org/atom/ns#" /> <!-- Atom 0.3 namespace -->
145    <root-XML localName="feed" namespaceURI="http://www.w3.org/2005/Atom" /> <!-- Atom 1.0 namespace (RFC 4287) -->
146  </mime-type>
147
148  <mime-type type="application/mac-binhex40">
149    <glob pattern="*.hqx" />
150  </mime-type>
151
152  <mime-type type="application/mac-compactpro">
153    <glob pattern="*.cpt" />
154  </mime-type>
155
156  <mime-type type="application/rtf"> <!-- Rich Text Format; text/rtf is accepted as an alias -->
157    <glob pattern="*.rtf" />
158    <alias type="text/rtf" />
159  </mime-type>
160
161  <mime-type type="application/rss+xml">
162    <alias type="text/rss" />
163    <root-XML localName="rss" />
164    <root-XML namespaceURI="http://purl.org/rss/1.0/" />
165    <glob pattern="*.rss" />
166  </mime-type>
167
168  <!--  added in by mattmann -->
169  <mime-type type="application/xml">
170    <alias type="text/xml" />
171    <glob pattern="*.xml" />
172  </mime-type>
173
174  <mime-type type="application/x-mif">
175    <alias type="application/vnd.mif" />
176  </mime-type>
177
178  <mime-type type="application/vnd.wap.wbxml">
179    <glob pattern="*.wbxml" />
180  </mime-type>
181
182  <mime-type type="application/vnd.wap.wmlc">
183    <_comment>Compiled WML Document</_comment>
184    <glob pattern="*.wmlc" />
185  </mime-type>
186
187  <mime-type type="application/vnd.wap.wmlscriptc">
188    <_comment>Compiled WML Script</_comment>
189    <glob pattern="*.wmlsc" />
190  </mime-type>
191
192  <mime-type type="text/vnd.wap.wmlscript">
193    <_comment>WML Script</_comment>
194    <glob pattern="*.wmls" />
195  </mime-type>
196
197  <mime-type type="application/x-bzip">
198    <alias type="application/x-bzip2" />
199  </mime-type>
200
201  <mime-type type="application/x-bzip-compressed-tar">
202    <glob pattern="*.tbz" />
203    <glob pattern="*.tbz2" />
204  </mime-type>
205
206  <mime-type type="application/x-cdlink">
207    <_comment>Virtual CD-ROM CD Image File</_comment>
208    <glob pattern="*.vcd" />
209  </mime-type>
210
211  <mime-type type="application/x-director">
212    <_comment>Shockwave Movie</_comment>
213    <glob pattern="*.dcr" />
214    <glob pattern="*.dir" />
215    <glob pattern="*.dxr" />
216  </mime-type>
217
218  <mime-type type="application/x-futuresplash">
219    <_comment>Macromedia FutureSplash File</_comment>
220    <glob pattern="*.spl" />
221  </mime-type>
222
223  <mime-type type="application/x-java">
224    <alias type="application/java" />
225  </mime-type>
226
227  <mime-type type="application/x-koan">
228    <_comment>SSEYO Koan File</_comment>
229    <glob pattern="*.skp" />
230    <glob pattern="*.skd" />
231    <glob pattern="*.skt" />
232    <glob pattern="*.skm" />
233  </mime-type>
234
235  <mime-type type="application/x-latex">
236    <_comment>LaTeX Source Document</_comment>
237    <glob pattern="*.latex" />
238  </mime-type>
239
240  <!-- JC CHANGED
241    <mime-type type="application/x-mif">
242    <_comment>FrameMaker MIF document</_comment>
243    <glob pattern="*.mif"/>
244    </mime-type> -->
245
246  <mime-type type="application/x-ms-dos-executable">
247    <alias type="application/x-dosexec" />
248  </mime-type>
249
250  <mime-type type="application/ogg">
251    <alias type="application/x-ogg" />
252  </mime-type>
253
254  <mime-type type="application/x-rar">
255    <alias type="application/x-rar-compressed" />
256  </mime-type>
257
258  <mime-type type="application/x-shellscript">
259    <alias type="application/x-sh" />
260  </mime-type>
261
262  <mime-type type="application/xhtml+xml"> <!-- Second entry for this type: it is also declared earlier in this file with the *.xhtml glob and a root-XML rule -->
263    <glob pattern="*.xht" /> <!-- NOTE(review): presumably the reader merges repeated entries for the same type so this glob is added to the earlier definition; confirm -->
264  </mime-type>
265
266  <mime-type type="audio/midi">
267    <glob pattern="*.kar" />
268  </mime-type>
269
270  <mime-type type="audio/x-pn-realaudio">
271    <alias type="audio/x-realaudio" />
272  </mime-type>
273
274  <mime-type type="image/tiff"> <!-- TIFF images: detected by magic number only; no glob is declared for this type -->
275    <magic priority="50"> <!-- NOTE(review): values are hex literals but declared type="string"; confirm the reader decodes 0x-prefixed string values as raw bytes rather than literal text -->
276      <match value="0x4d4d2a00" type="string" offset="0" /> <!-- as raw bytes: 4D 4D 2A 00 ("MM*" followed by NUL) -->
277      <match value="0x49492a00" type="string" offset="0" /> <!-- as raw bytes: 49 49 2A 00 ("II*" followed by NUL) -->
278    </magic>
279  </mime-type>
280
281  <mime-type type="message/rfc822">
282    <magic priority="50">
283      <match type="string" value="Relay-Version:" offset="0" />
284      <match type="string" value="#! rnews" offset="0" />
285      <match type="string" value="N#! rnews" offset="0" />
286      <match type="string" value="Forward to" offset="0" />
287      <match type="string" value="Pipe to" offset="0" />
288      <match type="string" value="Return-Path:" offset="0" />
289      <match type="string" value="From:" offset="0" />
290      <match type="string" value="Message-ID:" offset="0" />
291      <match type="string" value="Date:" offset="0" />
292    </magic>
293  </mime-type>
294 
295  <mime-type type="application/x-javascript"> <!-- JavaScript sources, matched by file name only -->
296    <glob pattern="*.js" />
297  </mime-type>
298   
299
300  <mime-type type="image/vnd.wap.wbmp">
301    <_comment>Wireless Bitmap File Format</_comment>
302    <glob pattern="*.wbmp" />
303  </mime-type>
304
305  <mime-type type="image/x-psd">
306    <alias type="image/photoshop" />
307  </mime-type>
308
309  <mime-type type="image/x-xcf">
310    <alias type="image/xcf" />
311    <magic priority="50">
312      <match type="string" value="gimp xcf " offset="0" />
313    </magic>
314  </mime-type>
315 
316  <mime-type type="application/x-shockwave-flash"> <!-- Flash movies: matched by *.swf or by a leading FWS / CWS signature -->
317    <glob pattern="*.swf" />
318    <magic priority="50">
319      <match type="string" value="FWS" offset="0" />
320      <match type="string" value="CWS" offset="0" />
321    </magic>
322  </mime-type>
323
324  <mime-type type="model/iges">
325    <_comment>
326      Initial Graphics Exchange Specification Format
327    </_comment>
328    <glob pattern="*.igs" />
329    <glob pattern="*.iges" />
330  </mime-type>
331
332  <mime-type type="model/mesh">
333    <glob pattern="*.msh" />
334    <glob pattern="*.mesh" />
335    <glob pattern="*.silo" />
336  </mime-type>
337
338  <mime-type type="model/vrml">
339    <glob pattern="*.vrml" />
340  </mime-type>
341
342  <mime-type type="text/x-tcl">
343    <alias type="application/x-tcl" />
344  </mime-type>
345
346  <mime-type type="text/x-tex">
347    <alias type="application/x-tex" />
348  </mime-type>
349
350  <mime-type type="text/x-texinfo">
351    <alias type="application/x-texinfo" />
352  </mime-type>
353
354  <mime-type type="text/x-troff-me">
355    <alias type="application/x-troff-me" />
356  </mime-type>
357
358  <mime-type type="video/vnd.mpegurl">
359    <glob pattern="*.mxu" />
360  </mime-type>
361
362  <mime-type type="x-conference/x-cooltalk">
363    <_comment>Cooltalk Audio</_comment>
364    <glob pattern="*.ice" />
365  </mime-type>
366
367</mime-info>
Note: See TracBrowser for help on using the repository browser.