1 | <?xml version="1.0" encoding="UTF-8"?> |
---|
2 | <!-- |
---|
3 | Licensed to the Apache Software Foundation (ASF) under one or more |
---|
4 | contributor license agreements. See the NOTICE file distributed with |
---|
5 | this work for additional information regarding copyright ownership. |
---|
6 | The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
7 | (the "License"); you may not use this file except in compliance with |
---|
8 | the License. You may obtain a copy of the License at |
---|
9 | |
---|
10 | http://www.apache.org/licenses/LICENSE-2.0 |
---|
11 | |
---|
12 | Unless required by applicable law or agreed to in writing, software |
---|
13 | distributed under the License is distributed on an "AS IS" BASIS, |
---|
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
15 | See the License for the specific language governing permissions and |
---|
16 | limitations under the License. |
---|
17 | |
---|
18 | Description: This xml file defines the valid mime types used by Tika. |
---|
19 | The mime types within this file are based on the types in the mime-types.xml |
---|
20 | file available in Apache Nutch. |
---|
21 | --> |
---|
22 | |
---|
23 | <mime-info> |
---|
24 | |
---|
25 | <mime-type type="text/plain"> <!-- Plain text: leading string "This is TeX," or "This is METAFONT," at offset 0, or a *.txt / *.asc file name. --> |
---|
26 | <magic priority="50"> |
---|
27 | <match type="string" offset="0" value="This is TeX,"/> |
---|
28 | <match type="string" offset="0" value="This is METAFONT,"/> |
---|
29 | </magic> |
---|
30 | <glob pattern="*.txt"/> |
---|
31 | <glob pattern="*.asc"/> |
---|
32 | </mime-type> |
---|
33 | |
---|
34 | <mime-type type="text/html"> <!-- HTML: any of the tag prefixes below (searched in bytes 0:64 or anchored at offset 0), or a *.html / *.htm file name. The leading "<" of every match value is written as &lt; because a literal "<" is not allowed inside an XML attribute value. --> |
---|
35 | <magic priority="50"> |
---|
36 | <match value="&lt;!DOCTYPE HTML" type="string" |
---|
37 | offset="0:64" /> |
---|
38 | <match value="&lt;!doctype html" type="string" |
---|
39 | offset="0:64" /> |
---|
40 | <match value="&lt;HEAD" type="string" offset="0:64" /> |
---|
41 | <match value="&lt;head" type="string" offset="0:64" /> |
---|
42 | <match value="&lt;TITLE" type="string" offset="0:64" /> |
---|
43 | <match value="&lt;title" type="string" offset="0:64" /> |
---|
44 | <match value="&lt;html" type="string" offset="0:64" /> |
---|
45 | <match value="&lt;HTML" type="string" offset="0:64" /> |
---|
46 | <match value="&lt;BODY" type="string" offset="0" /> |
---|
47 | <match value="&lt;body" type="string" offset="0" /> |
---|
48 | <match value="&lt;TITLE" type="string" offset="0" /> |
---|
49 | <match value="&lt;title" type="string" offset="0" /> |
---|
50 | <match value="&lt;!--" type="string" offset="0" /> |
---|
51 | <match value="&lt;h1" type="string" offset="0" /> |
---|
52 | <match value="&lt;H1" type="string" offset="0" /> |
---|
53 | <match value="&lt;!doctype HTML" type="string" offset="0" /> |
---|
54 | <match value="&lt;!DOCTYPE html" type="string" offset="0" /> |
---|
55 | </magic> |
---|
56 | <glob pattern="*.html" /> |
---|
57 | <glob pattern="*.htm" /> |
---|
58 | </mime-type> |
---|
59 | |
---|
60 | <mime-type type="application/xhtml+xml"> <!-- XHTML: *.xhtml file name, or a root element "html" in the http://www.w3.org/1999/xhtml namespace. --> |
---|
61 | <glob pattern="*.xhtml"/> |
---|
62 | <root-XML localName="html" |
---|
63 | namespaceURI="http://www.w3.org/1999/xhtml"/> |
---|
64 | </mime-type> |
---|
65 | |
---|
66 | <mime-type type="application/vnd.ms-powerpoint"> <!-- PowerPoint: *.ppz / *.ppt / *.pps / *.pot, or the 32-bit little-endian value below at offset 0. NOTE(review): confirm the byte order this little32 value yields against the OLE2 signature bytes D0 CF 11 E0. --> |
---|
67 | <glob pattern="*.ppz"/> |
---|
68 | <glob pattern="*.ppt"/> |
---|
69 | <glob pattern="*.pps"/> |
---|
70 | <glob pattern="*.pot"/> |
---|
71 | <magic priority="50"> |
---|
72 | <match type="little32" offset="0" value="0xcfd0e011"/> |
---|
73 | </magic> |
---|
74 | </mime-type> |
---|
75 | |
---|
76 | <mime-type type="application/vnd.ms-excel"> <!-- Excel: the string "Microsoft Excel 5.0 Worksheet" at offset 2080, or one of the *.xl? file names below; application/msexcel is an alias. --> |
---|
77 | <magic priority="50"> |
---|
78 | <match type="string" offset="2080" |
---|
79 | value="Microsoft Excel 5.0 Worksheet"/> |
---|
80 | </magic> |
---|
81 | <glob pattern="*.xls"/> |
---|
82 | <glob pattern="*.xlc"/> |
---|
83 | <glob pattern="*.xll"/> |
---|
84 | <glob pattern="*.xlm"/> |
---|
85 | <glob pattern="*.xlw"/> |
---|
86 | <glob pattern="*.xla"/> |
---|
87 | <glob pattern="*.xlt"/> |
---|
88 | <glob pattern="*.xld"/> |
---|
89 | <alias type="application/msexcel"/> |
---|
90 | </mime-type> |
---|
91 | |
---|
92 | <mime-type type="application/vnd.oasis.opendocument.text"> <!-- OpenDocument text: *.odt file name. --> |
---|
93 | <glob pattern="*.odt"/> |
---|
94 | </mime-type> |
---|
95 | |
---|
96 | |
---|
97 | <mime-type type="application/zip"> <!-- Zip archive: "PK\003\004" at offset 0 (priority 40), or a *.zip file name; application/x-zip-compressed is an alias. --> |
---|
98 | <alias type="application/x-zip-compressed"/> |
---|
99 | <magic priority="40"> |
---|
100 | <match type="string" offset="0" value="PK\003\004"/> |
---|
101 | </magic> |
---|
102 | <glob pattern="*.zip"/> |
---|
103 | </mime-type> |
---|
104 | |
---|
105 | <mime-type type="application/vnd.oasis.opendocument.text-web"> <!-- OpenDocument HTML template: *.oth file name. The registered type for *.oth is ...opendocument.text-web; the plain ...opendocument.text type is the *.odt text document format. --> |
---|
106 | <glob pattern="*.oth" /> |
---|
107 | </mime-type> |
---|
108 | |
---|
109 | <mime-type type="application/msword"> <!-- Word: one of the byte/string signatures below (four at offset 0, two at fixed offsets 2080 and 2112), or a *.doc file name; application/vnd.ms-word is an alias. --> |
---|
110 | <magic priority="50"> |
---|
111 | <match type="string" offset="0" value="\x31\xbe\x00\x00"/> |
---|
112 | <match type="string" offset="0" value="PO^Q`"/> |
---|
113 | <match type="string" offset="0" value="\376\067\0\043"/> |
---|
114 | <match type="string" offset="0" value="\333\245-\0\0\0"/> |
---|
115 | <match type="string" offset="2080" |
---|
116 | value="Microsoft Word 6.0 Document"/> |
---|
117 | <match type="string" offset="2112" |
---|
118 | value="Microsoft Word document data"/> |
---|
119 | </magic> |
---|
120 | <glob pattern="*.doc"/> |
---|
121 | <alias type="application/vnd.ms-word"/> |
---|
122 | </mime-type> |
---|
123 | |
---|
124 | <mime-type type="application/octet-stream"> <!-- Generic binary: one of the two-byte string or host16 signatures below at offset 0, or a *.bin file name. --> |
---|
125 | <magic priority="50"> |
---|
126 | <match type="string" offset="0" value="\037\036"/> |
---|
127 | <match type="host16" offset="0" value="017437"/> |
---|
128 | <match type="host16" offset="0" value="0x1fff"/> |
---|
129 | <match type="string" offset="0" value="\377\037"/> |
---|
130 | <match type="host16" offset="0" value="0145405"/> |
---|
131 | </magic> |
---|
132 | <glob pattern="*.bin"/> |
---|
133 | </mime-type> |
---|
134 | |
---|
135 | <mime-type type="application/pdf"> <!-- PDF: "%PDF-" at offset 0, or a *.pdf file name; application/x-pdf is an alias. --> |
---|
136 | <magic priority="50"> |
---|
137 | <match type="string" offset="0" value="%PDF-"/> |
---|
138 | </magic> |
---|
139 | <glob pattern="*.pdf"/> |
---|
140 | <alias type="application/x-pdf"/> |
---|
141 | </mime-type> |
---|
142 | |
---|
143 | <mime-type type="application/atom+xml"> <!-- Atom feed: root element "feed" in either the older http://purl.org/atom/ns# namespace or the Atom 1.0 namespace defined by RFC 4287. --> |
---|
144 | <root-XML localName="feed" namespaceURI="http://purl.org/atom/ns#" /> |
---|
145 | <root-XML localName="feed" namespaceURI="http://www.w3.org/2005/Atom" /> |
---|
146 | </mime-type> |
---|
147 | |
---|
148 | <mime-type type="application/mac-binhex40"> <!-- Recognized by the *.hqx file name only. --> |
---|
149 | <glob pattern="*.hqx"/> |
---|
150 | </mime-type> |
---|
151 | |
---|
152 | <mime-type type="application/mac-compactpro"> <!-- Recognized by the *.cpt file name only. --> |
---|
153 | <glob pattern="*.cpt"/> |
---|
154 | </mime-type> |
---|
155 | |
---|
156 | <mime-type type="application/rtf"> <!-- RTF: *.rtf file name; text/rtf is an alias. --> |
---|
157 | <glob pattern="*.rtf" /> |
---|
158 | <alias type="text/rtf"/> |
---|
159 | </mime-type> |
---|
160 | |
---|
161 | <mime-type type="application/rss+xml"> <!-- RSS: root element named "rss", or a root element in the http://purl.org/rss/1.0/ namespace, or a *.rss file name; text/rss is an alias. --> |
---|
162 | <alias type="text/rss"/> |
---|
163 | <root-XML localName="rss"/> |
---|
164 | <root-XML namespaceURI="http://purl.org/rss/1.0/"/> |
---|
165 | <glob pattern="*.rss"/> |
---|
166 | </mime-type> |
---|
167 | |
---|
168 | <!-- added in by mattmann --> |
---|
169 | <mime-type type="application/xml"> <!-- Generic XML: *.xml file name; text/xml is an alias. --> |
---|
170 | <alias type="text/xml"/> |
---|
171 | <glob pattern="*.xml"/> |
---|
172 | </mime-type> |
---|
173 | |
---|
174 | <mime-type type="application/x-mif"> <!-- Alias only: application/vnd.mif maps to this type; no glob or magic is defined here. --> |
---|
175 | <alias type="application/vnd.mif"/> |
---|
176 | </mime-type> |
---|
177 | |
---|
178 | <mime-type type="application/vnd.wap.wbxml"> <!-- Recognized by the *.wbxml file name only. --> |
---|
179 | <glob pattern="*.wbxml"/> |
---|
180 | </mime-type> |
---|
181 | |
---|
182 | <mime-type type="application/vnd.wap.wmlc"> <!-- Recognized by the *.wmlc file name only. --> |
---|
183 | <_comment>Compiled WML Document</_comment> |
---|
184 | <glob pattern="*.wmlc"/> |
---|
185 | </mime-type> |
---|
186 | |
---|
187 | <mime-type type="application/vnd.wap.wmlscriptc"> <!-- Recognized by the *.wmlsc file name only. --> |
---|
188 | <_comment>Compiled WML Script</_comment> |
---|
189 | <glob pattern="*.wmlsc"/> |
---|
190 | </mime-type> |
---|
191 | |
---|
192 | <mime-type type="text/vnd.wap.wmlscript"> <!-- Recognized by the *.wmls file name only. --> |
---|
193 | <_comment>WML Script</_comment> |
---|
194 | <glob pattern="*.wmls"/> |
---|
195 | </mime-type> |
---|
196 | |
---|
197 | <mime-type type="application/x-bzip"> <!-- Alias only: application/x-bzip2 maps to this type. --> |
---|
198 | <alias type="application/x-bzip2"/> |
---|
199 | </mime-type> |
---|
200 | |
---|
201 | <mime-type type="application/x-bzip-compressed-tar"> <!-- Recognized by the *.tbz / *.tbz2 file names only. --> |
---|
202 | <glob pattern="*.tbz"/> |
---|
203 | <glob pattern="*.tbz2"/> |
---|
204 | </mime-type> |
---|
205 | |
---|
206 | <mime-type type="application/x-cdlink"> <!-- Recognized by the *.vcd file name only. --> |
---|
207 | <_comment>Virtual CD-ROM CD Image File</_comment> |
---|
208 | <glob pattern="*.vcd"/> |
---|
209 | </mime-type> |
---|
210 | |
---|
211 | <mime-type type="application/x-director"> <!-- Recognized by the *.dcr / *.dir / *.dxr file names only. --> |
---|
212 | <_comment>Shockwave Movie</_comment> |
---|
213 | <glob pattern="*.dcr"/> |
---|
214 | <glob pattern="*.dir"/> |
---|
215 | <glob pattern="*.dxr"/> |
---|
216 | </mime-type> |
---|
217 | |
---|
218 | <mime-type type="application/x-futuresplash"> <!-- Recognized by the *.spl file name only. --> |
---|
219 | <_comment>Macromedia FutureSplash File</_comment> |
---|
220 | <glob pattern="*.spl"/> |
---|
221 | </mime-type> |
---|
222 | |
---|
223 | <mime-type type="application/x-java"> <!-- Alias only: application/java maps to this type. --> |
---|
224 | <alias type="application/java"/> |
---|
225 | </mime-type> |
---|
226 | |
---|
227 | <mime-type type="application/x-koan"> <!-- Recognized by the *.skp / *.skd / *.skt / *.skm file names only. --> |
---|
228 | <_comment>SSEYO Koan File</_comment> |
---|
229 | <glob pattern="*.skp"/> |
---|
230 | <glob pattern="*.skd"/> |
---|
231 | <glob pattern="*.skt"/> |
---|
232 | <glob pattern="*.skm"/> |
---|
233 | </mime-type> |
---|
234 | |
---|
235 | <mime-type type="application/x-latex"> <!-- Recognized by the *.latex file name only. --> |
---|
236 | <_comment>LaTeX Source Document</_comment> |
---|
237 | <glob pattern="*.latex"/> |
---|
238 | </mime-type> |
---|
239 | |
---|
240 | <!-- JC CHANGED |
---|
241 | <mime-type type="application/x-mif"> |
---|
242 | <_comment>FrameMaker MIF document</_comment> |
---|
243 | <glob pattern="*.mif"/> |
---|
244 | </mime-type> --> |
---|
245 | |
---|
246 | <mime-type type="application/x-ms-dos-executable"> <!-- Alias only: application/x-dosexec maps to this type. --> |
---|
247 | <alias type="application/x-dosexec"/> |
---|
248 | </mime-type> |
---|
249 | |
---|
250 | <mime-type type="application/ogg"> <!-- Alias only: application/x-ogg maps to this type. --> |
---|
251 | <alias type="application/x-ogg"/> |
---|
252 | </mime-type> |
---|
253 | |
---|
254 | <mime-type type="application/x-rar"> <!-- Alias only: application/x-rar-compressed maps to this type. --> |
---|
255 | <alias type="application/x-rar-compressed"/> |
---|
256 | </mime-type> |
---|
257 | |
---|
258 | <mime-type type="application/x-shellscript"> <!-- Alias only: application/x-sh maps to this type. --> |
---|
259 | <alias type="application/x-sh"/> |
---|
260 | </mime-type> |
---|
261 | |
---|
262 | <mime-type type="application/xhtml+xml"> <!-- XHTML: additionally recognized by the *.xht file name. --> |
---|
263 | <glob pattern="*.xht"/> |
---|
264 | </mime-type> |
---|
265 | |
---|
266 | <mime-type type="audio/midi"> <!-- Recognized by the *.kar file name only. --> |
---|
267 | <glob pattern="*.kar"/> |
---|
268 | </mime-type> |
---|
269 | |
---|
270 | <mime-type type="audio/x-pn-realaudio"> <!-- Alias only: audio/x-realaudio maps to this type. --> |
---|
271 | <alias type="audio/x-realaudio"/> |
---|
272 | </mime-type> |
---|
273 | |
---|
274 | <mime-type type="image/tiff"> <!-- TIFF: either value below at offset 0. NOTE(review): the values are 0x-prefixed but typed "string"; presumably the reader decodes them as hex bytes (4d 4d 2a 00 / 49 49 2a 00), confirm against the reader. --> |
---|
275 | <magic priority="50"> |
---|
276 | <match type="string" offset="0" value="0x4d4d2a00"/> |
---|
277 | <match type="string" offset="0" value="0x49492a00"/> |
---|
278 | </magic> |
---|
279 | </mime-type> |
---|
280 | |
---|
281 | <mime-type type="message/rfc822"> <!-- Mail/news message: any of the leading strings below at offset 0; no glob is defined. --> |
---|
282 | <magic priority="50"> |
---|
283 | <match offset="0" type="string" value="Relay-Version:"/> |
---|
284 | <match offset="0" type="string" value="#! rnews"/> |
---|
285 | <match offset="0" type="string" value="N#! rnews"/> |
---|
286 | <match offset="0" type="string" value="Forward to"/> |
---|
287 | <match offset="0" type="string" value="Pipe to"/> |
---|
288 | <match offset="0" type="string" value="Return-Path:"/> |
---|
289 | <match offset="0" type="string" value="From:"/> |
---|
290 | <match offset="0" type="string" value="Message-ID:"/> |
---|
291 | <match offset="0" type="string" value="Date:"/> |
---|
292 | </magic> |
---|
293 | </mime-type> |
---|
294 | |
---|
295 | <mime-type type="application/x-javascript"> <!-- Recognized by the *.js file name only. --> |
---|
296 | <glob pattern="*.js"/> |
---|
297 | </mime-type> |
---|
298 | |
---|
299 | |
---|
300 | <mime-type type="image/vnd.wap.wbmp"> <!-- Recognized by the *.wbmp file name only. --> |
---|
301 | <_comment>Wireless Bitmap File Format</_comment> |
---|
302 | <glob pattern="*.wbmp"/> |
---|
303 | </mime-type> |
---|
304 | |
---|
305 | <mime-type type="image/x-psd"> <!-- Alias only: image/photoshop maps to this type. --> |
---|
306 | <alias type="image/photoshop"/> |
---|
307 | </mime-type> |
---|
308 | |
---|
309 | <mime-type type="image/x-xcf"> <!-- XCF: "gimp xcf " (with the trailing space) at offset 0; image/xcf is an alias. --> |
---|
310 | <alias type="image/xcf"/> |
---|
311 | <magic priority="50"> |
---|
312 | <match offset="0" type="string" value="gimp xcf "/> |
---|
313 | </magic> |
---|
314 | </mime-type> |
---|
315 | |
---|
316 | <mime-type type="application/x-shockwave-flash"> <!-- Flash: "FWS" or "CWS" at offset 0, or a *.swf file name. --> |
---|
317 | <glob pattern="*.swf" /> |
---|
318 | <magic priority="50"> |
---|
319 | <match offset="0" type="string" value="FWS" /> |
---|
320 | <match offset="0" type="string" value="CWS" /> |
---|
321 | </magic> |
---|
322 | </mime-type> |
---|
323 | |
---|
324 | <mime-type type="model/iges"> <!-- Recognized by the *.igs / *.iges file names only. --> |
---|
325 | <_comment> |
---|
326 | Initial Graphics Exchange Specification Format |
---|
327 | </_comment> |
---|
328 | <glob pattern="*.igs"/> |
---|
329 | <glob pattern="*.iges"/> |
---|
330 | </mime-type> |
---|
331 | |
---|
332 | <mime-type type="model/mesh"> <!-- Recognized by the *.msh / *.mesh / *.silo file names only. --> |
---|
333 | <glob pattern="*.msh"/> |
---|
334 | <glob pattern="*.mesh"/> |
---|
335 | <glob pattern="*.silo"/> |
---|
336 | </mime-type> |
---|
337 | |
---|
338 | <mime-type type="model/vrml"> <!-- Recognized by the *.vrml file name only. --> |
---|
339 | <glob pattern="*.vrml"/> |
---|
340 | </mime-type> |
---|
341 | |
---|
342 | <mime-type type="text/x-tcl"> <!-- Alias only: application/x-tcl maps to this type. --> |
---|
343 | <alias type="application/x-tcl"/> |
---|
344 | </mime-type> |
---|
345 | |
---|
346 | <mime-type type="text/x-tex"> <!-- Alias only: application/x-tex maps to this type. --> |
---|
347 | <alias type="application/x-tex"/> |
---|
348 | </mime-type> |
---|
349 | |
---|
350 | <mime-type type="text/x-texinfo"> <!-- Alias only: application/x-texinfo maps to this type. --> |
---|
351 | <alias type="application/x-texinfo"/> |
---|
352 | </mime-type> |
---|
353 | |
---|
354 | <mime-type type="text/x-troff-me"> <!-- Alias only: application/x-troff-me maps to this type. --> |
---|
355 | <alias type="application/x-troff-me"/> |
---|
356 | </mime-type> |
---|
357 | |
---|
358 | <mime-type type="video/vnd.mpegurl"> <!-- Recognized by the *.mxu file name only. --> |
---|
359 | <glob pattern="*.mxu"/> |
---|
360 | </mime-type> |
---|
361 | |
---|
362 | <mime-type type="x-conference/x-cooltalk"> <!-- Recognized by the *.ice file name only. --> |
---|
363 | <_comment>Cooltalk Audio</_comment> |
---|
364 | <glob pattern="*.ice"/> |
---|
365 | </mime-type> |
---|
366 | |
---|
367 | </mime-info> |
---|