net_traits/
mime_classifier.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5use mime::{self, Mime};
6
7use crate::LoadContext;
8
/// Computes MIME types for resources by combining the supplied type with byte-signature
/// sniffing, following <https://mimesniff.spec.whatwg.org/>.
pub struct MimeClassifier {
    /// Signature matchers consulted when sniffing images.
    image_classifier: GroupedClassifier,
    /// Signature matchers consulted when sniffing audio and video.
    audio_video_classifier: GroupedClassifier,
    /// Matchers only consulted for unknown types when scriptable sniffing is allowed.
    scriptable_classifier: GroupedClassifier,
    /// Matchers consulted for unknown types right after the scriptable ones.
    plaintext_classifier: GroupedClassifier,
    /// Signature matchers for archive formats.
    archive_classifier: GroupedClassifier,
    /// Final fallback that decides between plain text and binary data.
    binary_or_plaintext: BinaryOrPlaintextClassifier,
    /// Signature matchers consulted when sniffing fonts.
    font_classifier: GroupedClassifier,
}
18
/// Coarse category of a MIME type, as reported by `MimeClassifier::get_media_type`.
///
/// The type is a plain field-less enum, so it derives the common value traits
/// (`Clone`, `Copy`, `Debug`, `Eq`) in addition to `PartialEq`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MediaType {
    Xml,
    Html,
    AudioVideo,
    Image,
    JavaScript,
    Json,
    Font,
    Text,
    Css,
}
31
/// The "check-for-apache-bug" flag of the MIME sniffing algorithm.
///
/// Derives the common value traits (`Clone`, `Copy`, `Debug`, `Eq`) in addition to
/// `PartialEq` so the flag can be copied, compared and printed freely.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ApacheBugFlag {
    On,
    Off,
}
37
38impl ApacheBugFlag {
39    /// <https://mimesniff.spec.whatwg.org/#supplied-mime-type-detection-algorithm>
40    pub fn from_content_type(mime_type: Option<&Mime>) -> ApacheBugFlag {
41        // TODO(36801): also handle charset ISO-8859-1
42        if mime_type.is_some_and(|mime_type| {
43            *mime_type == mime::TEXT_PLAIN || *mime_type == mime::TEXT_PLAIN_UTF_8
44        }) {
45            ApacheBugFlag::On
46        } else {
47            ApacheBugFlag::Off
48        }
49    }
50}
51
/// The "no-sniff" flag of the MIME sniffing algorithm.
///
/// Derives the common value traits (`Clone`, `Copy`, `Debug`, `Eq`) in addition to
/// `PartialEq` so the flag can be copied, compared and printed freely.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NoSniffFlag {
    On,
    Off,
}
57
58impl Default for MimeClassifier {
59    fn default() -> Self {
60        Self {
61            image_classifier: GroupedClassifier::image_classifer(),
62            audio_video_classifier: GroupedClassifier::audio_video_classifier(),
63            scriptable_classifier: GroupedClassifier::scriptable_classifier(),
64            plaintext_classifier: GroupedClassifier::plaintext_classifier(),
65            archive_classifier: GroupedClassifier::archive_classifier(),
66            binary_or_plaintext: BinaryOrPlaintextClassifier,
67            font_classifier: GroupedClassifier::font_classifier(),
68        }
69    }
70}
71
72impl MimeClassifier {
73    /// <https://mimesniff.spec.whatwg.org/#mime-type-sniffing-algorithm>
74    pub fn classify<'a>(
75        &'a self,
76        context: LoadContext,
77        no_sniff_flag: NoSniffFlag,
78        apache_bug_flag: ApacheBugFlag,
79        supplied_type: &Option<Mime>,
80        data: &'a [u8],
81    ) -> Mime {
82        let supplied_type_or_octet_stream = supplied_type
83            .clone()
84            .unwrap_or(mime::APPLICATION_OCTET_STREAM);
85        // Step 1. If the supplied MIME type is an XML MIME type or HTML MIME type,
86        // the computed MIME type is the supplied MIME type.
87        if Self::is_xml(&supplied_type_or_octet_stream) ||
88            Self::is_html(&supplied_type_or_octet_stream)
89        {
90            return supplied_type_or_octet_stream;
91        }
92        match context {
93            LoadContext::Browsing => match *supplied_type {
94                // Step 2. If the supplied MIME type is undefined or if the supplied MIME type’s essence is "unknown/unknown",
95                // "application/unknown", or "*/*", execute the rules for identifying
96                // an unknown MIME type with the sniff-scriptable flag equal to the inverse of the no-sniff flag and abort these steps.
97                None => self.sniff_unknown_type(no_sniff_flag, data),
98                Some(ref supplied_type) => {
99                    if MimeClassifier::is_explicit_unknown(supplied_type) {
100                        return self.sniff_unknown_type(no_sniff_flag, data);
101                    }
102                    // Step 3. If the no-sniff flag is set, the computed MIME type is the supplied MIME type.
103                    // Abort these steps.
104                    if no_sniff_flag == NoSniffFlag::On {
105                        return supplied_type.clone();
106                    }
107                    // Step 4. If the check-for-apache-bug flag is set,
108                    // execute the rules for distinguishing if a resource is text or binary and abort these steps.
109                    if apache_bug_flag == ApacheBugFlag::On {
110                        return self.sniff_text_or_data(data);
111                    }
112                    match MimeClassifier::get_media_type(supplied_type) {
113                        // Step 5. If the supplied MIME type is an image MIME type supported by the user agent,
114                        // let matched-type be the result of executing the image type pattern matching algorithm with
115                        // the resource header as the byte sequence to be matched.
116                        Some(MediaType::Image) => {
117                            // Step 6. If matched-type is not undefined, the computed MIME type is matched-type.
118                            self.image_classifier.classify(data)
119                        },
120                        // Step 7. If the supplied MIME type is an audio or video MIME type supported by the user agent,
121                        // let matched-type be the result of executing the audio or video type pattern matching algorithm
122                        // with the resource header as the byte sequence to be matched.
123                        Some(MediaType::AudioVideo) => {
124                            // Step 8. If matched-type is not undefined, the computed MIME type is matched-type.
125                            self.audio_video_classifier.classify(data)
126                        },
127                        Some(MediaType::Html) | Some(MediaType::Xml) => unreachable!(),
128                        _ => None,
129                    }
130                    // Step 9. The computed MIME type is the supplied MIME type.
131                    .unwrap_or(supplied_type.clone())
132                },
133            },
134            LoadContext::Image => {
135                // Section 8.2 Sniffing an image context
136                match MimeClassifier::maybe_get_media_type(supplied_type) {
137                    Some(MediaType::Xml) => None,
138                    _ => self.image_classifier.classify(data),
139                }
140                .unwrap_or(supplied_type_or_octet_stream)
141            },
142            LoadContext::AudioVideo => {
143                // Section 8.3 Sniffing an image context
144                match MimeClassifier::maybe_get_media_type(supplied_type) {
145                    Some(MediaType::Xml) => None,
146                    _ => self.audio_video_classifier.classify(data),
147                }
148                .unwrap_or(supplied_type_or_octet_stream)
149            },
150            LoadContext::Plugin => {
151                // 8.4 Sniffing in a plugin context
152                //
153                // This section was *not* finalized in the specs at the time
154                // of this implementation.
155                match *supplied_type {
156                    None => mime::APPLICATION_OCTET_STREAM,
157                    _ => supplied_type_or_octet_stream,
158                }
159            },
160            LoadContext::Style => {
161                // 8.5 Sniffing in a style context
162                //
163                // This section was *not* finalized in the specs at the time
164                // of this implementation.
165                match *supplied_type {
166                    None => mime::TEXT_CSS,
167                    _ => supplied_type_or_octet_stream,
168                }
169            },
170            LoadContext::Script => {
171                // 8.6 Sniffing in a script context
172                //
173                // This section was *not* finalized in the specs at the time
174                // of this implementation.
175                match *supplied_type {
176                    None => mime::TEXT_JAVASCRIPT,
177                    _ => supplied_type_or_octet_stream,
178                }
179            },
180            LoadContext::Font => {
181                // 8.7 Sniffing in a font context
182                match MimeClassifier::maybe_get_media_type(supplied_type) {
183                    Some(MediaType::Xml) => None,
184                    _ => self.font_classifier.classify(data),
185                }
186                .unwrap_or(supplied_type_or_octet_stream)
187            },
188            LoadContext::TextTrack => {
189                // 8.8 Sniffing in a text track context
190                //
191                // This section was *not* finalized in the specs at the time
192                // of this implementation.
193                "text/vtt".parse().unwrap()
194            },
195            LoadContext::CacheManifest => {
196                // 8.9 Sniffing in a cache manifest context
197                //
198                // This section was *not* finalized in the specs at the time
199                // of this implementation.
200                "text/cache-manifest".parse().unwrap()
201            },
202        }
203    }
204
205    pub fn validate(&self) -> Result<(), String> {
206        self.image_classifier.validate()?;
207        self.audio_video_classifier.validate()?;
208        self.scriptable_classifier.validate()?;
209        self.plaintext_classifier.validate()?;
210        self.archive_classifier.validate()?;
211        self.binary_or_plaintext.validate()?;
212        self.font_classifier.validate()?;
213        Ok(())
214    }
215
216    // some sort of iterator over the classifiers might be better?
217    fn sniff_unknown_type(&self, no_sniff_flag: NoSniffFlag, data: &[u8]) -> Mime {
218        let should_sniff_scriptable = no_sniff_flag == NoSniffFlag::Off;
219        let sniffed = if should_sniff_scriptable {
220            self.scriptable_classifier.classify(data)
221        } else {
222            None
223        };
224
225        sniffed
226            .or_else(|| self.plaintext_classifier.classify(data))
227            .or_else(|| self.image_classifier.classify(data))
228            .or_else(|| self.audio_video_classifier.classify(data))
229            .or_else(|| self.archive_classifier.classify(data))
230            .or_else(|| self.binary_or_plaintext.classify(data))
231            .expect("BinaryOrPlaintextClassifier always succeeds")
232    }
233
234    fn sniff_text_or_data<'a>(&'a self, data: &'a [u8]) -> Mime {
235        self.binary_or_plaintext
236            .classify(data)
237            .expect("BinaryOrPlaintextClassifier always succeeds")
238    }
239
240    /// <https://mimesniff.spec.whatwg.org/#xml-mime-type>
241    /// SVG is worth distinguishing from other XML MIME types:
242    /// <https://mimesniff.spec.whatwg.org/#mime-type-miscellaneous>
243    fn is_xml(mt: &Mime) -> bool {
244        !Self::is_image(mt) &&
245            (mt.suffix() == Some(mime::XML) ||
246                mt.essence_str() == "text/xml" ||
247                mt.essence_str() == "application/xml")
248    }
249
250    /// <https://mimesniff.spec.whatwg.org/#html-mime-type>
251    fn is_html(mt: &Mime) -> bool {
252        mt.essence_str() == "text/html"
253    }
254
255    /// <https://mimesniff.spec.whatwg.org/#image-mime-type>
256    fn is_image(mt: &Mime) -> bool {
257        mt.type_() == mime::IMAGE
258    }
259
260    /// <https://mimesniff.spec.whatwg.org/#audio-or-video-mime-type>
261    fn is_audio_video(mt: &Mime) -> bool {
262        mt.type_() == mime::AUDIO ||
263            mt.type_() == mime::VIDEO ||
264            mt.essence_str() == "application/ogg"
265    }
266
267    fn is_explicit_unknown(mt: &Mime) -> bool {
268        mt.type_().as_str() == "unknown" && mt.subtype().as_str() == "unknown" ||
269            mt.type_() == mime::APPLICATION && mt.subtype().as_str() == "unknown" ||
270            mt.type_() == mime::STAR && mt.subtype() == mime::STAR
271    }
272
273    /// <https://mimesniff.spec.whatwg.org/#javascript-mime-type>
274    pub fn is_javascript(mt: &Mime) -> bool {
275        (mt.type_() == mime::APPLICATION &&
276            (["ecmascript", "javascript", "x-ecmascript", "x-javascript"]
277                .contains(&mt.subtype().as_str()))) ||
278            (mt.type_() == mime::TEXT &&
279                ([
280                    "ecmascript",
281                    "javascript",
282                    "javascript1.0",
283                    "javascript1.1",
284                    "javascript1.2",
285                    "javascript1.3",
286                    "javascript1.4",
287                    "javascript1.5",
288                    "jscript",
289                    "livescript",
290                    "x-ecmascript",
291                    "x-javascript",
292                ]
293                .contains(&mt.subtype().as_str())))
294    }
295
296    /// <https://mimesniff.spec.whatwg.org/#json-mime-type>
297    pub fn is_json(mt: &Mime) -> bool {
298        mt.suffix() == Some(mime::JSON) ||
299            (mt.subtype() == mime::JSON &&
300                (mt.type_() == mime::APPLICATION || mt.type_() == mime::TEXT))
301    }
302
303    /// <https://mimesniff.spec.whatwg.org/#font-mime-type>
304    fn is_font(mt: &Mime) -> bool {
305        mt.type_() == mime::FONT ||
306            (mt.type_() == mime::APPLICATION &&
307                ([
308                    "font-cff",
309                    "font-off",
310                    "font-sfnt",
311                    "font-ttf",
312                    "font-woff",
313                    "vnd.ms-fontobject",
314                    "vnd.ms-opentype",
315                ]
316                .contains(&mt.subtype().as_str())))
317    }
318
319    fn is_text(mt: &Mime) -> bool {
320        *mt == mime::TEXT_PLAIN || mt.essence_str() == "text/vtt"
321    }
322
323    fn is_css(mt: &Mime) -> bool {
324        mt.essence_str() == "text/css"
325    }
326
327    pub fn get_media_type(mime: &Mime) -> Option<MediaType> {
328        if MimeClassifier::is_xml(mime) {
329            Some(MediaType::Xml)
330        } else if MimeClassifier::is_html(mime) {
331            Some(MediaType::Html)
332        } else if MimeClassifier::is_image(mime) {
333            Some(MediaType::Image)
334        } else if MimeClassifier::is_audio_video(mime) {
335            Some(MediaType::AudioVideo)
336        } else if MimeClassifier::is_javascript(mime) {
337            Some(MediaType::JavaScript)
338        } else if MimeClassifier::is_font(mime) {
339            Some(MediaType::Font)
340        } else if MimeClassifier::is_json(mime) {
341            Some(MediaType::Json)
342        } else if MimeClassifier::is_text(mime) {
343            Some(MediaType::Text)
344        } else if MimeClassifier::is_css(mime) {
345            Some(MediaType::Css)
346        } else {
347            None
348        }
349    }
350
351    fn maybe_get_media_type(supplied_type: &Option<Mime>) -> Option<MediaType> {
352        supplied_type
353            .as_ref()
354            .and_then(MimeClassifier::get_media_type)
355    }
356}
357
// Interface used for composite types
/// Common interface of everything that can inspect resource bytes and propose a MIME type.
trait MIMEChecker {
    /// Returns the MIME type this checker recognizes in `data`, or `None` when it
    /// does not recognize the bytes.
    fn classify(&self, data: &[u8]) -> Option<Mime>;
    /// Validate the MIME checker configuration
    fn validate(&self) -> Result<(), String>;
}
364
/// A masked byte-signature matcher that reports a fixed MIME type on success.
struct ByteMatcher {
    /// Bytes the masked data must equal.
    pattern: &'static [u8],
    /// Bit mask ANDed with each data byte before it is compared with `pattern`.
    mask: &'static [u8],
    /// Bytes that may be skipped at the start of the data before the signature begins.
    leading_ignore: &'static [u8],
    /// MIME type reported when the signature matches.
    content_type: Mime,
}
371
372impl ByteMatcher {
373    fn matches(&self, data: &[u8]) -> Option<usize> {
374        if data.len() < self.pattern.len() {
375            None
376        } else if data == self.pattern {
377            Some(self.pattern.len())
378        } else {
379            data[..data.len() - self.pattern.len() + 1]
380                .iter()
381                .position(|x| !self.leading_ignore.contains(x))
382                .and_then(|start| {
383                    if data[start..]
384                        .iter()
385                        .zip(self.pattern.iter())
386                        .zip(self.mask.iter())
387                        .all(|((&data, &pattern), &mask)| (data & mask) == pattern)
388                    {
389                        Some(start + self.pattern.len())
390                    } else {
391                        None
392                    }
393                })
394        }
395    }
396}
397
398impl MIMEChecker for ByteMatcher {
399    fn classify(&self, data: &[u8]) -> Option<Mime> {
400        self.matches(data).map(|_| self.content_type.clone())
401    }
402
403    fn validate(&self) -> Result<(), String> {
404        if self.pattern.is_empty() {
405            return Err(format!("Zero length pattern for {:?}", self.content_type));
406        }
407        if self.pattern.len() != self.mask.len() {
408            return Err(format!(
409                "Unequal pattern and mask length for {:?}",
410                self.content_type
411            ));
412        }
413        if self
414            .pattern
415            .iter()
416            .zip(self.mask.iter())
417            .any(|(&pattern, &mask)| pattern & mask != pattern)
418        {
419            return Err(format!(
420                "Pattern not pre-masked for {:?}",
421                self.content_type
422            ));
423        }
424        Ok(())
425    }
426}
427
/// A [`ByteMatcher`] whose signature must additionally be followed by a
/// tag-terminating byte (a space or `>`).
struct TagTerminatedByteMatcher {
    /// The signature that must precede the tag-terminating byte.
    matcher: ByteMatcher,
}
431
432impl MIMEChecker for TagTerminatedByteMatcher {
433    fn classify(&self, data: &[u8]) -> Option<Mime> {
434        self.matcher.matches(data).and_then(|j| {
435            if j < data.len() && (data[j] == b' ' || data[j] == b'>') {
436                Some(self.matcher.content_type.clone())
437            } else {
438                None
439            }
440        })
441    }
442
443    fn validate(&self) -> Result<(), String> {
444        self.matcher.validate()
445    }
446}
447
/// Matcher for the MP4 signature, which depends on the box layout of the data and
/// therefore cannot be expressed as a fixed byte pattern.
pub struct Mp4Matcher;

impl Mp4Matcher {
    /// <https://mimesniff.spec.whatwg.org/#matches-the-signature-for-mp4>
    ///
    /// Returns `true` when `data` starts with an `ftyp` box whose major brand, or one of
    /// whose compatible brands, begins with "mp4". Never panics, whatever `data` contains.
    pub fn matches(&self, data: &[u8]) -> bool {
        // Step 1. Let sequence be the byte sequence to be matched,
        // where sequence[s] is byte s in sequence and sequence[0] is the first byte in sequence.
        // Step 2. Let length be the number of bytes in sequence.
        // Step 3. If length is less than 12, return false.
        if data.len() < 12 {
            return false;
        }

        // Step 4. Let box-size be the four bytes from sequence[0] to sequence[3],
        // interpreted as a 32-bit unsigned big-endian integer.
        let box_size = u32::from_be_bytes([data[0], data[1], data[2], data[3]]) as usize;
        // Step 5. If length is less than box-size or if box-size modulo 4 is not equal to 0,
        // return false.
        if data.len() < box_size || box_size % 4 != 0 {
            return false;
        }

        // Step 6. If the four bytes from sequence[4] to sequence[7] are not equal to
        // 0x66 0x74 0x79 0x70 ("ftyp"), return false.
        if !data[4..].starts_with(b"ftyp") {
            return false;
        }

        // Step 7. If the three bytes from sequence[8] to sequence[10] are equal to
        // 0x6D 0x70 0x34 ("mp4"), return true.
        if data[8..].starts_with(b"mp4") {
            return true;
        }

        // Step 8. Let bytes-read be 16.
        // Step 9. While bytes-read is less than box-size, continuously loop through these steps:
        // Step 10. If the three bytes from sequence[bytes-read] to sequence[bytes-read + 2]
        // are equal to 0x6D 0x70 0x34 ("mp4"), return true.
        // Step 11. Increment bytes-read by 4.
        // Step 12. Return false.
        //
        // The declared box may be smaller than 16 bytes, in which case there are no
        // compatible brands to scan; `get` yields `None` for that empty/inverted range
        // instead of panicking the way direct slicing would.
        data.get(16..box_size).is_some_and(|compatible_brands| {
            compatible_brands
                .chunks_exact(4)
                .any(|brand| brand.starts_with(b"mp4"))
        })
    }
}
492impl MIMEChecker for Mp4Matcher {
493    fn classify(&self, data: &[u8]) -> Option<Mime> {
494        if self.matches(data) {
495            Some("video/mp4".parse().unwrap())
496        } else {
497            None
498        }
499    }
500
501    fn validate(&self) -> Result<(), String> {
502        Ok(())
503    }
504}
505
506struct BinaryOrPlaintextClassifier;
507
508impl BinaryOrPlaintextClassifier {
509    /// <https://mimesniff.spec.whatwg.org/#rules-for-text-or-binary>
510    fn classify_impl(&self, data: &[u8]) -> Mime {
511        // Step 1. Let length be the number of bytes in the resource header.
512        // Step 2. If length is greater than or equal to 2 and
513        // the first 2 bytes of the resource header are equal to 0xFE 0xFF (UTF-16BE BOM)
514        // or 0xFF 0xFE (UTF-16LE BOM), the computed MIME type is "text/plain".
515        // Step 3. If length is greater than or equal to 3
516        // and the first 3 bytes of the resource header are equal to
517        // 0xEF 0xBB 0xBF (UTF-8 BOM), the computed MIME type is "text/plain".
518        if data.starts_with(&[0xFFu8, 0xFEu8]) ||
519            data.starts_with(&[0xFEu8, 0xFFu8]) ||
520            data.starts_with(&[0xEFu8, 0xBBu8, 0xBFu8])
521        {
522            mime::TEXT_PLAIN
523        } else if data.iter().any(|&x| {
524            x <= 0x08u8 ||
525                x == 0x0Bu8 ||
526                (0x0Eu8..=0x1Au8).contains(&x) ||
527                (0x1Cu8..=0x1Fu8).contains(&x)
528        }) {
529            // Step 5. The computed MIME type is "application/octet-stream".
530            mime::APPLICATION_OCTET_STREAM
531        } else {
532            // Step 4. If the resource header contains no binary data bytes,
533            // the computed MIME type is "text/plain".
534            mime::TEXT_PLAIN
535        }
536    }
537}
538impl MIMEChecker for BinaryOrPlaintextClassifier {
539    fn classify(&self, data: &[u8]) -> Option<Mime> {
540        Some(self.classify_impl(data))
541    }
542
543    fn validate(&self) -> Result<(), String> {
544        Ok(())
545    }
546}
/// An ordered group of checkers that are tried one after another.
struct GroupedClassifier {
    /// The checkers, in the order in which they are consulted.
    byte_matchers: Vec<Box<dyn MIMEChecker + Send + Sync>>,
}
550impl GroupedClassifier {
551    fn image_classifer() -> GroupedClassifier {
552        GroupedClassifier {
553            byte_matchers: vec![
554                // Keep this in sync with 'is_supported_mime_type' from
555                // components/style/servo/media_queries.rs
556                Box::new(ByteMatcher::image_x_icon()),
557                Box::new(ByteMatcher::image_x_icon_cursor()),
558                Box::new(ByteMatcher::image_bmp()),
559                Box::new(ByteMatcher::image_gif89a()),
560                Box::new(ByteMatcher::image_gif87a()),
561                Box::new(ByteMatcher::image_webp()),
562                Box::new(ByteMatcher::image_png()),
563                Box::new(ByteMatcher::image_jpeg()),
564            ],
565        }
566    }
567    fn audio_video_classifier() -> GroupedClassifier {
568        GroupedClassifier {
569            byte_matchers: vec![
570                Box::new(ByteMatcher::video_webm()),
571                Box::new(ByteMatcher::audio_basic()),
572                Box::new(ByteMatcher::audio_aiff()),
573                Box::new(ByteMatcher::audio_mpeg()),
574                Box::new(ByteMatcher::application_ogg()),
575                Box::new(ByteMatcher::audio_midi()),
576                Box::new(ByteMatcher::video_avi()),
577                Box::new(ByteMatcher::audio_wave()),
578                Box::new(Mp4Matcher),
579            ],
580        }
581    }
582    fn scriptable_classifier() -> GroupedClassifier {
583        GroupedClassifier {
584            byte_matchers: vec![
585                Box::new(ByteMatcher::text_html_doctype()),
586                Box::new(ByteMatcher::text_html_page()),
587                Box::new(ByteMatcher::text_html_head()),
588                Box::new(ByteMatcher::text_html_script()),
589                Box::new(ByteMatcher::text_html_iframe()),
590                Box::new(ByteMatcher::text_html_h1()),
591                Box::new(ByteMatcher::text_html_div()),
592                Box::new(ByteMatcher::text_html_font()),
593                Box::new(ByteMatcher::text_html_table()),
594                Box::new(ByteMatcher::text_html_a()),
595                Box::new(ByteMatcher::text_html_style()),
596                Box::new(ByteMatcher::text_html_title()),
597                Box::new(ByteMatcher::text_html_b()),
598                Box::new(ByteMatcher::text_html_body()),
599                Box::new(ByteMatcher::text_html_br()),
600                Box::new(ByteMatcher::text_html_p()),
601                Box::new(ByteMatcher::text_html_comment()),
602                Box::new(ByteMatcher::text_xml()),
603                Box::new(ByteMatcher::application_pdf()),
604            ],
605        }
606    }
607    fn plaintext_classifier() -> GroupedClassifier {
608        GroupedClassifier {
609            byte_matchers: vec![
610                Box::new(ByteMatcher::text_plain_utf_8_bom()),
611                Box::new(ByteMatcher::text_plain_utf_16le_bom()),
612                Box::new(ByteMatcher::text_plain_utf_16be_bom()),
613                Box::new(ByteMatcher::application_postscript()),
614            ],
615        }
616    }
617    fn archive_classifier() -> GroupedClassifier {
618        GroupedClassifier {
619            byte_matchers: vec![
620                Box::new(ByteMatcher::application_x_gzip()),
621                Box::new(ByteMatcher::application_zip()),
622                Box::new(ByteMatcher::application_x_rar_compressed()),
623            ],
624        }
625    }
626
627    fn font_classifier() -> GroupedClassifier {
628        GroupedClassifier {
629            byte_matchers: vec![
630                Box::new(ByteMatcher::application_font_woff()),
631                Box::new(ByteMatcher::true_type_collection()),
632                Box::new(ByteMatcher::open_type()),
633                Box::new(ByteMatcher::true_type()),
634                Box::new(ByteMatcher::application_vnd_ms_font_object()),
635            ],
636        }
637    }
638}
639impl MIMEChecker for GroupedClassifier {
640    fn classify(&self, data: &[u8]) -> Option<Mime> {
641        self.byte_matchers
642            .iter()
643            .filter_map(|matcher| matcher.classify(data))
644            .next()
645    }
646
647    fn validate(&self) -> Result<(), String> {
648        for byte_matcher in &self.byte_matchers {
649            byte_matcher.validate()?
650        }
651        Ok(())
652    }
653}
654
655// Contains hard coded byte matchers
656// TODO: These should be configured and not hard coded
657impl ByteMatcher {
658    // A Windows Icon signature
659    fn image_x_icon() -> ByteMatcher {
660        ByteMatcher {
661            pattern: b"\x00\x00\x01\x00",
662            mask: b"\xFF\xFF\xFF\xFF",
663            content_type: "image/x-icon".parse().unwrap(),
664            leading_ignore: &[],
665        }
666    }
667    // A Windows Cursor signature.
668    fn image_x_icon_cursor() -> ByteMatcher {
669        ByteMatcher {
670            pattern: b"\x00\x00\x02\x00",
671            mask: b"\xFF\xFF\xFF\xFF",
672            content_type: "image/x-icon".parse().unwrap(),
673            leading_ignore: &[],
674        }
675    }
676    // The string "BM", a BMP signature.
677    fn image_bmp() -> ByteMatcher {
678        ByteMatcher {
679            pattern: b"BM",
680            mask: b"\xFF\xFF",
681            content_type: mime::IMAGE_BMP,
682            leading_ignore: &[],
683        }
684    }
685    // The string "GIF89a", a GIF signature.
686    fn image_gif89a() -> ByteMatcher {
687        ByteMatcher {
688            pattern: b"GIF89a",
689            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF",
690            content_type: mime::IMAGE_GIF,
691            leading_ignore: &[],
692        }
693    }
694    // The string "GIF87a", a GIF signature.
695    fn image_gif87a() -> ByteMatcher {
696        ByteMatcher {
697            pattern: b"GIF87a",
698            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF",
699            content_type: mime::IMAGE_GIF,
700            leading_ignore: &[],
701        }
702    }
703    // The string "RIFF" followed by four bytes followed by the string "WEBPVP".
704    fn image_webp() -> ByteMatcher {
705        ByteMatcher {
706            pattern: b"RIFF\x00\x00\x00\x00WEBPVP",
707            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
708            content_type: "image/webp".parse().unwrap(),
709            leading_ignore: &[],
710        }
711    }
712    // An error-checking byte followed by the string "PNG" followed by CR LF SUB LF, the PNG
713    // signature.
714    fn image_png() -> ByteMatcher {
715        ByteMatcher {
716            pattern: b"\x89PNG\r\n\x1A\n",
717            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
718            content_type: mime::IMAGE_PNG,
719            leading_ignore: &[],
720        }
721    }
722    // The JPEG Start of Image marker followed by the indicator byte of another marker.
723    fn image_jpeg() -> ByteMatcher {
724        ByteMatcher {
725            pattern: b"\xFF\xD8\xFF",
726            mask: b"\xFF\xFF\xFF",
727            content_type: mime::IMAGE_JPEG,
728            leading_ignore: &[],
729        }
730    }
731    // The WebM signature. [TODO: Use more bytes?]
732    fn video_webm() -> ByteMatcher {
733        ByteMatcher {
734            pattern: b"\x1A\x45\xDF\xA3",
735            mask: b"\xFF\xFF\xFF\xFF",
736            content_type: "video/webm".parse().unwrap(),
737            leading_ignore: &[],
738        }
739    }
740    // The string ".snd", the basic audio signature.
741    fn audio_basic() -> ByteMatcher {
742        ByteMatcher {
743            pattern: b".snd",
744            mask: b"\xFF\xFF\xFF\xFF",
745            content_type: "audio/basic".parse().unwrap(),
746            leading_ignore: &[],
747        }
748    }
749    // The string "FORM" followed by four bytes followed by the string "AIFF", the AIFF signature.
750    fn audio_aiff() -> ByteMatcher {
751        ByteMatcher {
752            pattern: b"FORM\x00\x00\x00\x00AIFF",
753            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
754            content_type: "audio/aiff".parse().unwrap(),
755            leading_ignore: &[],
756        }
757    }
758    // The string "ID3", the ID3v2-tagged MP3 signature.
759    fn audio_mpeg() -> ByteMatcher {
760        ByteMatcher {
761            pattern: b"ID3",
762            mask: b"\xFF\xFF\xFF",
763            content_type: "audio/mpeg".parse().unwrap(),
764            leading_ignore: &[],
765        }
766    }
767    // The string "OggS" followed by NUL, the Ogg container signature.
768    fn application_ogg() -> ByteMatcher {
769        ByteMatcher {
770            pattern: b"OggS\x00",
771            mask: b"\xFF\xFF\xFF\xFF\xFF",
772            content_type: "application/ogg".parse().unwrap(),
773            leading_ignore: &[],
774        }
775    }
776    // The string "MThd" followed by four bytes representing the number 6 in 32 bits (big-endian),
777    // the MIDI signature.
778    fn audio_midi() -> ByteMatcher {
779        ByteMatcher {
780            pattern: b"MThd\x00\x00\x00\x06",
781            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
782            content_type: "audio/midi".parse().unwrap(),
783            leading_ignore: &[],
784        }
785    }
786    // The string "RIFF" followed by four bytes followed by the string "AVI ", the AVI signature.
787    fn video_avi() -> ByteMatcher {
788        ByteMatcher {
789            pattern: b"RIFF\x00\x00\x00\x00AVI ",
790            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
791            content_type: "video/avi".parse().unwrap(),
792            leading_ignore: &[],
793        }
794    }
795    // The string "RIFF" followed by four bytes followed by the string "WAVE", the WAVE signature.
796    fn audio_wave() -> ByteMatcher {
797        ByteMatcher {
798            pattern: b"RIFF\x00\x00\x00\x00WAVE",
799            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
800            content_type: "audio/wave".parse().unwrap(),
801            leading_ignore: &[],
802        }
803    }
804    // doctype terminated with Tag terminating (TT) Byte
805    fn text_html_doctype() -> TagTerminatedByteMatcher {
806        TagTerminatedByteMatcher {
807            matcher: ByteMatcher {
808                pattern: b"<!DOCTYPE HTML",
809                mask: b"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
810                content_type: mime::TEXT_HTML,
811                leading_ignore: b"\t\n\x0C\r ",
812            },
813        }
814    }
815
816    // HTML terminated with Tag terminating (TT) Byte: 0x20 (SP)
817    fn text_html_page() -> TagTerminatedByteMatcher {
818        TagTerminatedByteMatcher {
819            matcher: ByteMatcher {
820                pattern: b"<HTML",
821                mask: b"\xFF\xDF\xDF\xDF\xDF",
822                content_type: mime::TEXT_HTML,
823                leading_ignore: b"\t\n\x0C\r ",
824            },
825        }
826    }
827
828    // head terminated with Tag Terminating (TT) Byte
829    fn text_html_head() -> TagTerminatedByteMatcher {
830        TagTerminatedByteMatcher {
831            matcher: ByteMatcher {
832                pattern: b"<HEAD",
833                mask: b"\xFF\xDF\xDF\xDF\xDF",
834                content_type: mime::TEXT_HTML,
835                leading_ignore: b"\t\n\x0C\r ",
836            },
837        }
838    }
839
840    // script terminated with Tag Terminating (TT) Byte
841    fn text_html_script() -> TagTerminatedByteMatcher {
842        TagTerminatedByteMatcher {
843            matcher: ByteMatcher {
844                pattern: b"<SCRIPT",
845                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
846                content_type: mime::TEXT_HTML,
847                leading_ignore: b"\t\n\x0C\r ",
848            },
849        }
850    }
851
852    // iframe terminated with Tag Terminating (TT) Byte
853    fn text_html_iframe() -> TagTerminatedByteMatcher {
854        TagTerminatedByteMatcher {
855            matcher: ByteMatcher {
856                pattern: b"<IFRAME",
857                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
858                content_type: mime::TEXT_HTML,
859                leading_ignore: b"\t\n\x0C\r ",
860            },
861        }
862    }
863
864    // h1 terminated with Tag Terminating (TT) Byte
865    fn text_html_h1() -> TagTerminatedByteMatcher {
866        TagTerminatedByteMatcher {
867            matcher: ByteMatcher {
868                pattern: b"<H1",
869                mask: b"\xFF\xDF\xFF",
870                content_type: mime::TEXT_HTML,
871                leading_ignore: b"\t\n\x0C\r ",
872            },
873        }
874    }
875
876    // div terminated with Tag Terminating (TT) Byte
877    fn text_html_div() -> TagTerminatedByteMatcher {
878        TagTerminatedByteMatcher {
879            matcher: ByteMatcher {
880                pattern: b"<DIV",
881                mask: b"\xFF\xDF\xDF\xDF",
882                content_type: mime::TEXT_HTML,
883                leading_ignore: b"\t\n\x0C\r ",
884            },
885        }
886    }
887
888    // font terminated with Tag Terminating (TT) Byte
889    fn text_html_font() -> TagTerminatedByteMatcher {
890        TagTerminatedByteMatcher {
891            matcher: ByteMatcher {
892                pattern: b"<FONT",
893                mask: b"\xFF\xDF\xDF\xDF\xDF",
894                content_type: mime::TEXT_HTML,
895                leading_ignore: b"\t\n\x0C\r ",
896            },
897        }
898    }
899
900    // table terminated with Tag Terminating (TT) Byte
901    fn text_html_table() -> TagTerminatedByteMatcher {
902        TagTerminatedByteMatcher {
903            matcher: ByteMatcher {
904                pattern: b"<TABLE",
905                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF",
906                content_type: mime::TEXT_HTML,
907                leading_ignore: b"\t\n\x0C\r ",
908            },
909        }
910    }
911
912    // a terminated with Tag Terminating (TT) Byte
913    fn text_html_a() -> TagTerminatedByteMatcher {
914        TagTerminatedByteMatcher {
915            matcher: ByteMatcher {
916                pattern: b"<A",
917                mask: b"\xFF\xDF",
918                content_type: mime::TEXT_HTML,
919                leading_ignore: b"\t\n\x0C\r ",
920            },
921        }
922    }
923
924    // style terminated with Tag Terminating (TT) Byte
925    fn text_html_style() -> TagTerminatedByteMatcher {
926        TagTerminatedByteMatcher {
927            matcher: ByteMatcher {
928                pattern: b"<STYLE",
929                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF",
930                content_type: mime::TEXT_HTML,
931                leading_ignore: b"\t\n\x0C\r ",
932            },
933        }
934    }
935
936    // title terminated with Tag Terminating (TT) Byte
937    fn text_html_title() -> TagTerminatedByteMatcher {
938        TagTerminatedByteMatcher {
939            matcher: ByteMatcher {
940                pattern: b"<TITLE",
941                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF",
942                content_type: mime::TEXT_HTML,
943                leading_ignore: b"\t\n\x0C\r ",
944            },
945        }
946    }
947
948    // b terminated with Tag Terminating (TT) Byte
949    fn text_html_b() -> TagTerminatedByteMatcher {
950        TagTerminatedByteMatcher {
951            matcher: ByteMatcher {
952                pattern: b"<B",
953                mask: b"\xFF\xDF",
954                content_type: mime::TEXT_HTML,
955                leading_ignore: b"\t\n\x0C\r ",
956            },
957        }
958    }
959
960    // body terminated with Tag Terminating (TT) Byte
961    fn text_html_body() -> TagTerminatedByteMatcher {
962        TagTerminatedByteMatcher {
963            matcher: ByteMatcher {
964                pattern: b"<BODY",
965                mask: b"\xFF\xDF\xDF\xDF\xDF",
966                content_type: mime::TEXT_HTML,
967                leading_ignore: b"\t\n\x0C\r ",
968            },
969        }
970    }
971
972    // br terminated with Tag Terminating (TT) Byte
973    fn text_html_br() -> TagTerminatedByteMatcher {
974        TagTerminatedByteMatcher {
975            matcher: ByteMatcher {
976                pattern: b"<BR",
977                mask: b"\xFF\xDF\xDF",
978                content_type: mime::TEXT_HTML,
979                leading_ignore: b"\t\n\x0C\r ",
980            },
981        }
982    }
983
984    // p terminated with Tag Terminating (TT) Byte
985    fn text_html_p() -> TagTerminatedByteMatcher {
986        TagTerminatedByteMatcher {
987            matcher: ByteMatcher {
988                pattern: b"<P",
989                mask: b"\xFF\xDF",
990                content_type: mime::TEXT_HTML,
991                leading_ignore: b"\t\n\x0C\r ",
992            },
993        }
994    }
995
996    // comment terminated with Tag Terminating (TT) Byte
997    fn text_html_comment() -> TagTerminatedByteMatcher {
998        TagTerminatedByteMatcher {
999            matcher: ByteMatcher {
1000                pattern: b"<!--",
1001                mask: b"\xFF\xFF\xFF\xFF",
1002                content_type: mime::TEXT_HTML,
1003                leading_ignore: b"\t\n\x0C\r ",
1004            },
1005        }
1006    }
1007
1008    // The string "<?xml".
1009    fn text_xml() -> ByteMatcher {
1010        ByteMatcher {
1011            pattern: b"<?xml",
1012            mask: b"\xFF\xFF\xFF\xFF\xFF",
1013            content_type: mime::TEXT_XML,
1014            leading_ignore: b"\t\n\x0C\r ",
1015        }
1016    }
1017    // The string "%PDF-", the PDF signature.
1018    fn application_pdf() -> ByteMatcher {
1019        ByteMatcher {
1020            pattern: b"%PDF-",
1021            mask: b"\xFF\xFF\xFF\xFF\xFF",
1022            content_type: mime::APPLICATION_PDF,
1023            leading_ignore: &[],
1024        }
1025    }
1026    // 34 bytes followed by the string "LP", the Embedded OpenType signature.
1027    fn application_vnd_ms_font_object() -> ByteMatcher {
1028        ByteMatcher {
1029            pattern: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1030                       \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1031                       \x00\x00LP",
1032            mask: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1033                    \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1034                    \x00\x00\xFF\xFF",
1035            content_type: "application/vnd.ms-fontobject".parse().unwrap(),
1036            leading_ignore: &[],
1037        }
1038    }
1039    // 4 bytes representing the version number 1.0, a TrueType signature.
1040    fn true_type() -> ByteMatcher {
1041        ByteMatcher {
1042            pattern: b"\x00\x01\x00\x00",
1043            mask: b"\xFF\xFF\xFF\xFF",
1044            content_type: "application/font-sfnt".parse().unwrap(),
1045            leading_ignore: &[],
1046        }
1047    }
1048    // The string "OTTO", the OpenType signature.
1049    fn open_type() -> ByteMatcher {
1050        ByteMatcher {
1051            pattern: b"OTTO",
1052            mask: b"\xFF\xFF\xFF\xFF",
1053            content_type: "application/font-sfnt".parse().unwrap(),
1054            leading_ignore: &[],
1055        }
1056    }
1057    // The string "ttcf", the TrueType Collection signature.
1058    fn true_type_collection() -> ByteMatcher {
1059        ByteMatcher {
1060            pattern: b"ttcf",
1061            mask: b"\xFF\xFF\xFF\xFF",
1062            content_type: "application/font-sfnt".parse().unwrap(),
1063            leading_ignore: &[],
1064        }
1065    }
1066    // The string "wOFF", the Web Open Font Format signature.
1067    fn application_font_woff() -> ByteMatcher {
1068        ByteMatcher {
1069            pattern: b"wOFF",
1070            mask: b"\xFF\xFF\xFF\xFF",
1071            content_type: "application/font-woff".parse().unwrap(),
1072            leading_ignore: &[],
1073        }
1074    }
1075    // The GZIP archive signature.
1076    fn application_x_gzip() -> ByteMatcher {
1077        ByteMatcher {
1078            pattern: b"\x1F\x8B\x08",
1079            mask: b"\xFF\xFF\xFF",
1080            content_type: "application/x-gzip".parse().unwrap(),
1081            leading_ignore: &[],
1082        }
1083    }
1084    // The string "PK" followed by ETX EOT, the ZIP archive signature.
1085    fn application_zip() -> ByteMatcher {
1086        ByteMatcher {
1087            pattern: b"PK\x03\x04",
1088            mask: b"\xFF\xFF\xFF\xFF",
1089            content_type: "application/zip".parse().unwrap(),
1090            leading_ignore: &[],
1091        }
1092    }
1093    // The string "Rar " followed by SUB BEL NUL, the RAR archive signature.
1094    fn application_x_rar_compressed() -> ByteMatcher {
1095        ByteMatcher {
1096            pattern: b"Rar \x1A\x07\x00",
1097            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
1098            content_type: "application/x-rar-compressed".parse().unwrap(),
1099            leading_ignore: &[],
1100        }
1101    }
1102    // The string "%!PS-Adobe-", the PostScript signature.
1103    fn application_postscript() -> ByteMatcher {
1104        ByteMatcher {
1105            pattern: b"%!PS-Adobe-",
1106            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
1107            content_type: "application/postscript".parse().unwrap(),
1108            leading_ignore: &[],
1109        }
1110    }
1111    // UTF-16BE BOM
1112    fn text_plain_utf_16be_bom() -> ByteMatcher {
1113        ByteMatcher {
1114            pattern: b"\xFE\xFF\x00\x00",
1115            mask: b"\xFF\xFF\x00\x00",
1116            content_type: mime::TEXT_PLAIN,
1117            leading_ignore: &[],
1118        }
1119    }
1120    // UTF-16LE BOM
1121    fn text_plain_utf_16le_bom() -> ByteMatcher {
1122        ByteMatcher {
1123            pattern: b"\xFF\xFE\x00\x00",
1124            mask: b"\xFF\xFF\x00\x00",
1125            content_type: mime::TEXT_PLAIN,
1126            leading_ignore: &[],
1127        }
1128    }
1129    // UTF-8 BOM
1130    fn text_plain_utf_8_bom() -> ByteMatcher {
1131        ByteMatcher {
1132            pattern: b"\xEF\xBB\xBF\x00",
1133            mask: b"\xFF\xFF\xFF\x00",
1134            content_type: mime::TEXT_PLAIN,
1135            leading_ignore: &[],
1136        }
1137    }
1138}