net_traits/
mime_classifier.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5use mime::{self, Mime};
6
7use crate::LoadContext;
8
9pub struct MimeClassifier {
10    image_classifier: GroupedClassifier,
11    audio_video_classifier: GroupedClassifier,
12    scriptable_classifier: GroupedClassifier,
13    plaintext_classifier: GroupedClassifier,
14    archive_classifier: GroupedClassifier,
15    binary_or_plaintext: BinaryOrPlaintextClassifier,
16    font_classifier: GroupedClassifier,
17}
18
/// Coarse category of a MIME type, as reported by
/// `MimeClassifier::get_media_type`.
///
/// The enum is a plain set of unit variants, so it derives the common
/// value-type traits (`Clone`, `Copy`, `Debug`, `Eq`) in addition to
/// `PartialEq`; this lets callers log it and use it in `assert_eq!`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MediaType {
    /// An XML MIME type (images such as SVG are excluded).
    Xml,
    /// `text/html`.
    Html,
    /// An audio or video MIME type.
    AudioVideo,
    /// An image MIME type.
    Image,
    /// A JavaScript MIME type.
    JavaScript,
    /// A JSON MIME type.
    Json,
    /// A font MIME type.
    Font,
    /// Plain text or WebVTT.
    Text,
    /// `text/css`.
    Css,
}
31
/// State of the check-for-apache-bug flag used by the MIME type sniffing
/// algorithm.
///
/// Derives the common value-type traits so the flag can be copied, logged
/// and compared in assertions.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ApacheBugFlag {
    /// The flag is set: the resource is sniffed as text or binary.
    On,
    /// The flag is not set.
    Off,
}
37
38impl ApacheBugFlag {
39    /// <https://mimesniff.spec.whatwg.org/#supplied-mime-type-detection-algorithm>
40    pub fn from_content_type(mime_type: Option<&Mime>) -> ApacheBugFlag {
41        // TODO(36801): also handle charset ISO-8859-1
42        if mime_type.is_some_and(|mime_type| {
43            *mime_type == mime::TEXT_PLAIN || *mime_type == mime::TEXT_PLAIN_UTF_8
44        }) {
45            ApacheBugFlag::On
46        } else {
47            ApacheBugFlag::Off
48        }
49    }
50}
51
/// State of the no-sniff flag used by the MIME type sniffing algorithm.
///
/// Derives the common value-type traits so the flag can be copied, logged
/// and compared in assertions.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NoSniffFlag {
    /// Sniffing is disabled for the resource.
    On,
    /// Sniffing is allowed.
    Off,
}

impl From<bool> for NoSniffFlag {
    /// Maps `true` to [`NoSniffFlag::On`] and `false` to [`NoSniffFlag::Off`].
    fn from(boolean: bool) -> Self {
        match boolean {
            true => NoSniffFlag::On,
            false => NoSniffFlag::Off,
        }
    }
}
67
68impl Default for MimeClassifier {
69    fn default() -> Self {
70        Self {
71            image_classifier: GroupedClassifier::image_classifer(),
72            audio_video_classifier: GroupedClassifier::audio_video_classifier(),
73            scriptable_classifier: GroupedClassifier::scriptable_classifier(),
74            plaintext_classifier: GroupedClassifier::plaintext_classifier(),
75            archive_classifier: GroupedClassifier::archive_classifier(),
76            binary_or_plaintext: BinaryOrPlaintextClassifier,
77            font_classifier: GroupedClassifier::font_classifier(),
78        }
79    }
80}
81
82impl MimeClassifier {
83    /// <https://mimesniff.spec.whatwg.org/#mime-type-sniffing-algorithm>
84    pub fn classify<'a>(
85        &'a self,
86        context: LoadContext,
87        no_sniff_flag: NoSniffFlag,
88        apache_bug_flag: ApacheBugFlag,
89        supplied_type: &Option<Mime>,
90        data: &'a [u8],
91    ) -> Mime {
92        let supplied_type_or_octet_stream = supplied_type
93            .clone()
94            .unwrap_or(mime::APPLICATION_OCTET_STREAM);
95        // Step 1. If the supplied MIME type is an XML MIME type or HTML MIME type,
96        // the computed MIME type is the supplied MIME type.
97        if Self::is_xml(&supplied_type_or_octet_stream) ||
98            Self::is_html(&supplied_type_or_octet_stream)
99        {
100            return supplied_type_or_octet_stream;
101        }
102        match context {
103            LoadContext::Browsing => match *supplied_type {
104                // Step 2. If the supplied MIME type is undefined or if the supplied MIME type’s essence is "unknown/unknown",
105                // "application/unknown", or "*/*", execute the rules for identifying
106                // an unknown MIME type with the sniff-scriptable flag equal to the inverse of the no-sniff flag and abort these steps.
107                None => self.sniff_unknown_type(no_sniff_flag, data),
108                Some(ref supplied_type) => {
109                    if MimeClassifier::is_explicit_unknown(supplied_type) {
110                        return self.sniff_unknown_type(no_sniff_flag, data);
111                    }
112                    // Step 3. If the no-sniff flag is set, the computed MIME type is the supplied MIME type.
113                    // Abort these steps.
114                    if no_sniff_flag == NoSniffFlag::On {
115                        return supplied_type.clone();
116                    }
117                    // Step 4. If the check-for-apache-bug flag is set,
118                    // execute the rules for distinguishing if a resource is text or binary and abort these steps.
119                    if apache_bug_flag == ApacheBugFlag::On {
120                        return self.sniff_text_or_data(data);
121                    }
122                    match MimeClassifier::get_media_type(supplied_type) {
123                        // Step 5. If the supplied MIME type is an image MIME type supported by the user agent,
124                        // let matched-type be the result of executing the image type pattern matching algorithm with
125                        // the resource header as the byte sequence to be matched.
126                        Some(MediaType::Image) => {
127                            // Step 6. If matched-type is not undefined, the computed MIME type is matched-type.
128                            self.image_classifier.classify(data)
129                        },
130                        // Step 7. If the supplied MIME type is an audio or video MIME type supported by the user agent,
131                        // let matched-type be the result of executing the audio or video type pattern matching algorithm
132                        // with the resource header as the byte sequence to be matched.
133                        Some(MediaType::AudioVideo) => {
134                            // Step 8. If matched-type is not undefined, the computed MIME type is matched-type.
135                            self.audio_video_classifier.classify(data)
136                        },
137                        Some(MediaType::Html) | Some(MediaType::Xml) => unreachable!(),
138                        _ => None,
139                    }
140                    // Step 9. The computed MIME type is the supplied MIME type.
141                    .unwrap_or(supplied_type.clone())
142                },
143            },
144            LoadContext::Image => {
145                // Section 8.2 Sniffing an image context
146                match MimeClassifier::maybe_get_media_type(supplied_type) {
147                    Some(MediaType::Xml) => None,
148                    _ => self.image_classifier.classify(data),
149                }
150                .unwrap_or(supplied_type_or_octet_stream)
151            },
152            LoadContext::AudioVideo => {
153                // Section 8.3 Sniffing an image context
154                match MimeClassifier::maybe_get_media_type(supplied_type) {
155                    Some(MediaType::Xml) => None,
156                    _ => self.audio_video_classifier.classify(data),
157                }
158                .unwrap_or(supplied_type_or_octet_stream)
159            },
160            LoadContext::Plugin => {
161                // 8.4 Sniffing in a plugin context
162                //
163                // This section was *not* finalized in the specs at the time
164                // of this implementation.
165                match *supplied_type {
166                    None => mime::APPLICATION_OCTET_STREAM,
167                    _ => supplied_type_or_octet_stream,
168                }
169            },
170            LoadContext::Style => {
171                // 8.5 Sniffing in a style context
172                //
173                // This section was *not* finalized in the specs at the time
174                // of this implementation.
175                supplied_type.clone().unwrap_or_else(|| {
176                    if no_sniff_flag == NoSniffFlag::On {
177                        mime::APPLICATION_OCTET_STREAM
178                    } else {
179                        mime::TEXT_CSS
180                    }
181                })
182            },
183            LoadContext::Script => {
184                // 8.6 Sniffing in a script context
185                //
186                // This section was *not* finalized in the specs at the time
187                // of this implementation.
188                match *supplied_type {
189                    None => mime::TEXT_JAVASCRIPT,
190                    _ => supplied_type_or_octet_stream,
191                }
192            },
193            LoadContext::Font => {
194                // 8.7 Sniffing in a font context
195                match MimeClassifier::maybe_get_media_type(supplied_type) {
196                    Some(MediaType::Xml) => None,
197                    _ => self.font_classifier.classify(data),
198                }
199                .unwrap_or(supplied_type_or_octet_stream)
200            },
201            LoadContext::TextTrack => {
202                // 8.8 Sniffing in a text track context
203                //
204                // This section was *not* finalized in the specs at the time
205                // of this implementation.
206                "text/vtt".parse().unwrap()
207            },
208            LoadContext::CacheManifest => {
209                // 8.9 Sniffing in a cache manifest context
210                //
211                // This section was *not* finalized in the specs at the time
212                // of this implementation.
213                "text/cache-manifest".parse().unwrap()
214            },
215        }
216    }
217
218    pub fn validate(&self) -> Result<(), String> {
219        self.image_classifier.validate()?;
220        self.audio_video_classifier.validate()?;
221        self.scriptable_classifier.validate()?;
222        self.plaintext_classifier.validate()?;
223        self.archive_classifier.validate()?;
224        self.binary_or_plaintext.validate()?;
225        self.font_classifier.validate()?;
226        Ok(())
227    }
228
229    // some sort of iterator over the classifiers might be better?
230    fn sniff_unknown_type(&self, no_sniff_flag: NoSniffFlag, data: &[u8]) -> Mime {
231        let should_sniff_scriptable = no_sniff_flag == NoSniffFlag::Off;
232        let sniffed = if should_sniff_scriptable {
233            self.scriptable_classifier.classify(data)
234        } else {
235            None
236        };
237
238        sniffed
239            .or_else(|| self.plaintext_classifier.classify(data))
240            .or_else(|| self.image_classifier.classify(data))
241            .or_else(|| self.audio_video_classifier.classify(data))
242            .or_else(|| self.archive_classifier.classify(data))
243            .or_else(|| self.binary_or_plaintext.classify(data))
244            .expect("BinaryOrPlaintextClassifier always succeeds")
245    }
246
247    fn sniff_text_or_data<'a>(&'a self, data: &'a [u8]) -> Mime {
248        self.binary_or_plaintext
249            .classify(data)
250            .expect("BinaryOrPlaintextClassifier always succeeds")
251    }
252
253    /// <https://mimesniff.spec.whatwg.org/#xml-mime-type>
254    /// SVG is worth distinguishing from other XML MIME types:
255    /// <https://mimesniff.spec.whatwg.org/#mime-type-miscellaneous>
256    fn is_xml(mt: &Mime) -> bool {
257        !Self::is_image(mt) &&
258            (mt.suffix() == Some(mime::XML) ||
259                mt.essence_str() == "text/xml" ||
260                mt.essence_str() == "application/xml")
261    }
262
263    /// <https://mimesniff.spec.whatwg.org/#html-mime-type>
264    fn is_html(mt: &Mime) -> bool {
265        mt.essence_str() == "text/html"
266    }
267
268    /// <https://mimesniff.spec.whatwg.org/#image-mime-type>
269    fn is_image(mt: &Mime) -> bool {
270        mt.type_() == mime::IMAGE
271    }
272
273    /// <https://mimesniff.spec.whatwg.org/#audio-or-video-mime-type>
274    fn is_audio_video(mt: &Mime) -> bool {
275        mt.type_() == mime::AUDIO ||
276            mt.type_() == mime::VIDEO ||
277            mt.essence_str() == "application/ogg"
278    }
279
280    fn is_explicit_unknown(mt: &Mime) -> bool {
281        mt.type_().as_str() == "unknown" && mt.subtype().as_str() == "unknown" ||
282            mt.type_() == mime::APPLICATION && mt.subtype().as_str() == "unknown" ||
283            mt.type_() == mime::STAR && mt.subtype() == mime::STAR
284    }
285
286    /// <https://mimesniff.spec.whatwg.org/#javascript-mime-type>
287    pub fn is_javascript(mt: &Mime) -> bool {
288        (mt.type_() == mime::APPLICATION &&
289            (["ecmascript", "javascript", "x-ecmascript", "x-javascript"]
290                .contains(&mt.subtype().as_str()))) ||
291            (mt.type_() == mime::TEXT &&
292                ([
293                    "ecmascript",
294                    "javascript",
295                    "javascript1.0",
296                    "javascript1.1",
297                    "javascript1.2",
298                    "javascript1.3",
299                    "javascript1.4",
300                    "javascript1.5",
301                    "jscript",
302                    "livescript",
303                    "x-ecmascript",
304                    "x-javascript",
305                ]
306                .contains(&mt.subtype().as_str())))
307    }
308
309    /// <https://mimesniff.spec.whatwg.org/#json-mime-type>
310    pub fn is_json(mt: &Mime) -> bool {
311        mt.suffix() == Some(mime::JSON) ||
312            mt.essence_str() == "application/json" ||
313            mt.essence_str() == "text/json"
314    }
315
316    /// <https://mimesniff.spec.whatwg.org/#font-mime-type>
317    fn is_font(mt: &Mime) -> bool {
318        mt.type_() == mime::FONT ||
319            (mt.type_() == mime::APPLICATION &&
320                ([
321                    "font-cff",
322                    "font-off",
323                    "font-sfnt",
324                    "font-ttf",
325                    "font-woff",
326                    "vnd.ms-fontobject",
327                    "vnd.ms-opentype",
328                ]
329                .contains(&mt.subtype().as_str())))
330    }
331
332    fn is_text(mt: &Mime) -> bool {
333        *mt == mime::TEXT_PLAIN || mt.essence_str() == "text/vtt"
334    }
335
336    pub fn is_css(mt: &Mime) -> bool {
337        mt.essence_str() == "text/css"
338    }
339
340    pub fn get_media_type(mime: &Mime) -> Option<MediaType> {
341        if MimeClassifier::is_xml(mime) {
342            Some(MediaType::Xml)
343        } else if MimeClassifier::is_html(mime) {
344            Some(MediaType::Html)
345        } else if MimeClassifier::is_image(mime) {
346            Some(MediaType::Image)
347        } else if MimeClassifier::is_audio_video(mime) {
348            Some(MediaType::AudioVideo)
349        } else if MimeClassifier::is_javascript(mime) {
350            Some(MediaType::JavaScript)
351        } else if MimeClassifier::is_font(mime) {
352            Some(MediaType::Font)
353        } else if MimeClassifier::is_json(mime) {
354            Some(MediaType::Json)
355        } else if MimeClassifier::is_text(mime) {
356            Some(MediaType::Text)
357        } else if MimeClassifier::is_css(mime) {
358            Some(MediaType::Css)
359        } else {
360            None
361        }
362    }
363
364    fn maybe_get_media_type(supplied_type: &Option<Mime>) -> Option<MediaType> {
365        supplied_type
366            .as_ref()
367            .and_then(MimeClassifier::get_media_type)
368    }
369}
370
371// Interface used for composite types
372trait MIMEChecker {
373    fn classify(&self, data: &[u8]) -> Option<Mime>;
374    /// Validate the MIME checker configuration
375    fn validate(&self) -> Result<(), String>;
376}
377
378struct ByteMatcher {
379    pattern: &'static [u8],
380    mask: &'static [u8],
381    leading_ignore: &'static [u8],
382    content_type: Mime,
383}
384
385impl ByteMatcher {
386    fn matches(&self, data: &[u8]) -> Option<usize> {
387        if data.len() < self.pattern.len() {
388            None
389        } else if data == self.pattern {
390            Some(self.pattern.len())
391        } else {
392            data[..data.len() - self.pattern.len() + 1]
393                .iter()
394                .position(|x| !self.leading_ignore.contains(x))
395                .and_then(|start| {
396                    if data[start..]
397                        .iter()
398                        .zip(self.pattern.iter())
399                        .zip(self.mask.iter())
400                        .all(|((&data, &pattern), &mask)| (data & mask) == pattern)
401                    {
402                        Some(start + self.pattern.len())
403                    } else {
404                        None
405                    }
406                })
407        }
408    }
409}
410
411impl MIMEChecker for ByteMatcher {
412    fn classify(&self, data: &[u8]) -> Option<Mime> {
413        self.matches(data).map(|_| self.content_type.clone())
414    }
415
416    fn validate(&self) -> Result<(), String> {
417        if self.pattern.is_empty() {
418            return Err(format!("Zero length pattern for {:?}", self.content_type));
419        }
420        if self.pattern.len() != self.mask.len() {
421            return Err(format!(
422                "Unequal pattern and mask length for {:?}",
423                self.content_type
424            ));
425        }
426        if self
427            .pattern
428            .iter()
429            .zip(self.mask.iter())
430            .any(|(&pattern, &mask)| pattern & mask != pattern)
431        {
432            return Err(format!(
433                "Pattern not pre-masked for {:?}",
434                self.content_type
435            ));
436        }
437        Ok(())
438    }
439}
440
441struct TagTerminatedByteMatcher {
442    matcher: ByteMatcher,
443}
444
445impl MIMEChecker for TagTerminatedByteMatcher {
446    fn classify(&self, data: &[u8]) -> Option<Mime> {
447        self.matcher.matches(data).and_then(|j| {
448            if j < data.len() && (data[j] == b' ' || data[j] == b'>') {
449                Some(self.matcher.content_type.clone())
450            } else {
451                None
452            }
453        })
454    }
455
456    fn validate(&self) -> Result<(), String> {
457        self.matcher.validate()
458    }
459}
460
/// Matcher for the MP4 signature, which cannot be expressed as a fixed
/// masked byte pattern.
pub struct Mp4Matcher;

impl Mp4Matcher {
    /// <https://mimesniff.spec.whatwg.org/#matches-the-signature-for-mp4>
    ///
    /// Returns `true` when `data` starts with an `ftyp` box whose major
    /// brand or one of whose compatible brands starts with `mp4`.
    /// Never panics, whatever the contents of `data`.
    pub fn matches(&self, data: &[u8]) -> bool {
        const FTYP: &[u8] = b"ftyp";
        const MP4: &[u8] = b"mp4";

        // Step 1. Let sequence be the byte sequence to be matched,
        // where sequence[s] is byte s in sequence and sequence[0] is the first byte in sequence.
        // Step 2. Let length be the number of bytes in sequence.
        // Step 3. If length is less than 12, return false.
        if data.len() < 12 {
            return false;
        }

        // Step 4. Let box-size be the four bytes from sequence[0] to sequence[3],
        // interpreted as a 32-bit unsigned big-endian integer.
        let raw_box_size = u32::from_be_bytes([data[0], data[1], data[2], data[3]]);
        // A box size that does not even fit in `usize` is necessarily larger
        // than `data`, which step 5 rejects.
        let Ok(box_size) = usize::try_from(raw_box_size) else {
            return false;
        };

        // Step 5. If length is less than box-size or if box-size modulo 4 is not equal to 0,
        // return false.
        if data.len() < box_size || box_size % 4 != 0 {
            return false;
        }

        // Step 6. If the four bytes from sequence[4] to sequence[7] are not equal to
        // 0x66 0x74 0x79 0x70 ("ftyp"), return false.
        if !data[4..].starts_with(FTYP) {
            return false;
        }

        // Step 7. If the three bytes from sequence[8] to sequence[10] are equal to
        // 0x6D 0x70 0x34 ("mp4"), return true.
        if data[8..].starts_with(MP4) {
            return true;
        }

        // Step 8. Let bytes-read be 16.
        // Step 9. While bytes-read is less than box-size, continuously loop through these steps:
        // Step 10. If the three bytes from sequence[bytes-read] to sequence[bytes-read + 2]
        // are equal to 0x6D 0x70 0x34 ("mp4"), return true.
        // Step 11. Increment bytes-read by 4.
        // Step 12. Return false.
        //
        // `get` yields `None` when box-size is smaller than 16 (the loop of
        // step 9 then never runs), whereas indexing `data[16..box_size]`
        // would panic on such input.
        data.get(16..box_size)
            .is_some_and(|brands| brands.chunks(4).any(|brand| brand.starts_with(MP4)))
    }
}
505impl MIMEChecker for Mp4Matcher {
506    fn classify(&self, data: &[u8]) -> Option<Mime> {
507        if self.matches(data) {
508            Some("video/mp4".parse().unwrap())
509        } else {
510            None
511        }
512    }
513
514    fn validate(&self) -> Result<(), String> {
515        Ok(())
516    }
517}
518
519struct BinaryOrPlaintextClassifier;
520
521impl BinaryOrPlaintextClassifier {
522    /// <https://mimesniff.spec.whatwg.org/#rules-for-text-or-binary>
523    fn classify_impl(&self, data: &[u8]) -> Mime {
524        // Step 1. Let length be the number of bytes in the resource header.
525        // Step 2. If length is greater than or equal to 2 and
526        // the first 2 bytes of the resource header are equal to 0xFE 0xFF (UTF-16BE BOM)
527        // or 0xFF 0xFE (UTF-16LE BOM), the computed MIME type is "text/plain".
528        // Step 3. If length is greater than or equal to 3
529        // and the first 3 bytes of the resource header are equal to
530        // 0xEF 0xBB 0xBF (UTF-8 BOM), the computed MIME type is "text/plain".
531        if data.starts_with(&[0xFFu8, 0xFEu8]) ||
532            data.starts_with(&[0xFEu8, 0xFFu8]) ||
533            data.starts_with(&[0xEFu8, 0xBBu8, 0xBFu8])
534        {
535            mime::TEXT_PLAIN
536        } else if data.iter().any(|&x| {
537            x <= 0x08u8 ||
538                x == 0x0Bu8 ||
539                (0x0Eu8..=0x1Au8).contains(&x) ||
540                (0x1Cu8..=0x1Fu8).contains(&x)
541        }) {
542            // Step 5. The computed MIME type is "application/octet-stream".
543            mime::APPLICATION_OCTET_STREAM
544        } else {
545            // Step 4. If the resource header contains no binary data bytes,
546            // the computed MIME type is "text/plain".
547            mime::TEXT_PLAIN
548        }
549    }
550}
551impl MIMEChecker for BinaryOrPlaintextClassifier {
552    fn classify(&self, data: &[u8]) -> Option<Mime> {
553        Some(self.classify_impl(data))
554    }
555
556    fn validate(&self) -> Result<(), String> {
557        Ok(())
558    }
559}
560struct GroupedClassifier {
561    byte_matchers: Vec<Box<dyn MIMEChecker + Send + Sync>>,
562}
563impl GroupedClassifier {
564    fn image_classifer() -> GroupedClassifier {
565        GroupedClassifier {
566            byte_matchers: vec![
567                // Keep this in sync with 'is_supported_mime_type' from
568                // components/style/servo/media_queries.rs
569                Box::new(ByteMatcher::image_x_icon()),
570                Box::new(ByteMatcher::image_x_icon_cursor()),
571                Box::new(ByteMatcher::image_bmp()),
572                Box::new(ByteMatcher::image_gif89a()),
573                Box::new(ByteMatcher::image_gif87a()),
574                Box::new(ByteMatcher::image_webp()),
575                Box::new(ByteMatcher::image_png()),
576                Box::new(ByteMatcher::image_jpeg()),
577            ],
578        }
579    }
580    fn audio_video_classifier() -> GroupedClassifier {
581        GroupedClassifier {
582            byte_matchers: vec![
583                Box::new(ByteMatcher::video_webm()),
584                Box::new(ByteMatcher::audio_basic()),
585                Box::new(ByteMatcher::audio_aiff()),
586                Box::new(ByteMatcher::audio_mpeg()),
587                Box::new(ByteMatcher::application_ogg()),
588                Box::new(ByteMatcher::audio_midi()),
589                Box::new(ByteMatcher::video_avi()),
590                Box::new(ByteMatcher::audio_wave()),
591                Box::new(Mp4Matcher),
592            ],
593        }
594    }
595    fn scriptable_classifier() -> GroupedClassifier {
596        GroupedClassifier {
597            byte_matchers: vec![
598                Box::new(ByteMatcher::text_html_doctype()),
599                Box::new(ByteMatcher::text_html_page()),
600                Box::new(ByteMatcher::text_html_head()),
601                Box::new(ByteMatcher::text_html_script()),
602                Box::new(ByteMatcher::text_html_iframe()),
603                Box::new(ByteMatcher::text_html_h1()),
604                Box::new(ByteMatcher::text_html_div()),
605                Box::new(ByteMatcher::text_html_font()),
606                Box::new(ByteMatcher::text_html_table()),
607                Box::new(ByteMatcher::text_html_a()),
608                Box::new(ByteMatcher::text_html_style()),
609                Box::new(ByteMatcher::text_html_title()),
610                Box::new(ByteMatcher::text_html_b()),
611                Box::new(ByteMatcher::text_html_body()),
612                Box::new(ByteMatcher::text_html_br()),
613                Box::new(ByteMatcher::text_html_p()),
614                Box::new(ByteMatcher::text_html_comment()),
615                Box::new(ByteMatcher::text_xml()),
616                Box::new(ByteMatcher::application_pdf()),
617            ],
618        }
619    }
620    fn plaintext_classifier() -> GroupedClassifier {
621        GroupedClassifier {
622            byte_matchers: vec![
623                Box::new(ByteMatcher::text_plain_utf_8_bom()),
624                Box::new(ByteMatcher::text_plain_utf_16le_bom()),
625                Box::new(ByteMatcher::text_plain_utf_16be_bom()),
626                Box::new(ByteMatcher::application_postscript()),
627            ],
628        }
629    }
630    fn archive_classifier() -> GroupedClassifier {
631        GroupedClassifier {
632            byte_matchers: vec![
633                Box::new(ByteMatcher::application_x_gzip()),
634                Box::new(ByteMatcher::application_zip()),
635                Box::new(ByteMatcher::application_x_rar_compressed()),
636            ],
637        }
638    }
639
640    fn font_classifier() -> GroupedClassifier {
641        GroupedClassifier {
642            byte_matchers: vec![
643                Box::new(ByteMatcher::application_font_woff()),
644                Box::new(ByteMatcher::true_type_collection()),
645                Box::new(ByteMatcher::open_type()),
646                Box::new(ByteMatcher::true_type()),
647                Box::new(ByteMatcher::application_vnd_ms_font_object()),
648            ],
649        }
650    }
651}
652impl MIMEChecker for GroupedClassifier {
653    fn classify(&self, data: &[u8]) -> Option<Mime> {
654        self.byte_matchers
655            .iter()
656            .find_map(|matcher| matcher.classify(data))
657    }
658
659    fn validate(&self) -> Result<(), String> {
660        for byte_matcher in &self.byte_matchers {
661            byte_matcher.validate()?
662        }
663        Ok(())
664    }
665}
666
667// Contains hard coded byte matchers
668// TODO: These should be configured and not hard coded
669impl ByteMatcher {
670    // A Windows Icon signature
671    fn image_x_icon() -> ByteMatcher {
672        ByteMatcher {
673            pattern: b"\x00\x00\x01\x00",
674            mask: b"\xFF\xFF\xFF\xFF",
675            content_type: "image/x-icon".parse().unwrap(),
676            leading_ignore: &[],
677        }
678    }
679    // A Windows Cursor signature.
680    fn image_x_icon_cursor() -> ByteMatcher {
681        ByteMatcher {
682            pattern: b"\x00\x00\x02\x00",
683            mask: b"\xFF\xFF\xFF\xFF",
684            content_type: "image/x-icon".parse().unwrap(),
685            leading_ignore: &[],
686        }
687    }
688    // The string "BM", a BMP signature.
689    fn image_bmp() -> ByteMatcher {
690        ByteMatcher {
691            pattern: b"BM",
692            mask: b"\xFF\xFF",
693            content_type: mime::IMAGE_BMP,
694            leading_ignore: &[],
695        }
696    }
697    // The string "GIF89a", a GIF signature.
698    fn image_gif89a() -> ByteMatcher {
699        ByteMatcher {
700            pattern: b"GIF89a",
701            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF",
702            content_type: mime::IMAGE_GIF,
703            leading_ignore: &[],
704        }
705    }
706    // The string "GIF87a", a GIF signature.
707    fn image_gif87a() -> ByteMatcher {
708        ByteMatcher {
709            pattern: b"GIF87a",
710            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF",
711            content_type: mime::IMAGE_GIF,
712            leading_ignore: &[],
713        }
714    }
715    // The string "RIFF" followed by four bytes followed by the string "WEBPVP".
716    fn image_webp() -> ByteMatcher {
717        ByteMatcher {
718            pattern: b"RIFF\x00\x00\x00\x00WEBPVP",
719            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
720            content_type: "image/webp".parse().unwrap(),
721            leading_ignore: &[],
722        }
723    }
724    // An error-checking byte followed by the string "PNG" followed by CR LF SUB LF, the PNG
725    // signature.
726    fn image_png() -> ByteMatcher {
727        ByteMatcher {
728            pattern: b"\x89PNG\r\n\x1A\n",
729            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
730            content_type: mime::IMAGE_PNG,
731            leading_ignore: &[],
732        }
733    }
734    // The JPEG Start of Image marker followed by the indicator byte of another marker.
735    fn image_jpeg() -> ByteMatcher {
736        ByteMatcher {
737            pattern: b"\xFF\xD8\xFF",
738            mask: b"\xFF\xFF\xFF",
739            content_type: mime::IMAGE_JPEG,
740            leading_ignore: &[],
741        }
742    }
743    // The WebM signature. [TODO: Use more bytes?]
744    fn video_webm() -> ByteMatcher {
745        ByteMatcher {
746            pattern: b"\x1A\x45\xDF\xA3",
747            mask: b"\xFF\xFF\xFF\xFF",
748            content_type: "video/webm".parse().unwrap(),
749            leading_ignore: &[],
750        }
751    }
752    // The string ".snd", the basic audio signature.
753    fn audio_basic() -> ByteMatcher {
754        ByteMatcher {
755            pattern: b".snd",
756            mask: b"\xFF\xFF\xFF\xFF",
757            content_type: "audio/basic".parse().unwrap(),
758            leading_ignore: &[],
759        }
760    }
761    // The string "FORM" followed by four bytes followed by the string "AIFF", the AIFF signature.
762    fn audio_aiff() -> ByteMatcher {
763        ByteMatcher {
764            pattern: b"FORM\x00\x00\x00\x00AIFF",
765            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
766            content_type: "audio/aiff".parse().unwrap(),
767            leading_ignore: &[],
768        }
769    }
770    // The string "ID3", the ID3v2-tagged MP3 signature.
771    fn audio_mpeg() -> ByteMatcher {
772        ByteMatcher {
773            pattern: b"ID3",
774            mask: b"\xFF\xFF\xFF",
775            content_type: "audio/mpeg".parse().unwrap(),
776            leading_ignore: &[],
777        }
778    }
779    // The string "OggS" followed by NUL, the Ogg container signature.
780    fn application_ogg() -> ByteMatcher {
781        ByteMatcher {
782            pattern: b"OggS\x00",
783            mask: b"\xFF\xFF\xFF\xFF\xFF",
784            content_type: "application/ogg".parse().unwrap(),
785            leading_ignore: &[],
786        }
787    }
788    // The string "MThd" followed by four bytes representing the number 6 in 32 bits (big-endian),
789    // the MIDI signature.
790    fn audio_midi() -> ByteMatcher {
791        ByteMatcher {
792            pattern: b"MThd\x00\x00\x00\x06",
793            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
794            content_type: "audio/midi".parse().unwrap(),
795            leading_ignore: &[],
796        }
797    }
798    // The string "RIFF" followed by four bytes followed by the string "AVI ", the AVI signature.
799    fn video_avi() -> ByteMatcher {
800        ByteMatcher {
801            pattern: b"RIFF\x00\x00\x00\x00AVI ",
802            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
803            content_type: "video/avi".parse().unwrap(),
804            leading_ignore: &[],
805        }
806    }
807    // The string "RIFF" followed by four bytes followed by the string "WAVE", the WAVE signature.
808    fn audio_wave() -> ByteMatcher {
809        ByteMatcher {
810            pattern: b"RIFF\x00\x00\x00\x00WAVE",
811            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
812            content_type: "audio/wave".parse().unwrap(),
813            leading_ignore: &[],
814        }
815    }
816    // doctype terminated with Tag terminating (TT) Byte
817    fn text_html_doctype() -> TagTerminatedByteMatcher {
818        TagTerminatedByteMatcher {
819            matcher: ByteMatcher {
820                pattern: b"<!DOCTYPE HTML",
821                mask: b"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
822                content_type: mime::TEXT_HTML,
823                leading_ignore: b"\t\n\x0C\r ",
824            },
825        }
826    }
827
828    // HTML terminated with Tag terminating (TT) Byte: 0x20 (SP)
829    fn text_html_page() -> TagTerminatedByteMatcher {
830        TagTerminatedByteMatcher {
831            matcher: ByteMatcher {
832                pattern: b"<HTML",
833                mask: b"\xFF\xDF\xDF\xDF\xDF",
834                content_type: mime::TEXT_HTML,
835                leading_ignore: b"\t\n\x0C\r ",
836            },
837        }
838    }
839
840    // head terminated with Tag Terminating (TT) Byte
841    fn text_html_head() -> TagTerminatedByteMatcher {
842        TagTerminatedByteMatcher {
843            matcher: ByteMatcher {
844                pattern: b"<HEAD",
845                mask: b"\xFF\xDF\xDF\xDF\xDF",
846                content_type: mime::TEXT_HTML,
847                leading_ignore: b"\t\n\x0C\r ",
848            },
849        }
850    }
851
852    // script terminated with Tag Terminating (TT) Byte
853    fn text_html_script() -> TagTerminatedByteMatcher {
854        TagTerminatedByteMatcher {
855            matcher: ByteMatcher {
856                pattern: b"<SCRIPT",
857                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
858                content_type: mime::TEXT_HTML,
859                leading_ignore: b"\t\n\x0C\r ",
860            },
861        }
862    }
863
864    // iframe terminated with Tag Terminating (TT) Byte
865    fn text_html_iframe() -> TagTerminatedByteMatcher {
866        TagTerminatedByteMatcher {
867            matcher: ByteMatcher {
868                pattern: b"<IFRAME",
869                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
870                content_type: mime::TEXT_HTML,
871                leading_ignore: b"\t\n\x0C\r ",
872            },
873        }
874    }
875
876    // h1 terminated with Tag Terminating (TT) Byte
877    fn text_html_h1() -> TagTerminatedByteMatcher {
878        TagTerminatedByteMatcher {
879            matcher: ByteMatcher {
880                pattern: b"<H1",
881                mask: b"\xFF\xDF\xFF",
882                content_type: mime::TEXT_HTML,
883                leading_ignore: b"\t\n\x0C\r ",
884            },
885        }
886    }
887
888    // div terminated with Tag Terminating (TT) Byte
889    fn text_html_div() -> TagTerminatedByteMatcher {
890        TagTerminatedByteMatcher {
891            matcher: ByteMatcher {
892                pattern: b"<DIV",
893                mask: b"\xFF\xDF\xDF\xDF",
894                content_type: mime::TEXT_HTML,
895                leading_ignore: b"\t\n\x0C\r ",
896            },
897        }
898    }
899
900    // font terminated with Tag Terminating (TT) Byte
901    fn text_html_font() -> TagTerminatedByteMatcher {
902        TagTerminatedByteMatcher {
903            matcher: ByteMatcher {
904                pattern: b"<FONT",
905                mask: b"\xFF\xDF\xDF\xDF\xDF",
906                content_type: mime::TEXT_HTML,
907                leading_ignore: b"\t\n\x0C\r ",
908            },
909        }
910    }
911
912    // table terminated with Tag Terminating (TT) Byte
913    fn text_html_table() -> TagTerminatedByteMatcher {
914        TagTerminatedByteMatcher {
915            matcher: ByteMatcher {
916                pattern: b"<TABLE",
917                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF",
918                content_type: mime::TEXT_HTML,
919                leading_ignore: b"\t\n\x0C\r ",
920            },
921        }
922    }
923
924    // a terminated with Tag Terminating (TT) Byte
925    fn text_html_a() -> TagTerminatedByteMatcher {
926        TagTerminatedByteMatcher {
927            matcher: ByteMatcher {
928                pattern: b"<A",
929                mask: b"\xFF\xDF",
930                content_type: mime::TEXT_HTML,
931                leading_ignore: b"\t\n\x0C\r ",
932            },
933        }
934    }
935
936    // style terminated with Tag Terminating (TT) Byte
937    fn text_html_style() -> TagTerminatedByteMatcher {
938        TagTerminatedByteMatcher {
939            matcher: ByteMatcher {
940                pattern: b"<STYLE",
941                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF",
942                content_type: mime::TEXT_HTML,
943                leading_ignore: b"\t\n\x0C\r ",
944            },
945        }
946    }
947
948    // title terminated with Tag Terminating (TT) Byte
949    fn text_html_title() -> TagTerminatedByteMatcher {
950        TagTerminatedByteMatcher {
951            matcher: ByteMatcher {
952                pattern: b"<TITLE",
953                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF",
954                content_type: mime::TEXT_HTML,
955                leading_ignore: b"\t\n\x0C\r ",
956            },
957        }
958    }
959
960    // b terminated with Tag Terminating (TT) Byte
961    fn text_html_b() -> TagTerminatedByteMatcher {
962        TagTerminatedByteMatcher {
963            matcher: ByteMatcher {
964                pattern: b"<B",
965                mask: b"\xFF\xDF",
966                content_type: mime::TEXT_HTML,
967                leading_ignore: b"\t\n\x0C\r ",
968            },
969        }
970    }
971
972    // body terminated with Tag Terminating (TT) Byte
973    fn text_html_body() -> TagTerminatedByteMatcher {
974        TagTerminatedByteMatcher {
975            matcher: ByteMatcher {
976                pattern: b"<BODY",
977                mask: b"\xFF\xDF\xDF\xDF\xDF",
978                content_type: mime::TEXT_HTML,
979                leading_ignore: b"\t\n\x0C\r ",
980            },
981        }
982    }
983
984    // br terminated with Tag Terminating (TT) Byte
985    fn text_html_br() -> TagTerminatedByteMatcher {
986        TagTerminatedByteMatcher {
987            matcher: ByteMatcher {
988                pattern: b"<BR",
989                mask: b"\xFF\xDF\xDF",
990                content_type: mime::TEXT_HTML,
991                leading_ignore: b"\t\n\x0C\r ",
992            },
993        }
994    }
995
996    // p terminated with Tag Terminating (TT) Byte
997    fn text_html_p() -> TagTerminatedByteMatcher {
998        TagTerminatedByteMatcher {
999            matcher: ByteMatcher {
1000                pattern: b"<P",
1001                mask: b"\xFF\xDF",
1002                content_type: mime::TEXT_HTML,
1003                leading_ignore: b"\t\n\x0C\r ",
1004            },
1005        }
1006    }
1007
1008    // comment terminated with Tag Terminating (TT) Byte
1009    fn text_html_comment() -> TagTerminatedByteMatcher {
1010        TagTerminatedByteMatcher {
1011            matcher: ByteMatcher {
1012                pattern: b"<!--",
1013                mask: b"\xFF\xFF\xFF\xFF",
1014                content_type: mime::TEXT_HTML,
1015                leading_ignore: b"\t\n\x0C\r ",
1016            },
1017        }
1018    }
1019
1020    // The string "<?xml".
1021    fn text_xml() -> ByteMatcher {
1022        ByteMatcher {
1023            pattern: b"<?xml",
1024            mask: b"\xFF\xFF\xFF\xFF\xFF",
1025            content_type: mime::TEXT_XML,
1026            leading_ignore: b"\t\n\x0C\r ",
1027        }
1028    }
1029    // The string "%PDF-", the PDF signature.
1030    fn application_pdf() -> ByteMatcher {
1031        ByteMatcher {
1032            pattern: b"%PDF-",
1033            mask: b"\xFF\xFF\xFF\xFF\xFF",
1034            content_type: mime::APPLICATION_PDF,
1035            leading_ignore: &[],
1036        }
1037    }
1038    // 34 bytes followed by the string "LP", the Embedded OpenType signature.
1039    fn application_vnd_ms_font_object() -> ByteMatcher {
1040        ByteMatcher {
1041            pattern: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1042                       \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1043                       \x00\x00LP",
1044            mask: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1045                    \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1046                    \x00\x00\xFF\xFF",
1047            content_type: "application/vnd.ms-fontobject".parse().unwrap(),
1048            leading_ignore: &[],
1049        }
1050    }
1051    // 4 bytes representing the version number 1.0, a TrueType signature.
1052    fn true_type() -> ByteMatcher {
1053        ByteMatcher {
1054            pattern: b"\x00\x01\x00\x00",
1055            mask: b"\xFF\xFF\xFF\xFF",
1056            content_type: "application/font-sfnt".parse().unwrap(),
1057            leading_ignore: &[],
1058        }
1059    }
1060    // The string "OTTO", the OpenType signature.
1061    fn open_type() -> ByteMatcher {
1062        ByteMatcher {
1063            pattern: b"OTTO",
1064            mask: b"\xFF\xFF\xFF\xFF",
1065            content_type: "application/font-sfnt".parse().unwrap(),
1066            leading_ignore: &[],
1067        }
1068    }
1069    // The string "ttcf", the TrueType Collection signature.
1070    fn true_type_collection() -> ByteMatcher {
1071        ByteMatcher {
1072            pattern: b"ttcf",
1073            mask: b"\xFF\xFF\xFF\xFF",
1074            content_type: "application/font-sfnt".parse().unwrap(),
1075            leading_ignore: &[],
1076        }
1077    }
1078    // The string "wOFF", the Web Open Font Format signature.
1079    fn application_font_woff() -> ByteMatcher {
1080        ByteMatcher {
1081            pattern: b"wOFF",
1082            mask: b"\xFF\xFF\xFF\xFF",
1083            content_type: "application/font-woff".parse().unwrap(),
1084            leading_ignore: &[],
1085        }
1086    }
1087    // The GZIP archive signature.
1088    fn application_x_gzip() -> ByteMatcher {
1089        ByteMatcher {
1090            pattern: b"\x1F\x8B\x08",
1091            mask: b"\xFF\xFF\xFF",
1092            content_type: "application/x-gzip".parse().unwrap(),
1093            leading_ignore: &[],
1094        }
1095    }
1096    // The string "PK" followed by ETX EOT, the ZIP archive signature.
1097    fn application_zip() -> ByteMatcher {
1098        ByteMatcher {
1099            pattern: b"PK\x03\x04",
1100            mask: b"\xFF\xFF\xFF\xFF",
1101            content_type: "application/zip".parse().unwrap(),
1102            leading_ignore: &[],
1103        }
1104    }
1105    // The string "Rar " followed by SUB BEL NUL, the RAR archive signature.
1106    fn application_x_rar_compressed() -> ByteMatcher {
1107        ByteMatcher {
1108            pattern: b"Rar \x1A\x07\x00",
1109            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
1110            content_type: "application/x-rar-compressed".parse().unwrap(),
1111            leading_ignore: &[],
1112        }
1113    }
1114    // The string "%!PS-Adobe-", the PostScript signature.
1115    fn application_postscript() -> ByteMatcher {
1116        ByteMatcher {
1117            pattern: b"%!PS-Adobe-",
1118            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
1119            content_type: "application/postscript".parse().unwrap(),
1120            leading_ignore: &[],
1121        }
1122    }
1123    // UTF-16BE BOM
1124    fn text_plain_utf_16be_bom() -> ByteMatcher {
1125        ByteMatcher {
1126            pattern: b"\xFE\xFF\x00\x00",
1127            mask: b"\xFF\xFF\x00\x00",
1128            content_type: mime::TEXT_PLAIN,
1129            leading_ignore: &[],
1130        }
1131    }
1132    // UTF-16LE BOM
1133    fn text_plain_utf_16le_bom() -> ByteMatcher {
1134        ByteMatcher {
1135            pattern: b"\xFF\xFE\x00\x00",
1136            mask: b"\xFF\xFF\x00\x00",
1137            content_type: mime::TEXT_PLAIN,
1138            leading_ignore: &[],
1139        }
1140    }
1141    // UTF-8 BOM
1142    fn text_plain_utf_8_bom() -> ByteMatcher {
1143        ByteMatcher {
1144            pattern: b"\xEF\xBB\xBF\x00",
1145            mask: b"\xFF\xFF\xFF\x00",
1146            content_type: mime::TEXT_PLAIN,
1147            leading_ignore: &[],
1148        }
1149    }
1150}