net_traits/
mime_classifier.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5use mime::{self, Mime};
6
7use crate::LoadContext;
8
/// MIME sniffer holding one classifier per family of byte signatures.
///
/// The fields are consulted by [`MimeClassifier::classify`] and its private
/// helpers; [`MimeClassifier::validate`] checks all of them.
pub struct MimeClassifier {
    /// Signatures of image formats.
    image_classifier: GroupedClassifier,
    /// Signatures of audio and video formats.
    audio_video_classifier: GroupedClassifier,
    /// Signatures only tried when sniffing of scriptable types is allowed.
    scriptable_classifier: GroupedClassifier,
    /// Signatures tried right after the scriptable ones for unknown types.
    plaintext_classifier: GroupedClassifier,
    /// Signatures of archive formats.
    archive_classifier: GroupedClassifier,
    /// Last-resort text-versus-binary decision; always produces a type.
    binary_or_plaintext: BinaryOrPlaintextClassifier,
    /// Signatures of font formats.
    font_classifier: GroupedClassifier,
}
18
/// Coarse category of a MIME type, as computed by
/// `MimeClassifier::get_media_type`.
#[derive(PartialEq)]
pub enum MediaType {
    Xml,
    Html,
    AudioVideo,
    Image,
    JavaScript,
    Json,
    Font,
    Text,
    Css,
}
31
32#[derive(PartialEq)]
33pub enum ApacheBugFlag {
34    On,
35    Off,
36}
37
38impl ApacheBugFlag {
39    /// <https://mimesniff.spec.whatwg.org/#supplied-mime-type-detection-algorithm>
40    pub fn from_content_type(mime_type: Option<&Mime>) -> ApacheBugFlag {
41        // TODO(36801): also handle charset ISO-8859-1
42        if mime_type.is_some_and(|mime_type| {
43            *mime_type == mime::TEXT_PLAIN || *mime_type == mime::TEXT_PLAIN_UTF_8
44        }) {
45            ApacheBugFlag::On
46        } else {
47            ApacheBugFlag::Off
48        }
49    }
50}
51
/// State of the "no-sniff" flag passed to `MimeClassifier::classify`.
///
/// When `On`, a supplied type that is not an explicit "unknown" type is
/// returned unchanged in a browsing context, and scriptable signatures are
/// not tried when sniffing an unknown type.
#[derive(PartialEq)]
pub enum NoSniffFlag {
    On,
    Off,
}
57
58impl Default for MimeClassifier {
59    fn default() -> Self {
60        Self {
61            image_classifier: GroupedClassifier::image_classifer(),
62            audio_video_classifier: GroupedClassifier::audio_video_classifier(),
63            scriptable_classifier: GroupedClassifier::scriptable_classifier(),
64            plaintext_classifier: GroupedClassifier::plaintext_classifier(),
65            archive_classifier: GroupedClassifier::archive_classifier(),
66            binary_or_plaintext: BinaryOrPlaintextClassifier,
67            font_classifier: GroupedClassifier::font_classifier(),
68        }
69    }
70}
71
72impl MimeClassifier {
73    /// <https://mimesniff.spec.whatwg.org/#mime-type-sniffing-algorithm>
74    pub fn classify<'a>(
75        &'a self,
76        context: LoadContext,
77        no_sniff_flag: NoSniffFlag,
78        apache_bug_flag: ApacheBugFlag,
79        supplied_type: &Option<Mime>,
80        data: &'a [u8],
81    ) -> Mime {
82        let supplied_type_or_octet_stream = supplied_type
83            .clone()
84            .unwrap_or(mime::APPLICATION_OCTET_STREAM);
85        // Step 1. If the supplied MIME type is an XML MIME type or HTML MIME type,
86        // the computed MIME type is the supplied MIME type.
87        if Self::is_xml(&supplied_type_or_octet_stream) ||
88            Self::is_html(&supplied_type_or_octet_stream)
89        {
90            return supplied_type_or_octet_stream;
91        }
92        match context {
93            LoadContext::Browsing => match *supplied_type {
94                // Step 2. If the supplied MIME type is undefined or if the supplied MIME type’s essence is "unknown/unknown",
95                // "application/unknown", or "*/*", execute the rules for identifying
96                // an unknown MIME type with the sniff-scriptable flag equal to the inverse of the no-sniff flag and abort these steps.
97                None => self.sniff_unknown_type(no_sniff_flag, data),
98                Some(ref supplied_type) => {
99                    if MimeClassifier::is_explicit_unknown(supplied_type) {
100                        return self.sniff_unknown_type(no_sniff_flag, data);
101                    }
102                    // Step 3. If the no-sniff flag is set, the computed MIME type is the supplied MIME type.
103                    // Abort these steps.
104                    if no_sniff_flag == NoSniffFlag::On {
105                        return supplied_type.clone();
106                    }
107                    // Step 4. If the check-for-apache-bug flag is set,
108                    // execute the rules for distinguishing if a resource is text or binary and abort these steps.
109                    if apache_bug_flag == ApacheBugFlag::On {
110                        return self.sniff_text_or_data(data);
111                    }
112                    match MimeClassifier::get_media_type(supplied_type) {
113                        // Step 5. If the supplied MIME type is an image MIME type supported by the user agent,
114                        // let matched-type be the result of executing the image type pattern matching algorithm with
115                        // the resource header as the byte sequence to be matched.
116                        Some(MediaType::Image) => {
117                            // Step 6. If matched-type is not undefined, the computed MIME type is matched-type.
118                            self.image_classifier.classify(data)
119                        },
120                        // Step 7. If the supplied MIME type is an audio or video MIME type supported by the user agent,
121                        // let matched-type be the result of executing the audio or video type pattern matching algorithm
122                        // with the resource header as the byte sequence to be matched.
123                        Some(MediaType::AudioVideo) => {
124                            // Step 8. If matched-type is not undefined, the computed MIME type is matched-type.
125                            self.audio_video_classifier.classify(data)
126                        },
127                        Some(MediaType::Html) | Some(MediaType::Xml) => unreachable!(),
128                        _ => None,
129                    }
130                    // Step 9. The computed MIME type is the supplied MIME type.
131                    .unwrap_or(supplied_type.clone())
132                },
133            },
134            LoadContext::Image => {
135                // Section 8.2 Sniffing an image context
136                match MimeClassifier::maybe_get_media_type(supplied_type) {
137                    Some(MediaType::Xml) => None,
138                    _ => self.image_classifier.classify(data),
139                }
140                .unwrap_or(supplied_type_or_octet_stream)
141            },
142            LoadContext::AudioVideo => {
143                // Section 8.3 Sniffing an image context
144                match MimeClassifier::maybe_get_media_type(supplied_type) {
145                    Some(MediaType::Xml) => None,
146                    _ => self.audio_video_classifier.classify(data),
147                }
148                .unwrap_or(supplied_type_or_octet_stream)
149            },
150            LoadContext::Plugin => {
151                // 8.4 Sniffing in a plugin context
152                //
153                // This section was *not* finalized in the specs at the time
154                // of this implementation.
155                match *supplied_type {
156                    None => mime::APPLICATION_OCTET_STREAM,
157                    _ => supplied_type_or_octet_stream,
158                }
159            },
160            LoadContext::Style => {
161                // 8.5 Sniffing in a style context
162                //
163                // This section was *not* finalized in the specs at the time
164                // of this implementation.
165                match *supplied_type {
166                    None => mime::TEXT_CSS,
167                    _ => supplied_type_or_octet_stream,
168                }
169            },
170            LoadContext::Script => {
171                // 8.6 Sniffing in a script context
172                //
173                // This section was *not* finalized in the specs at the time
174                // of this implementation.
175                match *supplied_type {
176                    None => mime::TEXT_JAVASCRIPT,
177                    _ => supplied_type_or_octet_stream,
178                }
179            },
180            LoadContext::Font => {
181                // 8.7 Sniffing in a font context
182                match MimeClassifier::maybe_get_media_type(supplied_type) {
183                    Some(MediaType::Xml) => None,
184                    _ => self.font_classifier.classify(data),
185                }
186                .unwrap_or(supplied_type_or_octet_stream)
187            },
188            LoadContext::TextTrack => {
189                // 8.8 Sniffing in a text track context
190                //
191                // This section was *not* finalized in the specs at the time
192                // of this implementation.
193                "text/vtt".parse().unwrap()
194            },
195            LoadContext::CacheManifest => {
196                // 8.9 Sniffing in a cache manifest context
197                //
198                // This section was *not* finalized in the specs at the time
199                // of this implementation.
200                "text/cache-manifest".parse().unwrap()
201            },
202        }
203    }
204
205    pub fn validate(&self) -> Result<(), String> {
206        self.image_classifier.validate()?;
207        self.audio_video_classifier.validate()?;
208        self.scriptable_classifier.validate()?;
209        self.plaintext_classifier.validate()?;
210        self.archive_classifier.validate()?;
211        self.binary_or_plaintext.validate()?;
212        self.font_classifier.validate()?;
213        Ok(())
214    }
215
216    // some sort of iterator over the classifiers might be better?
217    fn sniff_unknown_type(&self, no_sniff_flag: NoSniffFlag, data: &[u8]) -> Mime {
218        let should_sniff_scriptable = no_sniff_flag == NoSniffFlag::Off;
219        let sniffed = if should_sniff_scriptable {
220            self.scriptable_classifier.classify(data)
221        } else {
222            None
223        };
224
225        sniffed
226            .or_else(|| self.plaintext_classifier.classify(data))
227            .or_else(|| self.image_classifier.classify(data))
228            .or_else(|| self.audio_video_classifier.classify(data))
229            .or_else(|| self.archive_classifier.classify(data))
230            .or_else(|| self.binary_or_plaintext.classify(data))
231            .expect("BinaryOrPlaintextClassifier always succeeds")
232    }
233
234    fn sniff_text_or_data<'a>(&'a self, data: &'a [u8]) -> Mime {
235        self.binary_or_plaintext
236            .classify(data)
237            .expect("BinaryOrPlaintextClassifier always succeeds")
238    }
239
240    /// <https://mimesniff.spec.whatwg.org/#xml-mime-type>
241    fn is_xml(mt: &Mime) -> bool {
242        mt.suffix() == Some(mime::XML) ||
243            mt.essence_str() == "text/xml" ||
244            mt.essence_str() == "application/xml"
245    }
246
247    /// <https://mimesniff.spec.whatwg.org/#html-mime-type>
248    fn is_html(mt: &Mime) -> bool {
249        mt.essence_str() == "text/html"
250    }
251
252    /// <https://mimesniff.spec.whatwg.org/#image-mime-type>
253    fn is_image(mt: &Mime) -> bool {
254        mt.type_() == mime::IMAGE
255    }
256
257    /// <https://mimesniff.spec.whatwg.org/#audio-or-video-mime-type>
258    fn is_audio_video(mt: &Mime) -> bool {
259        mt.type_() == mime::AUDIO ||
260            mt.type_() == mime::VIDEO ||
261            mt.essence_str() == "application/ogg"
262    }
263
264    fn is_explicit_unknown(mt: &Mime) -> bool {
265        mt.type_().as_str() == "unknown" && mt.subtype().as_str() == "unknown" ||
266            mt.type_() == mime::APPLICATION && mt.subtype().as_str() == "unknown" ||
267            mt.type_() == mime::STAR && mt.subtype() == mime::STAR
268    }
269
270    /// <https://mimesniff.spec.whatwg.org/#javascript-mime-type>
271    fn is_javascript(mt: &Mime) -> bool {
272        (mt.type_() == mime::APPLICATION &&
273            (["ecmascript", "javascript", "x-ecmascript", "x-javascript"]
274                .contains(&mt.subtype().as_str()))) ||
275            (mt.type_() == mime::TEXT &&
276                ([
277                    "ecmascript",
278                    "javascript",
279                    "javascript1.0",
280                    "javascript1.1",
281                    "javascript1.2",
282                    "javascript1.3",
283                    "javascript1.4",
284                    "javascript1.5",
285                    "jscript",
286                    "livescript",
287                    "x-ecmascript",
288                    "x-javascript",
289                ]
290                .contains(&mt.subtype().as_str())))
291    }
292
293    /// <https://mimesniff.spec.whatwg.org/#json-mime-type>
294    fn is_json(mt: &Mime) -> bool {
295        mt.suffix() == Some(mime::JSON) ||
296            (mt.subtype() == mime::JSON &&
297                (mt.type_() == mime::APPLICATION || mt.type_() == mime::TEXT))
298    }
299
300    /// <https://mimesniff.spec.whatwg.org/#font-mime-type>
301    fn is_font(mt: &Mime) -> bool {
302        mt.type_() == mime::FONT ||
303            (mt.type_() == mime::APPLICATION &&
304                ([
305                    "font-cff",
306                    "font-off",
307                    "font-sfnt",
308                    "font-ttf",
309                    "font-woff",
310                    "vnd.ms-fontobject",
311                    "vnd.ms-opentype",
312                ]
313                .contains(&mt.subtype().as_str())))
314    }
315
316    fn is_text(mt: &Mime) -> bool {
317        *mt == mime::TEXT_PLAIN || mt.essence_str() == "text/vtt"
318    }
319
320    fn is_css(mt: &Mime) -> bool {
321        mt.essence_str() == "text/css"
322    }
323
324    pub fn get_media_type(mime: &Mime) -> Option<MediaType> {
325        if MimeClassifier::is_xml(mime) {
326            Some(MediaType::Xml)
327        } else if MimeClassifier::is_html(mime) {
328            Some(MediaType::Html)
329        } else if MimeClassifier::is_image(mime) {
330            Some(MediaType::Image)
331        } else if MimeClassifier::is_audio_video(mime) {
332            Some(MediaType::AudioVideo)
333        } else if MimeClassifier::is_javascript(mime) {
334            Some(MediaType::JavaScript)
335        } else if MimeClassifier::is_font(mime) {
336            Some(MediaType::Font)
337        } else if MimeClassifier::is_json(mime) {
338            Some(MediaType::Json)
339        } else if MimeClassifier::is_text(mime) {
340            Some(MediaType::Text)
341        } else if MimeClassifier::is_css(mime) {
342            Some(MediaType::Css)
343        } else {
344            None
345        }
346    }
347
348    fn maybe_get_media_type(supplied_type: &Option<Mime>) -> Option<MediaType> {
349        supplied_type
350            .as_ref()
351            .and_then(MimeClassifier::get_media_type)
352    }
353}
354
// Interface used for composite types
/// Common interface of the individual matchers and of the groups built from
/// them.
trait MIMEChecker {
    /// Returns the MIME type this checker recognises in `data`, or `None`
    /// when it does not match.
    fn classify(&self, data: &[u8]) -> Option<Mime>;
    /// Validate the MIME checker configuration
    fn validate(&self) -> Result<(), String>;
}
361
362struct ByteMatcher {
363    pattern: &'static [u8],
364    mask: &'static [u8],
365    leading_ignore: &'static [u8],
366    content_type: Mime,
367}
368
369impl ByteMatcher {
370    fn matches(&self, data: &[u8]) -> Option<usize> {
371        if data.len() < self.pattern.len() {
372            None
373        } else if data == self.pattern {
374            Some(self.pattern.len())
375        } else {
376            data[..data.len() - self.pattern.len() + 1]
377                .iter()
378                .position(|x| !self.leading_ignore.contains(x))
379                .and_then(|start| {
380                    if data[start..]
381                        .iter()
382                        .zip(self.pattern.iter())
383                        .zip(self.mask.iter())
384                        .all(|((&data, &pattern), &mask)| (data & mask) == pattern)
385                    {
386                        Some(start + self.pattern.len())
387                    } else {
388                        None
389                    }
390                })
391        }
392    }
393}
394
395impl MIMEChecker for ByteMatcher {
396    fn classify(&self, data: &[u8]) -> Option<Mime> {
397        self.matches(data).map(|_| self.content_type.clone())
398    }
399
400    fn validate(&self) -> Result<(), String> {
401        if self.pattern.is_empty() {
402            return Err(format!("Zero length pattern for {:?}", self.content_type));
403        }
404        if self.pattern.len() != self.mask.len() {
405            return Err(format!(
406                "Unequal pattern and mask length for {:?}",
407                self.content_type
408            ));
409        }
410        if self
411            .pattern
412            .iter()
413            .zip(self.mask.iter())
414            .any(|(&pattern, &mask)| pattern & mask != pattern)
415        {
416            return Err(format!(
417                "Pattern not pre-masked for {:?}",
418                self.content_type
419            ));
420        }
421        Ok(())
422    }
423}
424
425struct TagTerminatedByteMatcher {
426    matcher: ByteMatcher,
427}
428
429impl MIMEChecker for TagTerminatedByteMatcher {
430    fn classify(&self, data: &[u8]) -> Option<Mime> {
431        self.matcher.matches(data).and_then(|j| {
432            if j < data.len() && (data[j] == b' ' || data[j] == b'>') {
433                Some(self.matcher.content_type.clone())
434            } else {
435                None
436            }
437        })
438    }
439
440    fn validate(&self) -> Result<(), String> {
441        self.matcher.validate()
442    }
443}
444
/// Matcher for the MP4 signature defined by the MIME Sniffing standard.
pub struct Mp4Matcher;

impl Mp4Matcher {
    /// <https://mimesniff.spec.whatwg.org/#matches-the-signature-for-mp4>
    ///
    /// Returns `true` when `data` starts with an `ftyp` box whose major brand,
    /// or one of whose compatible brands, begins with "mp4".
    ///
    /// Never panics: a box too short to hold any compatible brand (box size
    /// of 16 or less) simply has no compatible brands to inspect.
    pub fn matches(&self, data: &[u8]) -> bool {
        // Step 1. Let sequence be the byte sequence to be matched.
        // Step 2. Let length be the number of bytes in sequence.
        // Step 3. If length is less than 12, return false.
        if data.len() < 12 {
            return false;
        }

        // Step 4. Let box-size be the four bytes from sequence[0] to sequence[3],
        // interpreted as a 32-bit unsigned big-endian integer.
        let raw_box_size = u32::from_be_bytes([data[0], data[1], data[2], data[3]]);
        // A box size that does not fit in `usize` is larger than any slice,
        // so it fails the length check of step 5.
        let Ok(box_size) = usize::try_from(raw_box_size) else {
            return false;
        };

        // Step 5. If length is less than box-size or if box-size modulo 4 is not
        // equal to 0, return false.
        if data.len() < box_size || box_size % 4 != 0 {
            return false;
        }

        // Step 6. If the four bytes from sequence[4] to sequence[7] are not equal
        // to 0x66 0x74 0x79 0x70 ("ftyp"), return false.
        if !data[4..].starts_with(b"ftyp") {
            return false;
        }

        // Step 7. If the three bytes from sequence[8] to sequence[10] are equal
        // to 0x6D 0x70 0x34 ("mp4"), return true.
        const MP4: &[u8] = b"mp4";
        if data[8..].starts_with(MP4) {
            return true;
        }

        // Steps 8-12. Starting at offset 16, look at every 4-byte brand inside the
        // box and return true if one starts with "mp4". `get` yields `None` when
        // box-size is below 16, in which case the loop of step 9 never runs.
        data.get(16..box_size)
            .is_some_and(|brands| brands.chunks(4).any(|brand| brand.starts_with(MP4)))
    }
}
489impl MIMEChecker for Mp4Matcher {
490    fn classify(&self, data: &[u8]) -> Option<Mime> {
491        if self.matches(data) {
492            Some("video/mp4".parse().unwrap())
493        } else {
494            None
495        }
496    }
497
498    fn validate(&self) -> Result<(), String> {
499        Ok(())
500    }
501}
502
503struct BinaryOrPlaintextClassifier;
504
505impl BinaryOrPlaintextClassifier {
506    /// <https://mimesniff.spec.whatwg.org/#rules-for-text-or-binary>
507    fn classify_impl(&self, data: &[u8]) -> Mime {
508        // Step 1. Let length be the number of bytes in the resource header.
509        // Step 2. If length is greater than or equal to 2 and
510        // the first 2 bytes of the resource header are equal to 0xFE 0xFF (UTF-16BE BOM)
511        // or 0xFF 0xFE (UTF-16LE BOM), the computed MIME type is "text/plain".
512        // Step 3. If length is greater than or equal to 3
513        // and the first 3 bytes of the resource header are equal to
514        // 0xEF 0xBB 0xBF (UTF-8 BOM), the computed MIME type is "text/plain".
515        if data.starts_with(&[0xFFu8, 0xFEu8]) ||
516            data.starts_with(&[0xFEu8, 0xFFu8]) ||
517            data.starts_with(&[0xEFu8, 0xBBu8, 0xBFu8])
518        {
519            mime::TEXT_PLAIN
520        } else if data.iter().any(|&x| {
521            x <= 0x08u8 ||
522                x == 0x0Bu8 ||
523                (0x0Eu8..=0x1Au8).contains(&x) ||
524                (0x1Cu8..=0x1Fu8).contains(&x)
525        }) {
526            // Step 5. The computed MIME type is "application/octet-stream".
527            mime::APPLICATION_OCTET_STREAM
528        } else {
529            // Step 4. If the resource header contains no binary data bytes,
530            // the computed MIME type is "text/plain".
531            mime::TEXT_PLAIN
532        }
533    }
534}
535impl MIMEChecker for BinaryOrPlaintextClassifier {
536    fn classify(&self, data: &[u8]) -> Option<Mime> {
537        Some(self.classify_impl(data))
538    }
539
540    fn validate(&self) -> Result<(), String> {
541        Ok(())
542    }
543}
544struct GroupedClassifier {
545    byte_matchers: Vec<Box<dyn MIMEChecker + Send + Sync>>,
546}
547impl GroupedClassifier {
548    fn image_classifer() -> GroupedClassifier {
549        GroupedClassifier {
550            byte_matchers: vec![
551                // Keep this in sync with 'is_supported_mime_type' from
552                // components/style/servo/media_queries.rs
553                Box::new(ByteMatcher::image_x_icon()),
554                Box::new(ByteMatcher::image_x_icon_cursor()),
555                Box::new(ByteMatcher::image_bmp()),
556                Box::new(ByteMatcher::image_gif89a()),
557                Box::new(ByteMatcher::image_gif87a()),
558                Box::new(ByteMatcher::image_webp()),
559                Box::new(ByteMatcher::image_png()),
560                Box::new(ByteMatcher::image_jpeg()),
561            ],
562        }
563    }
564    fn audio_video_classifier() -> GroupedClassifier {
565        GroupedClassifier {
566            byte_matchers: vec![
567                Box::new(ByteMatcher::video_webm()),
568                Box::new(ByteMatcher::audio_basic()),
569                Box::new(ByteMatcher::audio_aiff()),
570                Box::new(ByteMatcher::audio_mpeg()),
571                Box::new(ByteMatcher::application_ogg()),
572                Box::new(ByteMatcher::audio_midi()),
573                Box::new(ByteMatcher::video_avi()),
574                Box::new(ByteMatcher::audio_wave()),
575                Box::new(Mp4Matcher),
576            ],
577        }
578    }
579    fn scriptable_classifier() -> GroupedClassifier {
580        GroupedClassifier {
581            byte_matchers: vec![
582                Box::new(ByteMatcher::text_html_doctype()),
583                Box::new(ByteMatcher::text_html_page()),
584                Box::new(ByteMatcher::text_html_head()),
585                Box::new(ByteMatcher::text_html_script()),
586                Box::new(ByteMatcher::text_html_iframe()),
587                Box::new(ByteMatcher::text_html_h1()),
588                Box::new(ByteMatcher::text_html_div()),
589                Box::new(ByteMatcher::text_html_font()),
590                Box::new(ByteMatcher::text_html_table()),
591                Box::new(ByteMatcher::text_html_a()),
592                Box::new(ByteMatcher::text_html_style()),
593                Box::new(ByteMatcher::text_html_title()),
594                Box::new(ByteMatcher::text_html_b()),
595                Box::new(ByteMatcher::text_html_body()),
596                Box::new(ByteMatcher::text_html_br()),
597                Box::new(ByteMatcher::text_html_p()),
598                Box::new(ByteMatcher::text_html_comment()),
599                Box::new(ByteMatcher::text_xml()),
600                Box::new(ByteMatcher::application_pdf()),
601            ],
602        }
603    }
604    fn plaintext_classifier() -> GroupedClassifier {
605        GroupedClassifier {
606            byte_matchers: vec![
607                Box::new(ByteMatcher::text_plain_utf_8_bom()),
608                Box::new(ByteMatcher::text_plain_utf_16le_bom()),
609                Box::new(ByteMatcher::text_plain_utf_16be_bom()),
610                Box::new(ByteMatcher::application_postscript()),
611            ],
612        }
613    }
614    fn archive_classifier() -> GroupedClassifier {
615        GroupedClassifier {
616            byte_matchers: vec![
617                Box::new(ByteMatcher::application_x_gzip()),
618                Box::new(ByteMatcher::application_zip()),
619                Box::new(ByteMatcher::application_x_rar_compressed()),
620            ],
621        }
622    }
623
624    fn font_classifier() -> GroupedClassifier {
625        GroupedClassifier {
626            byte_matchers: vec![
627                Box::new(ByteMatcher::application_font_woff()),
628                Box::new(ByteMatcher::true_type_collection()),
629                Box::new(ByteMatcher::open_type()),
630                Box::new(ByteMatcher::true_type()),
631                Box::new(ByteMatcher::application_vnd_ms_font_object()),
632            ],
633        }
634    }
635}
636impl MIMEChecker for GroupedClassifier {
637    fn classify(&self, data: &[u8]) -> Option<Mime> {
638        self.byte_matchers
639            .iter()
640            .filter_map(|matcher| matcher.classify(data))
641            .next()
642    }
643
644    fn validate(&self) -> Result<(), String> {
645        for byte_matcher in &self.byte_matchers {
646            byte_matcher.validate()?
647        }
648        Ok(())
649    }
650}
651
652// Contains hard coded byte matchers
653// TODO: These should be configured and not hard coded
654impl ByteMatcher {
655    // A Windows Icon signature
656    fn image_x_icon() -> ByteMatcher {
657        ByteMatcher {
658            pattern: b"\x00\x00\x01\x00",
659            mask: b"\xFF\xFF\xFF\xFF",
660            content_type: "image/x-icon".parse().unwrap(),
661            leading_ignore: &[],
662        }
663    }
664    // A Windows Cursor signature.
665    fn image_x_icon_cursor() -> ByteMatcher {
666        ByteMatcher {
667            pattern: b"\x00\x00\x02\x00",
668            mask: b"\xFF\xFF\xFF\xFF",
669            content_type: "image/x-icon".parse().unwrap(),
670            leading_ignore: &[],
671        }
672    }
673    // The string "BM", a BMP signature.
674    fn image_bmp() -> ByteMatcher {
675        ByteMatcher {
676            pattern: b"BM",
677            mask: b"\xFF\xFF",
678            content_type: mime::IMAGE_BMP,
679            leading_ignore: &[],
680        }
681    }
682    // The string "GIF89a", a GIF signature.
683    fn image_gif89a() -> ByteMatcher {
684        ByteMatcher {
685            pattern: b"GIF89a",
686            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF",
687            content_type: mime::IMAGE_GIF,
688            leading_ignore: &[],
689        }
690    }
691    // The string "GIF87a", a GIF signature.
692    fn image_gif87a() -> ByteMatcher {
693        ByteMatcher {
694            pattern: b"GIF87a",
695            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF",
696            content_type: mime::IMAGE_GIF,
697            leading_ignore: &[],
698        }
699    }
700    // The string "RIFF" followed by four bytes followed by the string "WEBPVP".
701    fn image_webp() -> ByteMatcher {
702        ByteMatcher {
703            pattern: b"RIFF\x00\x00\x00\x00WEBPVP",
704            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
705            content_type: "image/webp".parse().unwrap(),
706            leading_ignore: &[],
707        }
708    }
709    // An error-checking byte followed by the string "PNG" followed by CR LF SUB LF, the PNG
710    // signature.
711    fn image_png() -> ByteMatcher {
712        ByteMatcher {
713            pattern: b"\x89PNG\r\n\x1A\n",
714            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
715            content_type: mime::IMAGE_PNG,
716            leading_ignore: &[],
717        }
718    }
719    // The JPEG Start of Image marker followed by the indicator byte of another marker.
720    fn image_jpeg() -> ByteMatcher {
721        ByteMatcher {
722            pattern: b"\xFF\xD8\xFF",
723            mask: b"\xFF\xFF\xFF",
724            content_type: mime::IMAGE_JPEG,
725            leading_ignore: &[],
726        }
727    }
728    // The WebM signature. [TODO: Use more bytes?]
729    fn video_webm() -> ByteMatcher {
730        ByteMatcher {
731            pattern: b"\x1A\x45\xDF\xA3",
732            mask: b"\xFF\xFF\xFF\xFF",
733            content_type: "video/webm".parse().unwrap(),
734            leading_ignore: &[],
735        }
736    }
737    // The string ".snd", the basic audio signature.
738    fn audio_basic() -> ByteMatcher {
739        ByteMatcher {
740            pattern: b".snd",
741            mask: b"\xFF\xFF\xFF\xFF",
742            content_type: "audio/basic".parse().unwrap(),
743            leading_ignore: &[],
744        }
745    }
746    // The string "FORM" followed by four bytes followed by the string "AIFF", the AIFF signature.
747    fn audio_aiff() -> ByteMatcher {
748        ByteMatcher {
749            pattern: b"FORM\x00\x00\x00\x00AIFF",
750            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
751            content_type: "audio/aiff".parse().unwrap(),
752            leading_ignore: &[],
753        }
754    }
755    // The string "ID3", the ID3v2-tagged MP3 signature.
756    fn audio_mpeg() -> ByteMatcher {
757        ByteMatcher {
758            pattern: b"ID3",
759            mask: b"\xFF\xFF\xFF",
760            content_type: "audio/mpeg".parse().unwrap(),
761            leading_ignore: &[],
762        }
763    }
764    // The string "OggS" followed by NUL, the Ogg container signature.
765    fn application_ogg() -> ByteMatcher {
766        ByteMatcher {
767            pattern: b"OggS\x00",
768            mask: b"\xFF\xFF\xFF\xFF\xFF",
769            content_type: "application/ogg".parse().unwrap(),
770            leading_ignore: &[],
771        }
772    }
773    // The string "MThd" followed by four bytes representing the number 6 in 32 bits (big-endian),
774    // the MIDI signature.
775    fn audio_midi() -> ByteMatcher {
776        ByteMatcher {
777            pattern: b"MThd\x00\x00\x00\x06",
778            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
779            content_type: "audio/midi".parse().unwrap(),
780            leading_ignore: &[],
781        }
782    }
783    // The string "RIFF" followed by four bytes followed by the string "AVI ", the AVI signature.
784    fn video_avi() -> ByteMatcher {
785        ByteMatcher {
786            pattern: b"RIFF\x00\x00\x00\x00AVI ",
787            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
788            content_type: "video/avi".parse().unwrap(),
789            leading_ignore: &[],
790        }
791    }
792    // The string "RIFF" followed by four bytes followed by the string "WAVE", the WAVE signature.
793    fn audio_wave() -> ByteMatcher {
794        ByteMatcher {
795            pattern: b"RIFF\x00\x00\x00\x00WAVE",
796            mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
797            content_type: "audio/wave".parse().unwrap(),
798            leading_ignore: &[],
799        }
800    }
801    // doctype terminated with Tag terminating (TT) Byte
802    fn text_html_doctype() -> TagTerminatedByteMatcher {
803        TagTerminatedByteMatcher {
804            matcher: ByteMatcher {
805                pattern: b"<!DOCTYPE HTML",
806                mask: b"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
807                content_type: mime::TEXT_HTML,
808                leading_ignore: b"\t\n\x0C\r ",
809            },
810        }
811    }
812
813    // HTML terminated with Tag terminating (TT) Byte: 0x20 (SP)
814    fn text_html_page() -> TagTerminatedByteMatcher {
815        TagTerminatedByteMatcher {
816            matcher: ByteMatcher {
817                pattern: b"<HTML",
818                mask: b"\xFF\xDF\xDF\xDF\xDF",
819                content_type: mime::TEXT_HTML,
820                leading_ignore: b"\t\n\x0C\r ",
821            },
822        }
823    }
824
825    // head terminated with Tag Terminating (TT) Byte
826    fn text_html_head() -> TagTerminatedByteMatcher {
827        TagTerminatedByteMatcher {
828            matcher: ByteMatcher {
829                pattern: b"<HEAD",
830                mask: b"\xFF\xDF\xDF\xDF\xDF",
831                content_type: mime::TEXT_HTML,
832                leading_ignore: b"\t\n\x0C\r ",
833            },
834        }
835    }
836
837    // script terminated with Tag Terminating (TT) Byte
838    fn text_html_script() -> TagTerminatedByteMatcher {
839        TagTerminatedByteMatcher {
840            matcher: ByteMatcher {
841                pattern: b"<SCRIPT",
842                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
843                content_type: mime::TEXT_HTML,
844                leading_ignore: b"\t\n\x0C\r ",
845            },
846        }
847    }
848
849    // iframe terminated with Tag Terminating (TT) Byte
850    fn text_html_iframe() -> TagTerminatedByteMatcher {
851        TagTerminatedByteMatcher {
852            matcher: ByteMatcher {
853                pattern: b"<IFRAME",
854                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
855                content_type: mime::TEXT_HTML,
856                leading_ignore: b"\t\n\x0C\r ",
857            },
858        }
859    }
860
861    // h1 terminated with Tag Terminating (TT) Byte
862    fn text_html_h1() -> TagTerminatedByteMatcher {
863        TagTerminatedByteMatcher {
864            matcher: ByteMatcher {
865                pattern: b"<H1",
866                mask: b"\xFF\xDF\xFF",
867                content_type: mime::TEXT_HTML,
868                leading_ignore: b"\t\n\x0C\r ",
869            },
870        }
871    }
872
873    // div terminated with Tag Terminating (TT) Byte
874    fn text_html_div() -> TagTerminatedByteMatcher {
875        TagTerminatedByteMatcher {
876            matcher: ByteMatcher {
877                pattern: b"<DIV",
878                mask: b"\xFF\xDF\xDF\xDF",
879                content_type: mime::TEXT_HTML,
880                leading_ignore: b"\t\n\x0C\r ",
881            },
882        }
883    }
884
885    // font terminated with Tag Terminating (TT) Byte
886    fn text_html_font() -> TagTerminatedByteMatcher {
887        TagTerminatedByteMatcher {
888            matcher: ByteMatcher {
889                pattern: b"<FONT",
890                mask: b"\xFF\xDF\xDF\xDF\xDF",
891                content_type: mime::TEXT_HTML,
892                leading_ignore: b"\t\n\x0C\r ",
893            },
894        }
895    }
896
897    // table terminated with Tag Terminating (TT) Byte
898    fn text_html_table() -> TagTerminatedByteMatcher {
899        TagTerminatedByteMatcher {
900            matcher: ByteMatcher {
901                pattern: b"<TABLE",
902                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF",
903                content_type: mime::TEXT_HTML,
904                leading_ignore: b"\t\n\x0C\r ",
905            },
906        }
907    }
908
909    // a terminated with Tag Terminating (TT) Byte
910    fn text_html_a() -> TagTerminatedByteMatcher {
911        TagTerminatedByteMatcher {
912            matcher: ByteMatcher {
913                pattern: b"<A",
914                mask: b"\xFF\xDF",
915                content_type: mime::TEXT_HTML,
916                leading_ignore: b"\t\n\x0C\r ",
917            },
918        }
919    }
920
921    // style terminated with Tag Terminating (TT) Byte
922    fn text_html_style() -> TagTerminatedByteMatcher {
923        TagTerminatedByteMatcher {
924            matcher: ByteMatcher {
925                pattern: b"<STYLE",
926                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF",
927                content_type: mime::TEXT_HTML,
928                leading_ignore: b"\t\n\x0C\r ",
929            },
930        }
931    }
932
933    // title terminated with Tag Terminating (TT) Byte
934    fn text_html_title() -> TagTerminatedByteMatcher {
935        TagTerminatedByteMatcher {
936            matcher: ByteMatcher {
937                pattern: b"<TITLE",
938                mask: b"\xFF\xDF\xDF\xDF\xDF\xDF",
939                content_type: mime::TEXT_HTML,
940                leading_ignore: b"\t\n\x0C\r ",
941            },
942        }
943    }
944
945    // b terminated with Tag Terminating (TT) Byte
946    fn text_html_b() -> TagTerminatedByteMatcher {
947        TagTerminatedByteMatcher {
948            matcher: ByteMatcher {
949                pattern: b"<B",
950                mask: b"\xFF\xDF",
951                content_type: mime::TEXT_HTML,
952                leading_ignore: b"\t\n\x0C\r ",
953            },
954        }
955    }
956
957    // body terminated with Tag Terminating (TT) Byte
958    fn text_html_body() -> TagTerminatedByteMatcher {
959        TagTerminatedByteMatcher {
960            matcher: ByteMatcher {
961                pattern: b"<BODY",
962                mask: b"\xFF\xDF\xDF\xDF\xDF",
963                content_type: mime::TEXT_HTML,
964                leading_ignore: b"\t\n\x0C\r ",
965            },
966        }
967    }
968
969    // br terminated with Tag Terminating (TT) Byte
970    fn text_html_br() -> TagTerminatedByteMatcher {
971        TagTerminatedByteMatcher {
972            matcher: ByteMatcher {
973                pattern: b"<BR",
974                mask: b"\xFF\xDF\xDF",
975                content_type: mime::TEXT_HTML,
976                leading_ignore: b"\t\n\x0C\r ",
977            },
978        }
979    }
980
981    // p terminated with Tag Terminating (TT) Byte
982    fn text_html_p() -> TagTerminatedByteMatcher {
983        TagTerminatedByteMatcher {
984            matcher: ByteMatcher {
985                pattern: b"<P",
986                mask: b"\xFF\xDF",
987                content_type: mime::TEXT_HTML,
988                leading_ignore: b"\t\n\x0C\r ",
989            },
990        }
991    }
992
993    // comment terminated with Tag Terminating (TT) Byte
994    fn text_html_comment() -> TagTerminatedByteMatcher {
995        TagTerminatedByteMatcher {
996            matcher: ByteMatcher {
997                pattern: b"<!--",
998                mask: b"\xFF\xFF\xFF\xFF",
999                content_type: mime::TEXT_HTML,
1000                leading_ignore: b"\t\n\x0C\r ",
1001            },
1002        }
1003    }
1004
1005    // The string "<?xml".
1006    fn text_xml() -> ByteMatcher {
1007        ByteMatcher {
1008            pattern: b"<?xml",
1009            mask: b"\xFF\xFF\xFF\xFF\xFF",
1010            content_type: mime::TEXT_XML,
1011            leading_ignore: b"\t\n\x0C\r ",
1012        }
1013    }
1014    // The string "%PDF-", the PDF signature.
1015    fn application_pdf() -> ByteMatcher {
1016        ByteMatcher {
1017            pattern: b"%PDF-",
1018            mask: b"\xFF\xFF\xFF\xFF\xFF",
1019            content_type: mime::APPLICATION_PDF,
1020            leading_ignore: &[],
1021        }
1022    }
1023    // 34 bytes followed by the string "LP", the Embedded OpenType signature.
1024    fn application_vnd_ms_font_object() -> ByteMatcher {
1025        ByteMatcher {
1026            pattern: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1027                       \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1028                       \x00\x00LP",
1029            mask: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1030                    \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
1031                    \x00\x00\xFF\xFF",
1032            content_type: "application/vnd.ms-fontobject".parse().unwrap(),
1033            leading_ignore: &[],
1034        }
1035    }
1036    // 4 bytes representing the version number 1.0, a TrueType signature.
1037    fn true_type() -> ByteMatcher {
1038        ByteMatcher {
1039            pattern: b"\x00\x01\x00\x00",
1040            mask: b"\xFF\xFF\xFF\xFF",
1041            content_type: "application/font-sfnt".parse().unwrap(),
1042            leading_ignore: &[],
1043        }
1044    }
1045    // The string "OTTO", the OpenType signature.
1046    fn open_type() -> ByteMatcher {
1047        ByteMatcher {
1048            pattern: b"OTTO",
1049            mask: b"\xFF\xFF\xFF\xFF",
1050            content_type: "application/font-sfnt".parse().unwrap(),
1051            leading_ignore: &[],
1052        }
1053    }
1054    // The string "ttcf", the TrueType Collection signature.
1055    fn true_type_collection() -> ByteMatcher {
1056        ByteMatcher {
1057            pattern: b"ttcf",
1058            mask: b"\xFF\xFF\xFF\xFF",
1059            content_type: "application/font-sfnt".parse().unwrap(),
1060            leading_ignore: &[],
1061        }
1062    }
1063    // The string "wOFF", the Web Open Font Format signature.
1064    fn application_font_woff() -> ByteMatcher {
1065        ByteMatcher {
1066            pattern: b"wOFF",
1067            mask: b"\xFF\xFF\xFF\xFF",
1068            content_type: "application/font-woff".parse().unwrap(),
1069            leading_ignore: &[],
1070        }
1071    }
1072    // The GZIP archive signature.
1073    fn application_x_gzip() -> ByteMatcher {
1074        ByteMatcher {
1075            pattern: b"\x1F\x8B\x08",
1076            mask: b"\xFF\xFF\xFF",
1077            content_type: "application/x-gzip".parse().unwrap(),
1078            leading_ignore: &[],
1079        }
1080    }
1081    // The string "PK" followed by ETX EOT, the ZIP archive signature.
1082    fn application_zip() -> ByteMatcher {
1083        ByteMatcher {
1084            pattern: b"PK\x03\x04",
1085            mask: b"\xFF\xFF\xFF\xFF",
1086            content_type: "application/zip".parse().unwrap(),
1087            leading_ignore: &[],
1088        }
1089    }
1090    // The string "Rar " followed by SUB BEL NUL, the RAR archive signature.
1091    fn application_x_rar_compressed() -> ByteMatcher {
1092        ByteMatcher {
1093            pattern: b"Rar \x1A\x07\x00",
1094            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
1095            content_type: "application/x-rar-compressed".parse().unwrap(),
1096            leading_ignore: &[],
1097        }
1098    }
1099    // The string "%!PS-Adobe-", the PostScript signature.
1100    fn application_postscript() -> ByteMatcher {
1101        ByteMatcher {
1102            pattern: b"%!PS-Adobe-",
1103            mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
1104            content_type: "application/postscript".parse().unwrap(),
1105            leading_ignore: &[],
1106        }
1107    }
1108    // UTF-16BE BOM
1109    fn text_plain_utf_16be_bom() -> ByteMatcher {
1110        ByteMatcher {
1111            pattern: b"\xFE\xFF\x00\x00",
1112            mask: b"\xFF\xFF\x00\x00",
1113            content_type: mime::TEXT_PLAIN,
1114            leading_ignore: &[],
1115        }
1116    }
1117    // UTF-16LE BOM
1118    fn text_plain_utf_16le_bom() -> ByteMatcher {
1119        ByteMatcher {
1120            pattern: b"\xFF\xFE\x00\x00",
1121            mask: b"\xFF\xFF\x00\x00",
1122            content_type: mime::TEXT_PLAIN,
1123            leading_ignore: &[],
1124        }
1125    }
1126    // UTF-8 BOM
1127    fn text_plain_utf_8_bom() -> ByteMatcher {
1128        ByteMatcher {
1129            pattern: b"\xEF\xBB\xBF\x00",
1130            mask: b"\xFF\xFF\xFF\x00",
1131            content_type: mime::TEXT_PLAIN,
1132            leading_ignore: &[],
1133        }
1134    }
1135}