html5ever/tokenizer/char_ref/
mod.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::{TokenSink, Tokenizer};
11use crate::buffer_queue::BufferQueue;
12use crate::data;
13use crate::tendril::StrTendril;
14
15use log::debug;
16use std::borrow::Cow::{self, Borrowed};
17use std::char::from_u32;
18
//§ tokenizing-character-references
/// The fully-resolved result of parsing a character reference: up to two
/// characters (some named references expand to a two-code-point sequence).
pub(super) struct CharRef {
    /// The resulting character(s)
    pub(super) chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub(super) num_chars: u8,
}
27
/// Outcome of a single `CharRefTokenizer::step` call.
pub(super) enum Status {
    /// No input is available; more is needed before progress can be made.
    Stuck,
    /// The machine advanced (consumed input or changed state); step again.
    Progress,
    /// Parsing finished; the `CharRef` holds the result (possibly empty).
    Done(CharRef),
}
33
/// States of the character-reference state machine. These roughly mirror
/// the character-reference states of the WHATWG HTML tokenization spec.
#[derive(Debug)]
enum State {
    /// Just after the initial `&`.
    Begin,
    /// Just after `&#`.
    Octothorpe,
    /// Consuming digits of a numeric reference; payload is the base (10 or 16).
    Numeric(u32), // base
    /// Digits consumed; expecting the terminating `;`.
    NumericSemicolon,
    /// Consuming characters of a named reference (e.g. `&amp;`).
    Named,
    /// A name that cannot match any entity; keep consuming alphanumerics
    /// only to decide whether to emit a parse error at a trailing `;`.
    BogusName,
}
43
/// Sub-tokenizer that parses a single character reference (the part after
/// `&`), driven by repeated calls to `step` until it returns `Done`.
pub(super) struct CharRefTokenizer {
    /// Current state of the state machine.
    state: State,
    /// True when the reference occurs inside an attribute value; this changes
    /// the legacy handling of semicolon-less named references (see `finish_named`).
    is_consumed_in_attribute: bool,

    /// Accumulated value of a numeric reference (wrapping; see `num_too_big`).
    num: u32,
    /// Set once `num` exceeds the Unicode range, since it may then wrap.
    num_too_big: bool,
    /// Whether at least one digit has been consumed.
    seen_digit: bool,
    /// The `x`/`X` marker of a hex reference, kept so it can be unconsumed.
    hex_marker: Option<char>,

    /// Characters consumed so far for a named (or bogus) reference.
    name_buf_opt: Option<StrTendril>,
    /// Longest full entity match so far: the (up to two) expansion code points.
    name_match: Option<(u32, u32)>,
    /// Length of `name_buf` at the time of the longest match.
    name_len: usize,
}
57
impl CharRef {
    /// A character reference that expands to nothing, used when the input
    /// turns out not to be (or not to complete as) a character reference.
    const EMPTY: CharRef = CharRef {
        chars: ['\0', '\0'],
        num_chars: 0,
    };
}
64
65impl CharRefTokenizer {
66    pub(super) fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
67        CharRefTokenizer {
68            is_consumed_in_attribute,
69            state: State::Begin,
70            num: 0,
71            num_too_big: false,
72            seen_digit: false,
73            hex_marker: None,
74            name_buf_opt: None,
75            name_match: None,
76            name_len: 0,
77        }
78    }
79
80    fn name_buf(&self) -> &StrTendril {
81        self.name_buf_opt
82            .as_ref()
83            .expect("name_buf missing in named character reference")
84    }
85
86    fn name_buf_mut(&mut self) -> &mut StrTendril {
87        self.name_buf_opt
88            .as_mut()
89            .expect("name_buf missing in named character reference")
90    }
91
92    fn finish_one(&mut self, c: char) -> Status {
93        Status::Done(CharRef {
94            chars: [c, '\0'],
95            num_chars: 1,
96        })
97    }
98}
99
impl CharRefTokenizer {
    /// Advance the state machine by one step, dispatching on the current
    /// state. Returns `Stuck` when more input is needed, `Progress` when
    /// work was done, and `Done` once the reference is fully resolved.
    pub(super) fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            State::Begin => self.do_begin(tokenizer, input),
            State::Octothorpe => self.do_octothorpe(tokenizer, input),
            State::Numeric(base) => self.do_numeric(tokenizer, input, base),
            State::NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            State::Named => self.do_named(tokenizer, input),
            State::BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    /// First character after `&`: an ASCII alphanumeric starts a named
    /// reference, `#` starts a numeric one, anything else means the `&`
    /// was not a character reference at all (empty result, nothing consumed).
    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some('a'..='z' | 'A'..='Z' | '0'..='9') => {
                // Deliberately not discarded here: the first name character
                // is consumed and buffered by do_named on the next step.
                self.state = State::Named;
                self.name_buf_opt = Some(StrTendril::new());
                Status::Progress
            },
            Some('#') => {
                tokenizer.discard_char(input);
                self.state = State::Octothorpe;
                Status::Progress
            },
            Some(_) => Status::Done(CharRef::EMPTY),
            None => Status::Stuck,
        }
    }

    /// After `&#`: an `x`/`X` selects hexadecimal (marker remembered so it
    /// can be unconsumed later), anything else falls through to decimal.
    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(c @ ('x' | 'X')) => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = State::Numeric(16);
            },
            Some(_) => {
                self.hex_marker = None;
                self.state = State::Numeric(10);
            },
            None => return Status::Stuck,
        }
        Status::Progress
    }

    /// Accumulate digits of a numeric reference in the given base.
    /// Uses wrapping arithmetic; once the value exceeds U+10FFFF it is
    /// already invalid, so only `num_too_big` matters from then on.
    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        base: u32,
    ) -> Status {
        let Some(c) = tokenizer.peek(input) else {
            return Status::Stuck;
        };
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Status::Progress
            },

            // Non-digit with no digits at all: `&#` (or `&#x`) was bogus;
            // push the consumed prefix back and report.
            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            // Non-digit after at least one digit: look for the semicolon.
            None => {
                self.state = State::NumericSemicolon;
                Status::Progress
            },
        }
    }

    /// Consume the terminating `;` of a numeric reference if present;
    /// a missing semicolon is a parse error but the reference still resolves.
    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(';') => tokenizer.discard_char(input),
            Some(_) => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
            None => return Status::Stuck,
        };
        self.finish_numeric(tokenizer)
    }

    /// Push the consumed `#` (and hex marker, if any) back onto the input
    /// and finish empty: `&#` with no digits is not a character reference.
    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c)
        }

        input.push_front(unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        Status::Done(CharRef::EMPTY)
    }

    /// Map the accumulated numeric value to a character, applying the
    /// spec-mandated replacements and emitting a parse error for values
    /// that are out of range, surrogates, controls, or noncharacters.
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of Unicode range (or wrapped): replacement character.
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and surrogate code points: replacement character.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 control range: remap via the windows-1252 table where
            // a replacement exists; always a parse error.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // C0 controls (except tab/LF/FF), DEL, and the U+FDD0..U+FDEF
            // noncharacters: emitted as-is, but still a parse error.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // Noncharacters U+xFFFE / U+xFFFF in every plane.
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = if tokenizer.opts.exact_errors {
                Cow::from(format!(
                    "Invalid numeric character reference value 0x{:06X}",
                    self.num
                ))
            } else {
                Cow::from("Invalid numeric character reference")
            };
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    /// Consume one character of a named reference and extend the longest
    /// match. The entity table maps prefixes too: a value of (0, 0) means
    /// "valid prefix only", a nonzero first code point means "full match".
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let Some(c) = tokenizer.peek(input) else {
            return Status::Stuck;
        };
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Status::Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, input, Some(c)),
        }
    }

    /// Emit the "invalid character reference" parse error, including the
    /// buffered name when exact errors are requested.
    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) {
        let msg = if tokenizer.opts.exact_errors {
            Cow::from(format!("Invalid character reference &{}", self.name_buf()))
        } else {
            Cow::from("Invalid character reference")
        };
        tokenizer.emit_error(msg);
    }

    /// Push the entire buffered name back onto the input (taking the buffer).
    fn unconsume_name(&mut self, input: &BufferQueue) {
        input.push_front(self.name_buf_opt.take().unwrap());
    }

    /// Resolve a named reference once the match can't be extended (or at
    /// EOF, with `end_char == None`): either emit the matched expansion
    /// (unconsuming any overrun), or unconsume everything and finish empty.
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        end_char: Option<char>,
    ) -> Status {
        match self.name_match {
            // No entity matched at any point.
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = State::BogusName;
                        return Status::Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(input);
                Status::Done(CharRef::EMPTY)
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf.  Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // If the character reference was consumed as part of an attribute, and the last
                // character matched is not a U+003B SEMICOLON character (;), and the next input
                // character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
                // then, for historical reasons, flush code points consumed as a character
                // reference and switch to the return state.

                let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after)
                {
                    (_, ';', _) => false,
                    (true, _, Some('=')) => true,
                    (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        // 1. If the last character matched is not a U+003B SEMICOLON character
                        //    (;), then this is a missing-semicolon-after-character-reference parse
                        //    error.
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(input);
                    Status::Done(CharRef::EMPTY)
                } else {
                    // Push back only the overrun past the matched name.
                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
                    // NOTE(review): presumably this prevents the tokenizer's
                    // CR/LF normalization from skipping a just-pushed-back
                    // character — confirm against Tokenizer's ignore_lf use.
                    tokenizer.ignore_lf.set(false);
                    Status::Done(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    })
                }
            },
        }
    }

    /// Continue past a non-matching name: consume alphanumerics, emit a
    /// parse error if the run ends with `;`, then unconsume everything.
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let Some(c) = tokenizer.peek(input) else {
            return Status::Stuck;
        };
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Status::Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(input);
        Status::Done(CharRef::EMPTY)
    }

    /// Drive the state machine to completion at end of input, emitting the
    /// appropriate EOF parse errors and unconsuming any partial input.
    /// Loops because a single state (e.g. Named) may need several steps.
    pub(super) fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> CharRef {
        loop {
            let status = match self.state {
                State::Begin => Status::Done(CharRef::EMPTY),
                State::Numeric(_) if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
                State::Numeric(_) | State::NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer)
                },
                State::Named => self.finish_named(tokenizer, input, None),
                State::BogusName => {
                    self.unconsume_name(input);
                    Status::Done(CharRef::EMPTY)
                },
                State::Octothorpe => {
                    input.push_front(StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    Status::Done(CharRef::EMPTY)
                },
            };

            match status {
                Status::Done(char_ref) => {
                    return char_ref;
                },
                Status::Stuck => {
                    return CharRef::EMPTY;
                },
                Status::Progress => {},
            }
        }
    }
}