html5ever/tokenizer/char_ref/mod.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::{TokenSink, Tokenizer};
11use crate::buffer_queue::BufferQueue;
12use crate::data;
13use crate::tendril::StrTendril;
14
15use log::debug;
16use std::borrow::Cow::{self, Borrowed};
17use std::char::from_u32;
18
19use self::State::*;
20pub(super) use self::Status::*;
21
//§ tokenizing-character-references
/// The decoded result of one character reference: zero, one, or two characters.
pub(super) struct CharRef {
    /// The resulting character(s)
    pub(super) chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub(super) num_chars: u8,
}
30
/// Outcome of a single `step` of the character-reference sub-tokenizer.
pub(super) enum Status {
    /// Not enough input is available; supply more before stepping again.
    Stuck,
    /// Made progress; keep stepping.
    Progress,
    /// Finished; the result can be taken with `get_result`.
    Done,
}
36
/// Internal states of the character-reference state machine.
#[derive(Debug)]
enum State {
    /// Just after the initial `&`; decide between named and numeric forms.
    Begin,
    /// After `&#`; decide between decimal and hexadecimal.
    Octothorpe,
    /// Accumulating digits in the given base (10 or 16).
    Numeric(u32), // base
    /// Digits consumed; expecting the terminating `;`.
    NumericSemicolon,
    /// Matching characters against the named-entity table.
    Named,
    /// Alphanumeric run that can no longer match any entity; consumed only
    /// to decide whether a `;` terminator warrants a parse error.
    BogusName,
}
46
/// Sub-tokenizer that consumes a single character reference (`&...`).
pub(super) struct CharRefTokenizer {
    /// Current state of the state machine.
    state: State,
    /// Set once tokenization is complete; `step` returns `Done` thereafter.
    result: Option<CharRef>,
    /// Whether the `&` appeared inside an attribute value (affects the
    /// historical "don't expand" rule for named references).
    is_consumed_in_attribute: bool,

    /// Accumulated value for a numeric reference.
    num: u32,
    /// Set once `num` exceeded 0x10FFFF; digits are still consumed but the
    /// value is known invalid.
    num_too_big: bool,
    /// True once at least one digit has been consumed.
    seen_digit: bool,
    /// The `x`/`X` consumed after `#`, if any; kept so it can be un-consumed.
    hex_marker: Option<char>,

    /// Characters consumed so far while matching a named reference.
    name_buf_opt: Option<StrTendril>,
    /// Longest complete entity match found so far (one or two code points).
    name_match: Option<(u32, u32)>,
    /// Length of `name_buf` at the time of the longest complete match.
    name_len: usize,
}
61
62impl CharRefTokenizer {
63    pub(super) fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
64        CharRefTokenizer {
65            is_consumed_in_attribute,
66            state: Begin,
67            result: None,
68            num: 0,
69            num_too_big: false,
70            seen_digit: false,
71            hex_marker: None,
72            name_buf_opt: None,
73            name_match: None,
74            name_len: 0,
75        }
76    }
77
78    // A CharRefTokenizer can only tokenize one character reference,
79    // so this method consumes the tokenizer.
80    pub(super) fn get_result(self) -> CharRef {
81        self.result.expect("get_result called before done")
82    }
83
84    fn name_buf(&self) -> &StrTendril {
85        self.name_buf_opt
86            .as_ref()
87            .expect("name_buf missing in named character reference")
88    }
89
90    fn name_buf_mut(&mut self) -> &mut StrTendril {
91        self.name_buf_opt
92            .as_mut()
93            .expect("name_buf missing in named character reference")
94    }
95
96    fn finish_none(&mut self) -> Status {
97        self.result = Some(CharRef {
98            chars: ['\0', '\0'],
99            num_chars: 0,
100        });
101        Done
102    }
103
104    fn finish_one(&mut self, c: char) -> Status {
105        self.result = Some(CharRef {
106            chars: [c, '\0'],
107            num_chars: 1,
108        });
109        Done
110    }
111}
112
impl CharRefTokenizer {
    /// Advance the state machine by (at most) one input character.
    ///
    /// Returns `Done` once a result is available, `Stuck` when more input is
    /// needed, and `Progress` otherwise.
    pub(super) fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // Once a result exists we are finished; further steps are no-ops.
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, input, base),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    /// Handle the character immediately after `&`: an ASCII alphanumeric
    /// starts a named reference, `#` starts a numeric one, and anything else
    /// means there is no character reference here at all.
    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some('a'..='z' | 'A'..='Z' | '0'..='9') => {
                // Peeked only — the character is consumed (and buffered)
                // by do_named on the next step.
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },
            Some('#') => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },
            Some(_) => self.finish_none(),
            None => Stuck,
        }
    }

    /// Handle the character after `&#`: `x`/`X` selects hexadecimal,
    /// otherwise we parse decimal. The hex marker is saved so it can be
    /// pushed back if no digits follow.
    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(c @ ('x' | 'X')) => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },
            Some(_) => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
            None => return Stuck,
        }
        Progress
    }

    /// Accumulate digits of a numeric reference in `base`.
    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        base: u32,
    ) -> Status {
        let Some(c) = tokenizer.peek(input) else {
            return Stuck;
        };
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                // Wrapping arithmetic is safe here: before num_too_big is
                // set, num <= 0x10FFFF so num * 16 cannot overflow u32; once
                // the flag is set the (possibly wrapped) value is unused.
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            // `&#` (or `&#x`) with no digits at all: push the characters
            // back and emit no reference.
            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    /// After the digits: consume a terminating `;` if present, otherwise
    /// report the missing semicolon; either way finish the numeric value.
    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(';') => tokenizer.discard_char(input),
            Some(_) => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
            None => return Stuck,
        };
        self.finish_numeric(tokenizer)
    }

    /// Push `#` (and the hex marker, if any) back onto the input, since a
    /// numeric reference with no digits must be treated as literal text.
    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c)
        }

        input.push_front(unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    /// Map the accumulated numeric value to a character, applying the spec's
    /// replacement rules and emitting a parse error for invalid values.
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of Unicode range (including values that wrapped).
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and surrogate code points become U+FFFD.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 control range: remap via the table where an entry exists,
            // otherwise keep the code point; either way it is an error.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Other control characters and the U+FDD0..U+FDEF noncharacters:
            // kept as-is, but flagged as errors.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // Noncharacters of the form U+xFFFE / U+xFFFF in any plane.
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = if tokenizer.opts.exact_errors {
                Cow::from(format!(
                    "Invalid numeric character reference value 0x{:06X}",
                    self.num
                ))
            } else {
                Cow::from("Invalid numeric character reference")
            };
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    /// Consume one character of a named reference, extending the longest
    /// match found so far, and finish when no entity can match any longer.
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let Some(c) = tokenizer.peek(input) else {
            return Stuck;
        };
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, input, Some(c)),
        }
    }

    /// Emit the "invalid character reference" parse error, including the
    /// offending name when exact errors are requested.
    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) {
        let msg = if tokenizer.opts.exact_errors {
            Cow::from(format!("Invalid character reference &{}", self.name_buf()))
        } else {
            Cow::from("Invalid character reference")
        };
        tokenizer.emit_error(msg);
    }

    /// Push the entire name buffer back onto the input (taking ownership of
    /// the buffer, so it cannot be reused afterwards).
    fn unconsume_name(&mut self, input: &BufferQueue) {
        input.push_front(self.name_buf_opt.take().unwrap());
    }

    /// Resolve a named reference once matching has stopped. `end_char` is
    /// the character that ended matching, or `None` at end of input.
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        end_char: Option<char>,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                // No entity matched at all: restore the consumed text.
                self.unconsume_name(input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf.  Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // If the character reference was consumed as part of an attribute, and the last
                // character matched is not a U+003B SEMICOLON character (;), and the next input
                // character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
                // then, for historical reasons, flush code points consumed as a character
                // reference and switch to the return state.

                let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after)
                {
                    (_, ';', _) => false,
                    (true, _, Some('=')) => true,
                    (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        // 1. If the last character matched is not a U+003B SEMICOLON character
                        //    (;), then this is a missing-semicolon-after-character-reference parse
                        //    error.
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(input);
                    self.finish_none()
                } else {
                    // Push back only the characters consumed past the match.
                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
                    tokenizer.ignore_lf.set(false);
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        // A zero second code point means a one-character entity.
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

    /// Consume the rest of an alphanumeric run that cannot match any entity;
    /// a terminating `;` makes it a parse error, then everything is restored.
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let Some(c) = tokenizer.peek(input) else {
            return Stuck;
        };
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(input);
        self.finish_none()
    }

    /// Force the state machine to a result at end of input, emitting the
    /// appropriate EOF parse errors and restoring un-consumable text.
    pub(super) fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) {
        // Loop because some states transition (e.g. Named -> BogusName)
        // before a result is produced; `drop` discards the Status.
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, input, None)),

                BogusName => {
                    self.unconsume_name(input);
                    self.finish_none();
                },

                Octothorpe => {
                    input.push_front(StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}
451}