// xml5ever/tokenizer/char_ref/mod.rs

// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

10use super::{TokenSink, XmlTokenizer};
11use crate::data;
12use crate::tendril::StrTendril;
13use log::debug;
14use markup5ever::buffer_queue::BufferQueue;
15use std::borrow::Cow::{self, Borrowed};
16use std::char::from_u32;
17
18use self::State::*;
19pub use self::Status::*;
20
//§ tokenizing-character-references
/// The result of tokenizing one character reference: up to two characters.
/// (Some named references expand to a pair of code points, hence the array.)
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}
29
/// Outcome of a single `step` of the character-reference sub-tokenizer.
pub enum Status {
    /// Cannot make progress without more input.
    Stuck,
    /// Consumed some input; call `step` again.
    Progress,
    /// Finished; retrieve the result with `get_result`.
    Done,
}
35
/// States of the character-reference state machine.
#[derive(Debug)]
enum State {
    /// Just after the `&`.
    Begin,
    /// Just after `&#`.
    Octothorpe,
    /// Consuming digits; the payload is the numeric base (10 or 16).
    Numeric(u32), // base
    /// Digits consumed; expecting the terminating `;`.
    NumericSemicolon,
    /// Consuming a named entity reference.
    Named,
    /// Consuming an unrecognized name, only to decide how to report it.
    BogusName,
}
45
/// Sub-tokenizer that parses a single character reference (the text after
/// `&`) out of the main tokenizer's input stream.
pub struct CharRefTokenizer {
    /// Current state of the state machine.
    state: State,
    /// Extra character that ends the reference; `Some` iff tokenizing
    /// inside an attribute value (see `new`).
    addnl_allowed: Option<char>,
    /// Final result once computed; `Some` means tokenization is done.
    result: Option<CharRef>,

    /// Accumulated value of a numeric reference.
    num: u32,
    /// Set once `num` exceeds 0x10FFFF, so further digits can be consumed
    /// without trusting the (possibly wrapped) value.
    num_too_big: bool,
    /// Whether at least one digit has been consumed.
    seen_digit: bool,
    /// The `x`/`X` consumed for a hex reference, kept so it can be
    /// unconsumed if no digits follow.
    hex_marker: Option<char>,

    /// Characters consumed so far for a named reference.
    name_buf_opt: Option<StrTendril>,
    /// Longest complete entity match so far, as two code points
    /// (second is 0 when the entity expands to one character).
    name_match: Option<(u32, u32)>,
    /// Length of `name_buf` at the time of that match.
    name_len: usize,
}
60
61impl CharRefTokenizer {
62    // NB: We assume that we have an additional allowed character iff we're
63    // tokenizing in an attribute value.
64    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
65        CharRefTokenizer {
66            state: Begin,
67            addnl_allowed,
68            result: None,
69            num: 0,
70            num_too_big: false,
71            seen_digit: false,
72            hex_marker: None,
73            name_buf_opt: None,
74            name_match: None,
75            name_len: 0,
76        }
77    }
78
79    // A CharRefTokenizer can only tokenize one character reference,
80    // so this method consumes the tokenizer.
81    pub fn get_result(self) -> CharRef {
82        self.result.expect("get_result called before done")
83    }
84
85    fn name_buf(&self) -> &StrTendril {
86        self.name_buf_opt
87            .as_ref()
88            .expect("name_buf missing in named character reference")
89    }
90
91    fn name_buf_mut(&mut self) -> &mut StrTendril {
92        self.name_buf_opt
93            .as_mut()
94            .expect("name_buf missing in named character reference")
95    }
96
97    fn finish_none(&mut self) -> Status {
98        self.result = Some(CharRef {
99            chars: ['\0', '\0'],
100            num_chars: 0,
101        });
102        Done
103    }
104
105    fn finish_one(&mut self, c: char) -> Status {
106        self.result = Some(CharRef {
107            chars: [c, '\0'],
108            num_chars: 1,
109        });
110        Done
111    }
112}
113
impl CharRefTokenizer {
    /// Advance the state machine by (at most) one step, dispatching on the
    /// current state. Returns `Done` immediately once a result is recorded.
    pub fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, base, input),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    /// First character after `&`: whitespace, `<`, `&`, or the additional
    /// allowed character mean there is no reference here; `#` starts a
    /// numeric reference; anything else starts a named reference.
    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some('\t' | '\n' | '\x0C' | ' ' | '<' | '&') => self.finish_none(),
            Some(c) if Some(c) == self.addnl_allowed => self.finish_none(),
            Some('#') => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },
            Some(_) => {
                // Don't consume here: do_named re-reads it via get_char.
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },
            None => Stuck,
        }
    }

    /// After `&#`: an `x`/`X` selects base 16 and is saved so it can be
    /// unconsumed later; anything else falls through to base 10 without
    /// consuming the character.
    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(c @ ('x' | 'X')) => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },
            Some(_) => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
            None => return Stuck,
        }
        Progress
    }

    /// Accumulate digits in the given base. A non-digit either moves on to
    /// the semicolon check (if we have seen a digit) or means the whole
    /// `&#`/`&#x` prefix must be unconsumed.
    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        base: u32,
        input: &BufferQueue,
    ) -> Status {
        let Some(c) = tokenizer.peek(input) else {
            return Stuck;
        };
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    /// Expect the terminating `;`; a missing semicolon is a parse error but
    /// the numeric reference is still produced.
    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(';') => tokenizer.discard_char(input),
            Some(_) => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
            None => return Stuck,
        };
        self.finish_numeric(tokenizer)
    }

    /// No digits followed `&#` (or `&#x`): push the consumed `#` and hex
    /// marker (if any) back onto the input, report an error, and produce
    /// no characters.
    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c);
        }

        tokenizer.unconsume(input, unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    /// Map the accumulated number to a character, applying replacement rules
    /// for out-of-range, null/surrogate, C1-control, and noncharacter
    /// values; each invalid case also emits a parse error.
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) -> Status {
        // Values reaching conv() were filtered above, so from_u32 can't fail.
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 controls: remap via the Windows-1252 table where defined.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // Low 16 bits FFFE/FFFF: noncharacters in every plane.
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = if tokenizer.opts.exact_errors {
                Cow::from(format!(
                    "Invalid numeric character reference value 0x{:06X}",
                    self.num
                ))
            } else {
                Cow::from("Invalid numeric character reference")
            };
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    /// Consume one character of a named reference, recording the longest
    /// complete entity match while prefixes keep matching.
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let Some(c) = tokenizer.get_char(input) else {
            return Stuck;
        };
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, Some(c), input),
        }
    }

    /// Emit a parse error for an unrecognized named reference.
    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) {
        let msg = if tokenizer.opts.exact_errors {
            Cow::from(format!("Invalid character reference &{}", self.name_buf()))
        } else {
            Cow::from("Invalid character reference")
        };
        tokenizer.emit_error(msg);
    }

    /// Push everything consumed for the name back onto the input.
    fn unconsume_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) {
        tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
    }

    /// Resolve a named reference once the match cannot be extended.
    /// `end_char` is the character that ended matching (`None` at EOF).
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        end_char: Option<char>,
        input: &BufferQueue,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(tokenizer, input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf.  Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // "If the character reference is being consumed as part of an
                // attribute, and the last character matched is not a U+003B
                // SEMICOLON character (;), and the next character is either a
                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
                // character, then, for historical reasons, all the characters
                // that were matched after the U+0026 AMPERSAND character (&)
                // must be unconsumed, and nothing is returned. However, if
                // this next character is in fact a U+003D EQUALS SIGN
                // character (=), then this is a parse error"

                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                    (_, ';', _) => false,
                    (Some(_), _, Some('=')) => {
                        tokenizer.emit_error(Borrowed(
                            "Equals sign after character reference in attribute",
                        ));
                        true
                    },
                    (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none()
                } else {
                    // Give back only the characters consumed past the match.
                    tokenizer
                        .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

    /// Keep consuming alphanumerics after a failed name match, solely to
    /// decide whether a semicolon follows (a parse error if so); everything
    /// consumed is unconsumed and no characters are produced.
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let Some(c) = tokenizer.get_char(input) else {
            return Stuck;
        };
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(tokenizer, input);
        self.finish_none()
    }

    /// Drive the state machine to completion at end of input, emitting the
    /// appropriate EOF errors and unconsuming any partial input.
    pub fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, None, input)),

                BogusName => {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none();
                },

                Octothorpe => {
                    tokenizer.unconsume(input, StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}