html5ever/tokenizer/char_ref/
mod.rs1use super::{TokenSink, Tokenizer};
11use crate::buffer_queue::BufferQueue;
12use crate::data;
13use crate::tendril::StrTendril;
14
15use log::debug;
16use std::borrow::Cow::{self, Borrowed};
17use std::char::from_u32;
18
19pub(super) struct CharRef {
21 pub(super) chars: [char; 2],
23
24 pub(super) num_chars: u8,
26}
27
28pub(super) enum Status {
29 Stuck,
30 Progress,
31 Done(CharRef),
32}
33
/// States of the character reference state machine.
#[derive(Debug)]
enum State {
    /// Nothing has been examined yet.
    Begin,
    /// A `#` has been consumed; the next character selects the radix.
    Octothorpe,
    /// Reading digits; the payload is the radix handed to `char::to_digit`.
    Numeric(u32),
    /// Digits are complete; an optional `;` is expected next.
    NumericSemicolon,
    /// Consuming characters while looking them up in the named entity table.
    Named,
    /// The name cannot match any entity; characters are still consumed to
    /// find out whether the name ends in `;`.
    BogusName,
}
43
44pub(super) struct CharRefTokenizer {
45 state: State,
46 is_consumed_in_attribute: bool,
47
48 num: u32,
49 num_too_big: bool,
50 seen_digit: bool,
51 hex_marker: Option<char>,
52
53 name_buf_opt: Option<StrTendril>,
54 name_match: Option<(u32, u32)>,
55 name_len: usize,
56}
57
58impl CharRef {
59 const EMPTY: CharRef = CharRef {
60 chars: ['\0', '\0'],
61 num_chars: 0,
62 };
63}
64
65impl CharRefTokenizer {
66 pub(super) fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
67 CharRefTokenizer {
68 is_consumed_in_attribute,
69 state: State::Begin,
70 num: 0,
71 num_too_big: false,
72 seen_digit: false,
73 hex_marker: None,
74 name_buf_opt: None,
75 name_match: None,
76 name_len: 0,
77 }
78 }
79
80 fn name_buf(&self) -> &StrTendril {
81 self.name_buf_opt
82 .as_ref()
83 .expect("name_buf missing in named character reference")
84 }
85
86 fn name_buf_mut(&mut self) -> &mut StrTendril {
87 self.name_buf_opt
88 .as_mut()
89 .expect("name_buf missing in named character reference")
90 }
91
92 fn finish_one(&mut self, c: char) -> Status {
93 Status::Done(CharRef {
94 chars: [c, '\0'],
95 num_chars: 1,
96 })
97 }
98}
99
100impl CharRefTokenizer {
101 pub(super) fn step<Sink: TokenSink>(
102 &mut self,
103 tokenizer: &Tokenizer<Sink>,
104 input: &BufferQueue,
105 ) -> Status {
106 debug!("char ref tokenizer stepping in state {:?}", self.state);
107 match self.state {
108 State::Begin => self.do_begin(tokenizer, input),
109 State::Octothorpe => self.do_octothorpe(tokenizer, input),
110 State::Numeric(base) => self.do_numeric(tokenizer, input, base),
111 State::NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
112 State::Named => self.do_named(tokenizer, input),
113 State::BogusName => self.do_bogus_name(tokenizer, input),
114 }
115 }
116
117 fn do_begin<Sink: TokenSink>(
118 &mut self,
119 tokenizer: &Tokenizer<Sink>,
120 input: &BufferQueue,
121 ) -> Status {
122 match tokenizer.peek(input) {
123 Some('a'..='z' | 'A'..='Z' | '0'..='9') => {
124 self.state = State::Named;
125 self.name_buf_opt = Some(StrTendril::new());
126 Status::Progress
127 },
128 Some('#') => {
129 tokenizer.discard_char(input);
130 self.state = State::Octothorpe;
131 Status::Progress
132 },
133 Some(_) => Status::Done(CharRef::EMPTY),
134 None => Status::Stuck,
135 }
136 }
137
138 fn do_octothorpe<Sink: TokenSink>(
139 &mut self,
140 tokenizer: &Tokenizer<Sink>,
141 input: &BufferQueue,
142 ) -> Status {
143 match tokenizer.peek(input) {
144 Some(c @ ('x' | 'X')) => {
145 tokenizer.discard_char(input);
146 self.hex_marker = Some(c);
147 self.state = State::Numeric(16);
148 },
149 Some(_) => {
150 self.hex_marker = None;
151 self.state = State::Numeric(10);
152 },
153 None => return Status::Stuck,
154 }
155 Status::Progress
156 }
157
158 fn do_numeric<Sink: TokenSink>(
159 &mut self,
160 tokenizer: &Tokenizer<Sink>,
161 input: &BufferQueue,
162 base: u32,
163 ) -> Status {
164 let Some(c) = tokenizer.peek(input) else {
165 return Status::Stuck;
166 };
167 match c.to_digit(base) {
168 Some(n) => {
169 tokenizer.discard_char(input);
170 self.num = self.num.wrapping_mul(base);
171 if self.num > 0x10FFFF {
172 self.num_too_big = true;
175 }
176 self.num = self.num.wrapping_add(n);
177 self.seen_digit = true;
178 Status::Progress
179 },
180
181 None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
182
183 None => {
184 self.state = State::NumericSemicolon;
185 Status::Progress
186 },
187 }
188 }
189
190 fn do_numeric_semicolon<Sink: TokenSink>(
191 &mut self,
192 tokenizer: &Tokenizer<Sink>,
193 input: &BufferQueue,
194 ) -> Status {
195 match tokenizer.peek(input) {
196 Some(';') => tokenizer.discard_char(input),
197 Some(_) => tokenizer.emit_error(Borrowed(
198 "Semicolon missing after numeric character reference",
199 )),
200 None => return Status::Stuck,
201 };
202 self.finish_numeric(tokenizer)
203 }
204
205 fn unconsume_numeric<Sink: TokenSink>(
206 &mut self,
207 tokenizer: &Tokenizer<Sink>,
208 input: &BufferQueue,
209 ) -> Status {
210 let mut unconsume = StrTendril::from_char('#');
211 if let Some(c) = self.hex_marker {
212 unconsume.push_char(c)
213 }
214
215 input.push_front(unconsume);
216 tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
217 Status::Done(CharRef::EMPTY)
218 }
219
220 fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) -> Status {
221 fn conv(n: u32) -> char {
222 from_u32(n).expect("invalid char missed by error handling cases")
223 }
224
225 let (c, error) = match self.num {
226 n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
227 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
228
229 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
230 Some(c) => (c, true),
231 None => (conv(self.num), true),
232 },
233
234 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
235
236 n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
237
238 n => (conv(n), false),
239 };
240
241 if error {
242 let msg = if tokenizer.opts.exact_errors {
243 Cow::from(format!(
244 "Invalid numeric character reference value 0x{:06X}",
245 self.num
246 ))
247 } else {
248 Cow::from("Invalid numeric character reference")
249 };
250 tokenizer.emit_error(msg);
251 }
252
253 self.finish_one(c)
254 }
255
256 fn do_named<Sink: TokenSink>(
257 &mut self,
258 tokenizer: &Tokenizer<Sink>,
259 input: &BufferQueue,
260 ) -> Status {
261 let Some(c) = tokenizer.peek(input) else {
264 return Status::Stuck;
265 };
266 tokenizer.discard_char(input);
267 self.name_buf_mut().push_char(c);
268 match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
269 Some(&m) => {
271 if m.0 != 0 {
272 self.name_match = Some(m);
274 self.name_len = self.name_buf().len();
275 }
276 Status::Progress
278 },
279
280 None => self.finish_named(tokenizer, input, Some(c)),
282 }
283 }
284
285 fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) {
286 let msg = if tokenizer.opts.exact_errors {
287 Cow::from(format!("Invalid character reference &{}", self.name_buf()))
288 } else {
289 Cow::from("Invalid character reference")
290 };
291 tokenizer.emit_error(msg);
292 }
293
294 fn unconsume_name(&mut self, input: &BufferQueue) {
295 input.push_front(self.name_buf_opt.take().unwrap());
296 }
297
298 fn finish_named<Sink: TokenSink>(
299 &mut self,
300 tokenizer: &Tokenizer<Sink>,
301 input: &BufferQueue,
302 end_char: Option<char>,
303 ) -> Status {
304 match self.name_match {
305 None => {
306 match end_char {
307 Some(c) if c.is_ascii_alphanumeric() => {
308 self.state = State::BogusName;
311 return Status::Progress;
312 },
313
314 Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
316
317 _ => (),
318 }
319 self.unconsume_name(input);
320 Status::Done(CharRef::EMPTY)
321 },
322
323 Some((c1, c2)) => {
324 let name_len = self.name_len;
333 assert!(name_len > 0);
334 let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
335
336 let next_after = if name_len == self.name_buf().len() {
339 None
340 } else {
341 Some(self.name_buf()[name_len..].chars().next().unwrap())
342 };
343
344 let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after)
351 {
352 (_, ';', _) => false,
353 (true, _, Some('=')) => true,
354 (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
355 _ => {
356 tokenizer.emit_error(Borrowed(
360 "Character reference does not end with semicolon",
361 ));
362 false
363 },
364 };
365
366 if unconsume_all {
367 self.unconsume_name(input);
368 Status::Done(CharRef::EMPTY)
369 } else {
370 input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
371 tokenizer.ignore_lf.set(false);
372 Status::Done(CharRef {
373 chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
374 num_chars: if c2 == 0 { 1 } else { 2 },
375 })
376 }
377 },
378 }
379 }
380
381 fn do_bogus_name<Sink: TokenSink>(
382 &mut self,
383 tokenizer: &Tokenizer<Sink>,
384 input: &BufferQueue,
385 ) -> Status {
386 let Some(c) = tokenizer.peek(input) else {
389 return Status::Stuck;
390 };
391 tokenizer.discard_char(input);
392 self.name_buf_mut().push_char(c);
393 match c {
394 _ if c.is_ascii_alphanumeric() => return Status::Progress,
395 ';' => self.emit_name_error(tokenizer),
396 _ => (),
397 }
398 self.unconsume_name(input);
399 Status::Done(CharRef::EMPTY)
400 }
401
402 pub(super) fn end_of_file<Sink: TokenSink>(
403 &mut self,
404 tokenizer: &Tokenizer<Sink>,
405 input: &BufferQueue,
406 ) -> CharRef {
407 loop {
408 let status = match self.state {
409 State::Begin => Status::Done(CharRef::EMPTY),
410 State::Numeric(_) if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
411 State::Numeric(_) | State::NumericSemicolon => {
412 tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
413 self.finish_numeric(tokenizer)
414 },
415 State::Named => self.finish_named(tokenizer, input, None),
416 State::BogusName => {
417 self.unconsume_name(input);
418 Status::Done(CharRef::EMPTY)
419 },
420 State::Octothorpe => {
421 input.push_front(StrTendril::from_slice("#"));
422 tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
423 Status::Done(CharRef::EMPTY)
424 },
425 };
426
427 match status {
428 Status::Done(char_ref) => {
429 return char_ref;
430 },
431 Status::Stuck => {
432 return CharRef::EMPTY;
433 },
434 Status::Progress => {},
435 }
436 }
437 }
438}