html5ever/tokenizer/char_ref/mod.rs

use super::{TokenSink, Tokenizer};

use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;

use log::debug;
use std::borrow::Cow::{self, Borrowed};
use std::char::from_u32;

use self::State::*;
pub(super) use self::Status::*;

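/// The character(s) produced by resolving a single character reference.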
pub(super) struct CharRef {
    /// The resulting character(s).
    pub(super) chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub(super) num_chars: u8,
}

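/// Result of one `step` of the character reference tokenizer: `Stuck` means
/// more input is needed, `Progress` means the state machine advanced and can
/// be stepped again, and `Done` means a result is available via `get_result`.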
pub(super) enum Status {
    Stuck,
    Progress,
    Done,
}

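/// State machine for parsing a character reference, entered after the main
/// tokenizer has seen a `&`.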
#[derive(Debug)]
enum State {
    Begin,
    Octothorpe,
    Numeric(u32), // base (10 or 16)
    NumericSemicolon,
    Named,
    BogusName,
}

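/// Sub-tokenizer that resolves a single character reference. The main
/// tokenizer drives it by calling `step` until it returns `Done`, or by
/// calling `end_of_file` when the input runs out.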
pub(super) struct CharRefTokenizer {
    state: State,
    result: Option<CharRef>,
    is_consumed_in_attribute: bool,

    // Numeric character reference state.
    num: u32,
    num_too_big: bool,
    seen_digit: bool,
    hex_marker: Option<char>,

    // Named character reference state.
    name_buf_opt: Option<StrTendril>,
    name_match: Option<(u32, u32)>,
    name_len: usize,
}

impl CharRefTokenizer {
    pub(super) fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
        CharRefTokenizer {
            is_consumed_in_attribute,
            state: Begin,
            result: None,
            num: 0,
            num_too_big: false,
            seen_digit: false,
            hex_marker: None,
            name_buf_opt: None,
            name_match: None,
            name_len: 0,
        }
    }

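    // A `CharRefTokenizer` resolves exactly one reference, so taking the
    // result consumes it. Panics if called before the tokenizer is done.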
    pub(super) fn get_result(self) -> CharRef {
        self.result.expect("get_result called before done")
    }

    fn name_buf(&self) -> &StrTendril {
        self.name_buf_opt
            .as_ref()
            .expect("name_buf missing in named character reference")
    }

    fn name_buf_mut(&mut self) -> &mut StrTendril {
        self.name_buf_opt
            .as_mut()
            .expect("name_buf missing in named character reference")
    }

    fn finish_none(&mut self) -> Status {
        self.result = Some(CharRef {
            chars: ['\0', '\0'],
            num_chars: 0,
        });
        Done
    }

    fn finish_one(&mut self, c: char) -> Status {
        self.result = Some(CharRef {
            chars: [c, '\0'],
            num_chars: 1,
        });
        Done
    }
}

impl CharRefTokenizer {
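    // Run one step of the state machine. Returns `Done` immediately if a
    // result has already been produced.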
    pub(super) fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, input, base),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

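    // Decide what kind of reference this is from the first character after
    // `&`: an alphanumeric starts a named reference, `#` starts a numeric one,
    // and anything else means there is no character reference here at all.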
    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some('a'..='z' | 'A'..='Z' | '0'..='9') => {
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },
            Some('#') => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },
            Some(_) => self.finish_none(),
            None => Stuck,
        }
    }

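    // After `&#`: an `x` or `X` selects a hexadecimal reference (base 16),
    // anything else falls through to decimal (base 10). The hex marker is
    // remembered so it can be pushed back if no digits follow.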
    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(c @ ('x' | 'X')) => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },
            Some(_) => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
            None => return Stuck,
        }
        Progress
    }

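    // Accumulate digits of a numeric reference in the given base. Values that
    // run past U+10FFFF are flagged rather than tracked exactly, since
    // anything out of range maps to U+FFFD later on.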
    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        base: u32,
    ) -> Status {
        let Some(c) = tokenizer.peek(input) else {
            return Stuck;
        };
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // Adding the digit below may wrap, but it doesn't matter:
                    // the value is out of range either way.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

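    // The digits have ended; consume a terminating semicolon if present,
    // otherwise report a parse error before finalizing the value.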
    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match tokenizer.peek(input) {
            Some(';') => tokenizer.discard_char(input),
            Some(_) => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
            None => return Stuck,
        };
        self.finish_numeric(tokenizer)
    }

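    // `&#` (or `&#x`) was not followed by any digits: push the consumed
    // characters back onto the input, report a parse error, and produce no
    // character so the original text is emitted as-is.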
    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c)
        }

        input.push_front(unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

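    // Convert the accumulated value into the character to emit, substituting
    // U+FFFD and/or reporting a parse error for the ranges that are invalid
    // in numeric character references.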
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of range: always the replacement character.
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and surrogates: also the replacement character.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 controls are remapped through a replacement table.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Other control characters and noncharacters: keep the character,
            // but report a parse error.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = if tokenizer.opts.exact_errors {
                Cow::from(format!(
                    "Invalid numeric character reference value 0x{:06X}",
                    self.num
                ))
            } else {
                Cow::from("Invalid numeric character reference")
            };
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

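    // Consume one more character of a named reference and look the buffer up
    // in the named-entities table, which maps valid prefixes as well as full
    // names; a full match is remembered, but matching continues in case a
    // longer name also matches (e.g. `&not` vs `&notin`).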
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let Some(c) = tokenizer.peek(input) else {
            return Stuck;
        };
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // The buffer is a full entity name or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // A full match; remember it, but keep going in case a
                    // longer entity name also matches.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                Progress
            },

            // The match can't be extended any further.
            None => self.finish_named(tokenizer, input, Some(c)),
        }
    }

    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) {
        let msg = if tokenizer.opts.exact_errors {
            Cow::from(format!("Invalid character reference &{}", self.name_buf()))
        } else {
            Cow::from("Invalid character reference")
        };
        tokenizer.emit_error(msg);
    }

    fn unconsume_name(&mut self, input: &BufferQueue) {
        input.push_front(self.name_buf_opt.take().unwrap());
    }

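    // Finish a named reference. `end_char` is the character that stopped the
    // match (None at end of input). With no match, the whole buffer is pushed
    // back; with a match, any extra characters beyond the matched name are
    // pushed back, unless the "in attribute" rules say to give up and
    // unconsume everything.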
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        end_char: Option<char>,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // to emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check the length because `&;` is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but `name_buf` may hold extra
                // characters consumed while looking for a longer one, e.g.
                // "&notit" matches "&not" with "it" left over.
                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There may be no character after the match if the input
                // ended right after a full match.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // Legacy rule: inside an attribute, a match that doesn't end
                // in ';' and is followed by '=' or an alphanumeric is treated
                // as plain text rather than a character reference.
                let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after)
                {
                    (_, ';', _) => false,
                    (true, _, Some('=')) => true,
                    (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        // The match is still used, but a missing semicolon is
                        // a parse error.
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(input);
                    self.finish_none()
                } else {
                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
                    tokenizer.ignore_lf.set(false);
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

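    // No entity name matched, but alphanumeric characters keep coming: keep
    // consuming until a non-alphanumeric character arrives, report an error
    // only if the run ends with a semicolon, then push everything back and
    // emit no character.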
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let Some(c) = tokenizer.peek(input) else {
            return Stuck;
        };
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(input);
        self.finish_none()
    }

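    // The input ended while a character reference was in progress: drive the
    // state machine to completion so a result is always available, reporting
    // EOF-specific errors and unconsuming partial input where appropriate.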
    pub(super) fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, input, None)),

                BogusName => {
                    self.unconsume_name(input);
                    self.finish_none();
                },

                Octothorpe => {
                    input.push_front(StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}