litrs/
escape.rs

1use crate::{
2    err::{perr, ParseErrorKind::*},
3    parse::{check_suffix, hex_digit_value},
4    ParseError,
5};
6
7
8/// Must start with `\`. Returns the unscaped value as `E` and the number of
9/// input bytes the escape is long.
10///
11/// `unicode` and `byte_escapes` specify which types of escapes are
12/// supported. [Quote escapes] are always unescaped, [Unicode escapes] only if
13/// `unicode` is true. If `byte_escapes` is false, [ASCII escapes] are
14/// used, if it's true, [Byte escapes] are (the only difference being that the
15/// latter supports \xHH escapes > 0x7f).
16///
17/// [Quote escapes]: https://doc.rust-lang.org/reference/tokens.html#quote-escapes
18/// [Unicode escapes]: https://doc.rust-lang.org/reference/tokens.html#unicode-escapes
19/// [Ascii escapes]: https://doc.rust-lang.org/reference/tokens.html#ascii-escapes
20/// [Byte escapes]: https://doc.rust-lang.org/reference/tokens.html#byte-escapes
21pub(crate) fn unescape(
22    input: &str,
23    unicode: bool,
24    byte_escapes: bool,
25    allow_nul: bool,
26) -> Result<(Unescape, usize), ParseError> {
27    let first = input.as_bytes().get(1).ok_or(perr(0, UnterminatedEscape))?;
28    let out = match first {
29        // Quote escapes
30        b'\'' => (Unescape::Byte(b'\''), 2),
31        b'"' => (Unescape::Byte(b'"'), 2),
32
33        // Ascii escapes
34        b'n' => (Unescape::Byte(b'\n'), 2),
35        b'r' => (Unescape::Byte(b'\r'), 2),
36        b't' => (Unescape::Byte(b'\t'), 2),
37        b'\\' => (Unescape::Byte(b'\\'), 2),
38        b'0' => if allow_nul {
39            (Unescape::Byte(b'\0'), 2)
40        } else {
41            return Err(perr(0..2, DisallowedNulEscape))
42        },
43        b'x' => {
44            let hex_string = input.get(2..4)
45                .ok_or(perr(0..input.len(), UnterminatedEscape))?
46                .as_bytes();
47            let first = hex_digit_value(hex_string[0]).ok_or(perr(0..4, InvalidXEscape))?;
48            let second = hex_digit_value(hex_string[1]).ok_or(perr(0..4, InvalidXEscape))?;
49            let value = second + 16 * first;
50
51            if !byte_escapes && value > 0x7F {
52                return Err(perr(0..4, NonAsciiXEscape));
53            }
54
55            if !allow_nul && value == 0 {
56                return Err(perr(0..4, DisallowedNulEscape));
57            }
58
59            (Unescape::Byte(value), 4)
60        }
61
62        // Unicode escape
63        b'u' => {
64            if !unicode {
65                return Err(perr(0..2, UnicodeEscapeInByteLiteral));
66            }
67
68            if input.as_bytes().get(2) != Some(&b'{') {
69                return Err(perr(0..2, UnicodeEscapeWithoutBrace));
70            }
71
72            let closing_pos = input.bytes().position(|b| b == b'}')
73                .ok_or(perr(0..input.len(), UnterminatedUnicodeEscape))?;
74
75            let inner = &input[3..closing_pos];
76            if inner.as_bytes().first() == Some(&b'_') {
77                return Err(perr(3, InvalidStartOfUnicodeEscape));
78            }
79
80            let mut v: u32 = 0;
81            let mut digit_count = 0;
82            for (i, b) in inner.bytes().enumerate() {
83                if b == b'_' {
84                    continue;
85                }
86
87                let digit = hex_digit_value(b).ok_or(perr(3 + i, NonHexDigitInUnicodeEscape))?;
88
89                if digit_count == 6 {
90                    return Err(perr(3 + i, TooManyDigitInUnicodeEscape));
91                }
92                digit_count += 1;
93                v = 16 * v + digit as u32;
94            }
95
96            if !allow_nul && v == 0 {
97                return Err(perr(0..closing_pos + 1, DisallowedNulEscape));
98            }
99
100            let c = std::char::from_u32(v)
101                .ok_or(perr(0..closing_pos + 1, InvalidUnicodeEscapeChar))?;
102
103            (Unescape::Unicode(c), closing_pos + 1)
104        }
105
106        _ => return Err(perr(0..2, UnknownEscape)),
107    };
108
109    Ok(out)
110}
111
/// The value a single escape sequence inside a literal evaluates to.
pub(crate) enum Unescape {
    /// A single byte, produced by every escape except `\u{…}`.
    Byte(u8),
    /// A full Unicode scalar value, produced by `\u{…}`.
    Unicode(char),
}

impl Unescape {
    /// Converts this value into a `char`.
    ///
    /// # Panics
    ///
    /// Panics if this is a `Byte` holding a non-ASCII value (> 0x7f).
    pub(crate) fn unwrap_char(self) -> char {
        let byte = match self {
            Self::Unicode(c) => return c,
            Self::Byte(byte) => byte,
        };
        assert!(byte <= 0x7F, "non ASCII byte");
        char::from(byte)
    }

    /// Converts this value into a `u8`.
    ///
    /// # Panics
    ///
    /// Panics if this is a `Unicode` value.
    pub(crate) fn unwrap_byte(self) -> u8 {
        if let Self::Byte(byte) = self {
            byte
        } else {
            panic!("unexpected unicode escape value")
        }
    }
}
138
/// Buffer that unescaped string contents are collected into. Implemented in
/// this file for `Vec<u8>` and `String`.
pub(crate) trait EscapeContainer {
    /// Creates a new, empty container.
    fn new() -> Self;
    /// Returns `true` if the container currently holds no data.
    fn is_empty(&self) -> bool;
    /// Appends a single unescaped value to the container.
    fn push(&mut self, v: Unescape);
    /// Appends the string slice `s` verbatim to the container.
    fn push_str(&mut self, s: &str);
}
145
146impl EscapeContainer for Vec<u8> {
147    fn new() -> Self {
148        Self::new()
149    }
150    fn is_empty(&self) -> bool {
151        self.is_empty()
152    }
153    fn push_str(&mut self, s: &str) {
154        self.extend_from_slice(s.as_bytes());
155    }
156    fn push(&mut self, v: Unescape) {
157        match v {
158            Unescape::Byte(b) => self.push(b),
159            Unescape::Unicode(c) => {
160                let start = self.len();
161                self.resize(self.len() + c.len_utf8(), 0);
162                c.encode_utf8(&mut self[start..]);
163            }
164        }
165    }
166}
167
168impl EscapeContainer for String {
169    fn new() -> Self {
170        Self::new()
171    }
172    fn is_empty(&self) -> bool {
173        self.is_empty()
174    }
175    fn push_str(&mut self, s: &str) {
176        self.push_str(s);
177    }
178    fn push(&mut self, v: Unescape) {
179        self.push(v.unwrap_char());
180    }
181}
182
183
/// Returns `true` for bytes that are skipped after the start of a "string
/// continue" (an unescaped backslash directly followed by `\n`): space, tab
/// and line feed.
fn is_string_continue_skipable_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n')
}
189
190/// Unescapes a whole string or byte string.
191#[inline(never)]
192pub(crate) fn unescape_string<C: EscapeContainer>(
193    input: &str,
194    offset: usize,
195    unicode: bool,
196    byte_escapes: bool,
197    allow_nul: bool,
198) -> Result<(Option<C>, usize), ParseError> {
199    let mut closing_quote_pos = None;
200    let mut i = offset;
201    let mut end_last_escape = offset;
202    let mut value = C::new();
203    while i < input.len() {
204        match input.as_bytes()[i] {
205            // Handle "string continue".
206            b'\\' if input.as_bytes().get(i + 1) == Some(&b'\n') => {
207                value.push_str(&input[end_last_escape..i]);
208
209                // Find the first non-whitespace character.
210                let end_escape = input[i + 2..].bytes()
211                    .position(|b| !is_string_continue_skipable_whitespace(b))
212                    .ok_or(perr(None, UnterminatedString))?;
213
214                i += 2 + end_escape;
215                end_last_escape = i;
216            }
217            b'\\' => {
218                let rest = &input[i..input.len() - 1];
219                let (c, len) = unescape(rest, unicode, byte_escapes, allow_nul)
220                    .map_err(|e| e.offset_span(i))?;
221                value.push_str(&input[end_last_escape..i]);
222                value.push(c);
223                i += len;
224                end_last_escape = i;
225            }
226            b'\r' => return Err(perr(i, CarriageReturn)),
227            b'"' => {
228                closing_quote_pos = Some(i);
229                break;
230            }
231            b'\0' if !allow_nul => return Err(perr(i, NulByte)),
232            b if !unicode && !b.is_ascii() => return Err(perr(i, NonAsciiInByteLiteral)),
233            _ => i += 1,
234        }
235    }
236
237    let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedString))?;
238
239    let start_suffix = closing_quote_pos + 1;
240    let suffix = &input[start_suffix..];
241    check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?;
242
243    // `value` is only empty if there was no escape in the input string
244    // (with the special case of the input being empty). This means the
245    // string value basically equals the input, so we store `None`.
246    let value = if value.is_empty() {
247        None
248    } else {
249        // There was an escape in the string, so we need to push the
250        // remaining unescaped part of the string still.
251        value.push_str(&input[end_last_escape..closing_quote_pos]);
252        Some(value)
253    };
254
255    Ok((value, start_suffix))
256}
257
258/// Reads and checks a raw (byte) string literal. Returns the number of hashes
259/// and the index when the suffix starts.
260#[inline(never)]
261pub(crate) fn scan_raw_string(
262    input: &str,
263    offset: usize,
264    unicode: bool,
265    allow_nul: bool,
266) -> Result<(u8, usize), ParseError> {
267    // Raw string literal
268    let num_hashes = input[offset..].bytes().position(|b| b != b'#')
269        .ok_or(perr(None, InvalidLiteral))?;
270
271    if num_hashes > 256 {
272        return Err(perr(offset..offset + num_hashes, TooManyHashes));
273    }
274
275    if input.as_bytes().get(offset + num_hashes) != Some(&b'"') {
276        return Err(perr(None, InvalidLiteral));
277    }
278    let start_inner = offset + num_hashes + 1;
279    let hashes = &input[offset..num_hashes + offset];
280
281    let mut closing_quote_pos = None;
282    let mut i = start_inner;
283    while i < input.len() {
284        let b = input.as_bytes()[i];
285        if b == b'"' && input[i + 1..].starts_with(hashes) {
286            closing_quote_pos = Some(i);
287            break;
288        }
289
290        // CR are just always disallowed in all (raw) strings. Rust performs
291        // a normalization of CR LF to just LF in a pass prior to lexing. But
292        // in lexing, it's disallowed.
293        if b == b'\r' {
294            return Err(perr(i, CarriageReturn));
295        }
296
297        if b == b'\0' && !allow_nul {
298            return Err(perr(i, NulByte));
299        }
300
301        if !unicode {
302            if !b.is_ascii() {
303                return Err(perr(i, NonAsciiInByteLiteral));
304            }
305        }
306
307        i += 1;
308    }
309
310    let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedRawString))?;
311
312    let start_suffix = closing_quote_pos + num_hashes + 1;
313    let suffix = &input[start_suffix..];
314    check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?;
315
316    Ok((num_hashes as u8, start_suffix))
317}
litrs/escape.rs

litrs/
escape.rs