1 // Copyright (c) 2019, Facebook, Inc.
2 // All rights reserved.
4 // This source code is licensed under the MIT license found in the
5 // LICENSE file in the "hack" directory of this source tree.
7 // Implementation of string escaping logic.
8 // See http://php.net/manual/en/language.types.string.php
10 use std::{borrow::Cow, error::Error, fmt, io::Write};
12 use bstr::{BStr, BString};
/// Error type returned by the unescaping routines in this file. It carries a
/// message (`msg`) that the `Display` impl prints.
16 pub struct InvalidString {
/// User-facing rendering of the error: writes the stored `msg` text as-is.
20 impl fmt::Display for InvalidString {
21 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
22 write!(f, "{}", self.msg)
// NOTE(review): `Error::description` is deprecated in the standard library in
// favour of `Display`, which this type already implements. This override can
// probably be dropped — confirm nothing still calls it.
26 impl Error for InvalidString {
27 fn description(&self) -> &str {
/// Builds an `InvalidString` from a string slice. This conversion is what lets
/// the `Err("...".into())` returns used throughout this file type-check.
32 impl<'a> From<&'a str> for InvalidString {
33 fn from(x: &'a str) -> Self {
// Minimal "append bytes" interface; the impls that follow provide it for the
// standard `Vec<u8>` and for bumpalo's arena-backed `Vec`.
/// Appends a single byte.
41 fn push(&mut self, byte: u8);
/// Appends every byte of `slice`, in order.
42 fn extend_from_slice(&mut self, slice: &[u8]);
/// `GrowableBytes` backed by the standard heap-allocated `Vec<u8>`.
45 impl GrowableBytes for Vec<u8> {
46 fn push(&mut self, byte: u8) {
49 fn extend_from_slice(&mut self, slice: &[u8]) {
// Not self-recursive: method resolution prefers `Vec`'s inherent
// `extend_from_slice` over this trait method of the same name.
50 self.extend_from_slice(slice)
/// `GrowableBytes` backed by bumpalo's arena-allocated `Vec<u8>`.
54 impl GrowableBytes for bumpalo::collections::Vec<'_, u8> {
55 fn push(&mut self, byte: u8) {
58 fn extend_from_slice(&mut self, slice: &[u8]) {
// Presumably resolves to the inherent `extend_from_slice` of bumpalo's `Vec`
// (inherent methods win over trait methods), so this does not recurse —
// confirm against the bumpalo API.
59 self.extend_from_slice(slice)
/// Returns `true` for printable ASCII: the bytes from space (0x20) through
/// `~` (0x7e), inclusive.
fn is_printable(c: u8) -> bool {
    matches!(c, b' '..=b'~')
}

/// Returns `true` when `c` is printable ASCII that may appear verbatim inside a
/// quoted literal, i.e. any printable byte except backslash and double quote.
pub fn is_lit_printable(c: u8) -> bool {
    is_printable(c) && !matches!(c, b'\\' | b'"')
}
/// Returns `true` when `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
fn is_hex(c: u8) -> bool {
    // Standard-library predicate covering exactly the three ranges above.
    c.is_ascii_hexdigit()
}
/// Returns `true` when `c` is an ASCII octal digit (`0-7`).
fn is_oct(c: u8) -> bool {
    matches!(c, b'0'..=b'7')
}
/// Maps a single byte to its escaped form: `Some(replacement)` when the byte
/// must be rewritten, `None` when it can be emitted unchanged.
///
/// NOTE(review): the sentence below about escaping `$` in octal disagrees with
/// the tests in this file, which expect `$` to come out unchanged — confirm
/// which behaviour is intended and update the text accordingly.
///
79 /// This escapes a string using the format understood by the assembler
80 /// and php serialization. The assembler and php serialization probably
81 /// don't actually have the same rules but this should safely fit in both.
82 /// It will escape $ in octal so that it can also be used as a PHP double
84 pub fn escape_char(c: u8) -> Option<Cow<'static, [u8]>> {
// Bytes with a dedicated two-character backslash escape.
86 b'\n' => Some((&b"\\n"[..]).into()),
87 b'\r' => Some((&b"\\r"[..]).into()),
88 b'\t' => Some((&b"\\t"[..]).into()),
89 b'\\' => Some((&b"\\\\"[..]).into()),
90 b'"' => Some((&b"\\\""[..]).into()),
// Printable ASCII other than `\` and `"` needs no escaping.
92 c if is_lit_printable(c) => None,
// Remaining bytes are written as a backslash followed by three zero-padded
// octal digits (e.g. byte 0 becomes `\000`).
95 write!(r, "\\{:03o}", c).unwrap();
/// Escapes `s` with the default per-byte rule `escape_char`, delegating to
/// `escape_by`.
///
101 /// `impl Into<..>` allows escape to take a String, consider the following,
103 /// let b = String::from("b");
107 /// Replacing `escape(b)` by `escape(&b)` leaks a reference of b to outer scope hence
108 /// compilation error.
109 pub fn escape<'a>(s: impl Into<Cow<'a, str>>) -> Cow<'a, str> {
110 escape_by(s.into(), escape_char)
/// Byte-string counterpart of `escape`: escapes `s` with `escape_char` by
/// delegating to `escape_bstr_by`.
113 pub fn escape_bstr<'a>(s: impl Into<Cow<'a, BStr>>) -> Cow<'a, BStr> {
114 escape_bstr_by(s.into(), escape_char)
/// Reinterprets a copy-on-write string as copy-on-write bytes without copying
/// the contents: a borrowed `str` stays borrowed, and an owned `String` hands
/// over its buffer as a `Vec<u8>`.
fn cow_str_to_bytes(s: Cow<'_, str>) -> Cow<'_, [u8]> {
    match s {
        Cow::Borrowed(text) => Cow::Borrowed(text.as_bytes()),
        Cow::Owned(text) => Cow::Owned(text.into_bytes()),
    }
}
/// Converts a copy-on-write byte string into copy-on-write raw bytes, keeping
/// the borrowed/owned state of the input.
124 fn cow_bstr_to_bytes(s: Cow<'_, BStr>) -> Cow<'_, [u8]> {
// Borrowed input is viewed as a byte slice.
126 Cow::Borrowed(s) => s.as_ref().into(),
// Owned input is converted into a `Vec<u8>`.
127 Cow::Owned(s) => <Vec<u8>>::from(s).into(),
/// Escapes `s` using the caller-supplied per-byte rule `f` (applied through
/// `escape_byte_by`) and converts the resulting bytes back into a string.
///
/// NOTE(review): this is a safe `pub fn`, yet the result is rebuilt with the
/// unchecked UTF-8 constructors. That is only sound if the replacements `f`
/// returns keep the output valid UTF-8 — confirm every caller upholds this, or
/// use the checked conversions instead.
131 pub fn escape_by<F>(s: Cow<'_, str>, f: F) -> Cow<'_, str>
133 F: Fn(u8) -> Option<Cow<'static, [u8]>>,
135 let r = escape_byte_by(cow_str_to_bytes(s), f);
137 Cow::Borrowed(s) => unsafe { std::str::from_utf8_unchecked(s) }.into(),
138 Cow::Owned(s) => unsafe { String::from_utf8_unchecked(s) }.into(),
/// Byte-string counterpart of `escape_by`: escapes `s` with the per-byte rule
/// `f` and wraps the resulting bytes back into a `BStr`/`BString`, keeping the
/// borrowed/owned state that `escape_byte_by` returned.
142 pub fn escape_bstr_by<'a, F>(s: Cow<'a, BStr>, f: F) -> Cow<'a, BStr>
144 F: Fn(u8) -> Option<Cow<'static, [u8]>>,
146 let r = escape_byte_by(cow_bstr_to_bytes(s), f);
148 Cow::Borrowed(s) => <&BStr>::from(s).into(),
149 Cow::Owned(s) => BString::from(s).into(),
/// Shared worker for the `escape*_by` functions: walks the input bytes and
/// builds an escaped copy in `c` only when needed.
153 fn escape_byte_by<F: Fn(u8) -> Option<Cow<'static, [u8]>>>(
// `copied` records whether the output buffer `c` is in use.
158 let mut copied = false;
159 let s = cow.as_ref();
160 for i in 0..s.len() {
// Once copying has started, bytes that need no escaping are appended as-is.
162 None if copied => c.push(s[i]),
165 c.extend_from_slice(cc.as_ref());
// Presumably the first-replacement path: the untouched prefix `s[..i]` is
// copied over before the replacement bytes.
167 c.extend_from_slice(&s[..i]);
168 c.extend_from_slice(cc.as_ref());
// If nothing was copied, the original `cow` is handed back unchanged.
175 if copied { c.into() } else { cow }
/// Appends the UTF-8 style byte sequence for code point `n` to `output`.
///
/// The visible branches emit two bytes for `n <= 0x7ff`, three for
/// `n <= 0xffff` and four for `n <= 0x10ffff`; anything larger yields the
/// "UTF-8 codepoint too large" error.
///
/// NOTE(review): no branch visible here rejects the surrogate range
/// (0xD800..=0xDFFF), and the tests in this file expect `\u{D800}` to be
/// encoded, so the bytes produced are not always valid UTF-8.
178 fn codepoint_to_utf8(n: u32, output: &mut impl GrowableBytes) -> Result<(), InvalidString> {
// Single-byte case: the value is pushed unchanged.
180 output.push(n as u8);
181 } else if n <= 0x7ff {
// Lead byte `110xxxxx`, then one `10xxxxxx` continuation byte.
182 output.push(0xc0 | (n >> 6) as u8);
183 output.push(0x80 | (n & 0b111111) as u8);
184 } else if n <= 0x00ffff {
// Lead byte `1110xxxx`, then two continuation bytes. `0b111111` and `0x3f`
// are the same 6-bit mask, just spelled differently.
185 output.push(0xe0 | (n >> 12) as u8);
186 output.push(0x80 | ((n >> 6) & 0b111111) as u8);
187 output.push(0x80 | (n & 0x3f) as u8);
188 } else if n <= 0x10ffff {
// Lead byte `11110xxx`, then three continuation bytes.
189 output.push(0xf0 | (n >> 18) as u8);
190 output.push(0x80 | ((n >> 12) & 0b111111) as u8);
191 output.push(0x80 | ((n >> 6) & 0b111111) as u8);
192 output.push(0x80 | (n & 0x3f) as u8);
194 return Err("UTF-8 codepoint too large".into());
/// Parses the bytes `s` as an unsigned integer written in `base`.
///
/// Fails with "invalid numeric escape" when `s` is not valid UTF-8 or cannot
/// be parsed by `u32::from_str_radix`.
///
/// NOTE(review): `u32::from_str_radix` accepts a leading `+`, and the `\u{...}`
/// path passes whatever bytes precede `}` rather than pre-filtered digits, so
/// input such as `\u{+41}` would presumably parse — confirm whether that
/// should be rejected.
199 fn parse_int(s: &[u8], base: u32) -> Result<u32, InvalidString> {
200 // input `s` can be assumed only contains ascii digits and 'aA' - 'fF',
201 // it is safe to call from_utf8 here.
202 let s = match std::str::from_utf8(s) {
205 return Err("invalid numeric escape".into());
208 let s = u32::from_str_radix(s, base);
211 _ => Err("invalid numeric escape".into()),
/// Parses the payload `s` of a numeric escape in `base` and returns it as a byte.
///
/// With `trim_to_byte == false`, a parsed value above 255 is an error. Any
/// failure from `parse_int` is reported with the same message.
///
/// NOTE(review): the message says "UTF-8 code point" although this function
/// yields a single byte — confirm the wording is intended.
215 fn parse_numeric_escape(trim_to_byte: bool, s: &[u8], base: u32) -> Result<u8, InvalidString> {
216 match parse_int(s, base) {
// Out-of-range values are rejected only when trimming is disabled; the
// alternative branch is not visible here and presumably narrows to `u8`.
218 if !trim_to_byte && (v > 255) {
219 Err("Invalid UTF-8 code point.".into())
224 Err(_) => Err("Invalid UTF-8 code point.".into()),
/// Selects which literal flavour `unescape_literal` is decoding. The variants
/// referenced in this file are `LiteralHeredoc`, `LiteralDoubleQuote` and
/// `LiteralLongString`.
229 pub enum LiteralKind {
235 /// Copies `s` into `output`, replacing escape sequences with the characters
238 /// The output is NOT guaranteed to be valid UTF-8. While this function will
239 /// return `Err` in some cases where the input contains an escape sequence
240 /// specifying an invalid codepoint, it will return invalid UTF-8 in some
241 /// circumstances (e.g., for invalid UTF-8 encoded as hex or octal byte escapes,
242 /// or UTF-16 encoded as \u escapes).
244 literal_kind: LiteralKind,
246 output: &mut impl GrowableBytes,
247 ) -> Result<(), InvalidString> {
// Local cursor over the input bytes `s`, with `i` as the read position.
252 impl<'a> Scanner<'a> {
253 fn new(s: &'a [u8]) -> Self {
// True once the read position has reached the end of the input.
256 fn is_empty(&self) -> bool {
257 self.i >= self.s.len()
// Reads the byte at the cursor, or fails with "string ended early" when the
// input is exhausted.
259 fn next(&mut self) -> Result<u8, InvalidString> {
260 if self.i >= self.s.len() {
261 return Err("string ended early".into());
263 let r = self.s[self.i];
// Collects bytes from the cursor while `f` holds, looking no further than
// `size` bytes ahead or the end of the input, and returns them as a slice.
267 fn take_if(&mut self, f: impl Fn(u8) -> bool, size: usize) -> &'a [u8] {
268 let l = usize::min(size + self.i, self.s.len());
270 while c < l && f(self.s[c]) {
273 let r = &self.s[self.i..c];
// Looks at the byte under the cursor, if any.
277 fn peek(&self) -> Option<u8> {
278 if self.i < self.s.len() {
291 let mut s = Scanner::new(s.as_bytes());
292 while !s.is_empty() {
// Taken for any byte that is not a backslash, and for a backslash that is
// the very last byte of the input.
294 if c != b'\\' || s.is_empty() {
// `\a` and `\b` are recognised for long strings only.
299 b'a' if literal_kind == LiteralKind::LiteralLongString => output.push(b'\x07'),
300 b'b' if literal_kind == LiteralKind::LiteralLongString => output.push(b'\x08'),
// An escaped single quote is emitted with its backslash kept.
301 b'\'' => output.extend_from_slice(b"\\\'"),
// For long strings `\n` and `\r` emit nothing; every other kind emits the
// control byte.
302 b'n' => match literal_kind {
303 LiteralKind::LiteralLongString => {}
304 _ => output.push(b'\n'),
306 b'r' => match literal_kind {
307 LiteralKind::LiteralLongString => {}
308 _ => output.push(b'\r'),
310 b't' => output.push(b'\t'),
311 b'v' => output.push(b'\x0b'),
312 b'e' => output.push(b'\x1b'),
313 b'f' => output.push(b'\x0c'),
314 b'\\' => output.push(b'\\'),
// `\?` is long-string only; `\$` is recognised for every kind except long strings.
315 b'?' if literal_kind == LiteralKind::LiteralLongString => output.push(b'\x3f'),
316 b'$' if literal_kind != LiteralKind::LiteralLongString => output.push(b'$'),
317 b'\"' => match literal_kind {
318 LiteralKind::LiteralDoubleQuote | LiteralKind::LiteralLongString => {
// For the remaining kinds the backslash is kept in front of the quote.
321 _ => output.extend_from_slice(b"\\\""),
// `\u{...}` (not for long strings): the payload is read up to, but not
// including, `}` — at most 6 bytes — and parsed as hexadecimal.
323 b'u' if literal_kind != LiteralKind::LiteralLongString
324 && s.peek() == Some(b'{') =>
327 let unicode = s.take_if(|c| c != b'}', 6);
328 let n = parse_int(unicode, 16)?;
329 codepoint_to_utf8(n, output)?;
332 return Err("Invalid UTF-8 escape sequence".into());
// Hex escape: at most two hex digits are consumed.
336 let hex = s.take_if(is_hex, 2);
// `false`: values above 255 are rejected rather than trimmed.
341 let c = parse_numeric_escape(false, hex, 16)?;
// NOTE(review): `c` is already a `u8`, so the `as u8` cast is a no-op.
342 output.push(c as u8);
// Octal escape: at most three octal digits are consumed. `true` skips the
// "> 255" rejection, so larger values are presumably narrowed to one byte.
347 let oct = s.take_if(is_oct, 3);
348 let c = parse_numeric_escape(true, oct, 8)?;
349 output.push(c as u8);
/// Runs `unescape_literal` for `literal_kind` into a freshly allocated `Vec`
/// whose initial capacity is the length of the input.
361 fn unescape_literal_into_string(
362 literal_kind: LiteralKind,
364 ) -> Result<BString, InvalidString> {
365 let mut output = Vec::with_capacity(s.len());
366 unescape_literal(literal_kind, s, &mut output)?;
/// Arena-allocating variant of `unescape_literal_into_string`: the output
/// buffer lives in `arena` and is returned as a `&BStr` borrowed from it.
370 fn unescape_literal_into_arena<'a>(
371 literal_kind: LiteralKind,
374 ) -> Result<&'a BStr, InvalidString> {
375 let mut output = bumpalo::collections::Vec::with_capacity_in(s.len(), arena);
376 unescape_literal(literal_kind, s, &mut output)?;
// Hands the arena-owned bytes out as a slice and wraps it as `&BStr`.
377 Ok(output.into_bump_slice().into())
/// Unescapes `s` using the double-quoted literal rules
/// (`LiteralKind::LiteralDoubleQuote`), returning an owned byte string.
380 pub fn unescape_double(s: &str) -> Result<BString, InvalidString> {
381 unescape_literal_into_string(LiteralKind::LiteralDoubleQuote, s)
/// Unescapes `s` using the heredoc rules (`LiteralKind::LiteralHeredoc`),
/// returning an owned byte string.
384 pub fn unescape_heredoc(s: &str) -> Result<BString, InvalidString> {
385 unescape_literal_into_string(LiteralKind::LiteralHeredoc, s)
/// Like `unescape_double`, but the result is allocated in `arena`.
388 pub fn unescape_double_in<'a>(s: &str, arena: &'a Bump) -> Result<&'a BStr, InvalidString> {
389 unescape_literal_into_arena(LiteralKind::LiteralDoubleQuote, s, arena)
/// Like `unescape_heredoc`, but the result is allocated in `arena`.
392 pub fn unescape_heredoc_in<'a>(s: &str, arena: &'a Bump) -> Result<&'a BStr, InvalidString> {
393 unescape_literal_into_arena(LiteralKind::LiteralHeredoc, s, arena)
396 /// Copies `s` into `output`, replacing escape sequences with the characters
397 /// they represent. The bytes added to `output` will be valid UTF-8.
///
/// `is_nowdoc` switches escape handling off entirely. Otherwise a backslash
/// with nothing after it is reported as "string ended early".
398 fn unescape_single_or_nowdoc(
401 output: &mut impl GrowableBytes,
402 ) -> Result<(), InvalidString> {
403 let s = s.as_bytes();
// Nowdoc input takes this branch for every byte; single-quoted input takes
// it for every byte except a backslash.
408 if is_nowdoc || c != b'\\' {
413 return Err("string ended early".into());
// Only `\'` and `\\` are escapes: the escaped byte is kept without its
// leading backslash.
417 b'\'' | b'\\' => output.push(c),
418 // unrecognized escapes are just copied over
/// Runs `unescape_single_or_nowdoc` into a fresh `Vec` (capacity: the input
/// length) and returns the bytes as a `String`.
430 fn unescape_single_or_nowdoc_into_string(
433 ) -> Result<String, InvalidString> {
434 let mut output = Vec::with_capacity(s.len());
435 unescape_single_or_nowdoc(is_nowdoc, s, &mut output)?;
436 // Safety: s is a valid &str, and unescape_single_or_nowdoc copies it into
437 // output, only adding and removing valid UTF-8 codepoints.
438 Ok(unsafe { String::from_utf8_unchecked(output) })
/// Arena-allocating variant of `unescape_single_or_nowdoc_into_string`: the
/// output buffer lives in `arena` and is returned as a `&str` borrowed from it.
441 fn unescape_single_or_nowdoc_into_arena<'a>(
445 ) -> Result<&'a str, InvalidString> {
446 let mut output = bumpalo::collections::Vec::with_capacity_in(s.len(), arena);
447 unescape_single_or_nowdoc(is_nowdoc, s, &mut output)?;
448 // Safety: s is a valid &str, and unescape_single_or_nowdoc copies it into
449 // output, only adding and removing valid UTF-8 codepoints.
450 let string = unsafe { bumpalo::collections::String::from_utf8_unchecked(output) };
451 Ok(string.into_bump_str())
/// Unescapes the contents of a single-quoted literal (`is_nowdoc = false`).
454 pub fn unescape_single(s: &str) -> Result<String, InvalidString> {
455 unescape_single_or_nowdoc_into_string(false, s)
/// Processes the contents of a nowdoc literal (`is_nowdoc = true`), in which
/// backslashes are not treated as escapes.
458 pub fn unescape_nowdoc(s: &str) -> Result<String, InvalidString> {
459 unescape_single_or_nowdoc_into_string(true, s)
/// Like `unescape_single`, but the result is allocated in `arena`.
462 pub fn unescape_single_in<'a>(s: &str, arena: &'a Bump) -> Result<&'a str, InvalidString> {
463 unescape_single_or_nowdoc_into_arena(false, s, arena)
/// Like `unescape_nowdoc`, but the result is allocated in `arena`.
466 pub fn unescape_nowdoc_in<'a>(s: &str, arena: &'a Bump) -> Result<&'a str, InvalidString> {
467 unescape_single_or_nowdoc_into_arena(true, s, arena)
/// Unescapes `s` using the long-string rules
/// (`LiteralKind::LiteralLongString`), returning an owned byte string.
470 pub fn unescape_long_string(s: &str) -> Result<BString, InvalidString> {
471 unescape_literal_into_string(LiteralKind::LiteralLongString, s)
/// Like `unescape_long_string`, but the result is allocated in `arena`.
474 pub fn unescape_long_string_in<'a>(s: &str, arena: &'a Bump) -> Result<&'a BStr, InvalidString> {
475 unescape_literal_into_arena(LiteralKind::LiteralLongString, s, arena)
/// Takes the `len` bytes starting at byte offset `start`, strips surrounding
/// quotes or heredoc delimiters with `unquote_str`, and returns an owned copy.
///
/// Errors when the range is out of bounds or does not fall on character
/// boundaries.
///
/// NOTE(review): `start + len` uses plain `+`, so a very large `len` can
/// overflow (a panic in debug builds) before `get` gets the chance to return
/// `None`. `checked_add` would fold that case into the same error.
478 pub fn extract_unquoted_string(
482 ) -> Result<String, InvalidString> {
484 .get(start..start + len)
485 .ok_or_else(|| InvalidString::from("out of bounds or sliced at non-codepoint-boundary"))?;
486 Ok(unquote_str(substr).into())
489 /// Remove single quotes, double quotes, backticks, or heredoc/nowdoc delimiters
490 /// surrounding a string literal.
///
/// The work is done on the raw bytes by `unquote_slice`; the result is a
/// sub-slice of `content`, so nothing is allocated.
491 pub fn unquote_str(content: &str) -> &str {
492 let unquoted = unquote_slice(content.as_bytes());
493 // Safety: content is a valid &str. unquote_slice finds ASCII delimiters and
494 // removes the prefix and suffix surrounding them. Because it uses ASCII
495 // delimiters, we know it is slicing at codepoint boundaries.
496 unsafe { std::str::from_utf8_unchecked(unquoted) }
/// Scans `s` from the front looking for `needle`.
// NOTE(review): the loop body is not visible here; presumably this returns the
// index of the first match, i.e. the same as `s.iter().position(|&c| c == needle)`.
499 fn find(s: &[u8], needle: u8) -> Option<usize> {
500 for (i, &c) in s.iter().enumerate() {
/// Returns an index of `needle` in `s`, or `None` when it is absent.
// NOTE(review): going by the name this is presumably the last occurrence
// (`s.iter().rposition(..)`); the body is not visible here — confirm.
508 fn rfind(s: &[u8], needle: u8) -> Option<usize> {
519 /// Remove single quotes, double quotes, backticks, or heredoc/nowdoc delimiters
520 /// surrounding a string literal. If the input slice is valid UTF-8, the output
521 /// slice will also be valid UTF-8.
522 pub fn unquote_slice(content: &[u8]) -> &[u8] {
// Fewer than two bytes cannot hold both an opening and a closing delimiter.
523 if content.len() < 2 {
525 } else if content.starts_with(b"<<<") {
527 // These types of strings begin with an opening line containing <<<
528 // followed by a string to use as a terminator (which is optionally
529 // quoted), and end with a line containing only the terminator.
530 // We need to drop the opening line and terminator line.
// The first newline ends the opening line; the last newline starts the
// terminator line.
531 match (find(content, b'\n'), rfind(content, b'\n')) {
532 (Some(start), Some(end)) => {
533 // An empty heredoc, this way, will have start >= end
// Keep only what lies strictly between those two newlines.
537 &content[start + 1..end]
// Quote case: strip one byte from each end only when the first and last
// bytes are the same quote character (`'`, `"` or a backtick).
544 let c2 = content[content.len() - 1];
545 if c1 == c2 && (c1 == b'\'' || c1 == b'"' || c1 == b'`') {
546 &content[1..content.len() - 1]
557 use pretty_assertions::assert_eq; // make assert_eq print huge diffs more human-readable
// NOTE(review): despite its name this test also exercises the double-quote,
// heredoc and long-string unescapers.
560 fn unescape_single_or_nowdoc() {
// Empty input yields empty output for every flavour.
561 assert_eq!(unescape_single("").unwrap(), "");
562 assert_eq!(unescape_nowdoc("").unwrap(), "");
563 assert_eq!(unescape_long_string("").unwrap(), "");
564 assert_eq!(unescape_double("").unwrap(), "");
565 assert_eq!(unescape_heredoc("").unwrap(), "");
// Single-quoted strings only unescape `\\` and `\'`; nowdoc leaves input untouched.
568 unescape_single("home \\\\$").unwrap(),
569 "home \\$".to_string()
571 assert_eq!(unescape_nowdoc("home \\$").unwrap(), "home \\$".to_string());
572 assert_eq!(unescape_single("home \\'").unwrap(), "home '".to_string());
573 assert_eq!(unescape_nowdoc("home \\'").unwrap(), "home \\'".to_string());
574 assert_eq!(unescape_nowdoc("\\`").unwrap(), "\\`");
575 assert_eq!(unescape_single("\\a\\\'").unwrap(), "\\a'");
// Long-string escapes.
576 assert_eq!(unescape_long_string("\\a").unwrap(), "\x07");
577 assert_eq!(unescape_long_string("\\v").unwrap(), "\x0b");
578 assert_eq!(unescape_long_string("\\\'").unwrap(), "\\\'");
579 assert_eq!(unescape_long_string("\\\\").unwrap(), "\\");
580 assert_eq!(unescape_long_string("?").unwrap(), "\x3f");
581 assert_eq!(unescape_long_string("$").unwrap(), "$");
583 assert_eq!(unescape_long_string("\\b").unwrap(), "\x08");
584 assert_eq!(unescape_long_string("\\e").unwrap(), "\x1b");
585 assert_eq!(unescape_long_string("\\f").unwrap(), "\x0c");
586 assert_eq!(unescape_long_string("\\\"").unwrap(), "\"");
587 assert_eq!(unescape_long_string("\\`").unwrap(), "\\`");
588 assert_eq!(unescape_heredoc("\\\"").unwrap(), "\\\"");
589 assert_eq!(unescape_heredoc("\\p").unwrap(), "\\p");
// `\r` produces no output in a long string.
590 assert_eq!(unescape_long_string("\\r").unwrap(), "");
// Unicode, hex (either case of `x`) and octal escapes.
591 assert_eq!(unescape_double("\\u{b1}").unwrap(), "±");
593 assert_eq!(unescape_double("\\x27\\x22").unwrap(), "\'\"");
594 assert_eq!(unescape_double("\\X27\\X22").unwrap(), "\'\"");
596 unescape_double("\\141\\156\\143\\150\\157\\162").unwrap(),
599 assert_eq!(unescape_long_string("\\xb1").unwrap(), B(&[177u8]));
601 let euro = "\u{20AC}"; // as bytes [226, 130, 172]
603 unescape_long_string(euro).unwrap(),
604 B(&[226u8, 130u8, 172u8])
// NOTE(review): the next two checks repeat the `\xb1` and euro assertions
// made just above.
606 assert_eq!(unescape_long_string("\\xb1").unwrap(), B(&[177u8]));
608 let euro = "\u{20AC}"; // as bytes [226, 130, 172]
610 unescape_long_string(euro).unwrap(),
611 B(&[226u8, 130u8, 172u8])
// Surrogate code points written as `\u{...}` are encoded as given, so the
// expected bytes here are not valid UTF-8.
614 let invalid = r#"\u{D800}\u{DF1E}"#;
616 unescape_double(invalid).unwrap(),
617 B(&[237u8, 160u8, 128u8, 237u8, 188u8, 158u8])
// Covers decimal, octal and hexadecimal input plus rejection of a non-digit.
622 fn parse_int_test() {
623 assert_eq!(parse_int(b"2", 10).unwrap(), 2);
624 assert!(parse_int(b"h", 10).is_err());
625 assert_eq!(parse_int(b"12", 8).unwrap(), 10);
626 assert_eq!(parse_int(b"b1", 16).unwrap(), 177)
// Checks `escape_char`, `escape` and `is_oct`.
630 fn escape_char_test() {
// Helper: renders the escaped form of `c`, falling back to the byte itself
// when `escape_char` returns `None`.
631 let escape_char_ = |c: u8| -> String {
632 let r = escape_char(c)
633 .unwrap_or_else(|| vec![c].into())
635 unsafe { String::from_utf8_unchecked(r) }
638 assert_eq!(escape_char_(b'a'), "a");
// `$` is expected to come out unchanged.
639 assert_eq!(escape_char_(b'$'), "$");
640 assert_eq!(escape_char_(b'\"'), "\\\"");
// Non-printable bytes become a three-digit octal escape.
641 assert_eq!(escape_char_(0), "\\000");
642 assert_eq!(escape("house"), "house");
643 assert_eq!(escape("\n"), "\\n");
644 assert_eq!(escape("red\n\t\r$?"), "red\\n\\t\\r$?");
645 assert!(is_oct(b'5'));
646 assert!(!is_oct(b'a'));
// Whole-input ranges: matching quote pairs are stripped, unmatched or absent
// quotes are left alone, and heredoc delimiters are removed.
650 fn extract_unquoted_string_test() {
651 assert_eq!(extract_unquoted_string("'a'", 0, 3).unwrap(), "a");
652 assert_eq!(extract_unquoted_string("\"a\"", 0, 3).unwrap(), "a");
653 assert_eq!(extract_unquoted_string("`a`", 0, 3).unwrap(), "a");
654 assert_eq!(extract_unquoted_string("", 0, 0).unwrap(), "");
655 assert_eq!(extract_unquoted_string("''", 0, 2).unwrap(), "");
656 assert_eq!(extract_unquoted_string("'a", 0, 2).unwrap(), "'a");
657 assert_eq!(extract_unquoted_string("a", 0, 1).unwrap(), "a");
658 assert_eq!(extract_unquoted_string("<<<EOT\n\nEOT", 0, 11).unwrap(), "");
660 extract_unquoted_string("<<<EOT\na\nEOT", 0, 12).unwrap(),
// `rfind`: empty input, a single matching byte, no match, and a match at the end.
667 assert_eq!(rfind(b"", b'a'), None);
668 assert_eq!(rfind(b"a", b'a'), Some(0));
669 assert_eq!(rfind(b"b", b'a'), None);
670 assert_eq!(rfind(b"ba", b'a'), Some(1));
674 fn unquote_str_test() {
// Empty input and empty quoted strings.
675 assert_eq!(unquote_str(""), "");
676 assert_eq!(unquote_str("''"), "");
677 assert_eq!(unquote_str("\"\""), "");
678 assert_eq!(unquote_str("``"), "");
// Matching quote pairs are stripped; inner escapes are left untouched.
680 assert_eq!(unquote_str("'a'"), "a");
681 assert_eq!(unquote_str("\"a\""), "a");
682 assert_eq!(unquote_str("`a`"), "a");
683 assert_eq!(unquote_str(r#"`a\``"#), r#"a\`"#);
// Heredocs: the opening line and the terminator line are dropped.
685 assert_eq!(unquote_str("<<<EOT\nEOT"), "");
686 assert_eq!(unquote_str("<<<EOT\n\nEOT"), "");
687 assert_eq!(unquote_str("<<<EOT\n\n\nEOT"), "\n");
688 assert_eq!(unquote_str("<<<EOT\na\nEOT"), "a");
689 assert_eq!(unquote_str("<<<EOT\n\na\n\nEOT"), "\na\n");
// A lone quote character is returned unchanged.
691 assert_eq!(unquote_str("'"), "'");
692 assert_eq!(unquote_str("\""), "\"");
693 assert_eq!(unquote_str("`"), "`");
// Unquoted, unbalanced or mismatched input is returned unchanged.
695 assert_eq!(unquote_str("a"), "a");
696 assert_eq!(unquote_str("`a"), "`a");
697 assert_eq!(unquote_str(" `a`"), " `a`");
698 assert_eq!(unquote_str("'a\""), "'a\"");
// `<<<` input without any newline is returned unchanged.
700 assert_eq!(unquote_str("<<<"), "<<<");
701 assert_eq!(unquote_str("<<<EOTEOT"), "<<<EOTEOT");