hphp/hack/src/utils/php_escaping.ml

   1 (*
   2  * Copyright (c) 2015, Facebook, Inc.
   3  * All rights reserved.
   4  *
   5  * This source code is licensed under the BSD-style license found in the
   6  * LICENSE file in the "hack" directory of this source tree. An additional grant
   7  * of patent rights can be found in the PATENTS file in the same directory.
   8  *
   9  *)
  10
  11 (* Implementation of string escaping stuff. Ugggggggh.
  12  * See http://php.net/manual/en/language.types.string.php *)
  13
(* Raised by the escaping helpers in this file when the text being processed
 * cannot be handled: the input ends in the middle of an escape sequence, a
 * numeric escape is malformed, or the value it denotes is out of range. The
 * payload is a short human-readable description of the problem. *)
exception Invalid_string of string;;
  15
  16 let is_printable c = c >= ' ' && c <= '~'
  17 let is_lit_printable c = is_printable c && c <> '\\' && c <> '\"'
  18
  19 let is_hex c = (c >= '0' && c <= '9') ||
  20                (c >= 'a' && c <= 'f') ||
  21                (c >= 'A' && c <= 'F')
  22 let is_oct c = c >= '0' && c <= '7'
  23
  24 (* This escapes a string using the format understood by the assembler
  25  * and php serialization. The assembler and php serialization probably
  26  * don't actually have the same rules but this should safely fit in both.
  27  * It will escape $ in octal so that it can also be used as a PHP double
  28  * string. *)
  29
  30 let escape_char = function
  31   | '\n' -> "\\n"
  32   | '\r' -> "\\r"
  33   | '\t' -> "\\t"
  34   | '\\' -> "\\\\"
  35   | '"' -> "\\\""
  36   | '$' -> "$"
  37   | '?' -> "\\?"
  38   | c when is_lit_printable c -> String.make 1 c
  39   | c -> Printf.sprintf "\\%03o" (Char.code c)
  40
  41 let escape ?(f = escape_char) s =
  42   let buf = Buffer.create (String.length s) in
  43   String.iter (fun c -> Buffer.add_string buf @@ f c) s;
  44   Buffer.contents buf
  45
  46 (* Convert a codepoint to utf-8, appending the the bytes to a buffer *)
  47 let codepoint_to_utf8 n buf =
  48   let add i = Buffer.add_char buf (Char.chr i) in
  49   if n <= 0x7F then begin
  50     add n
  51   end else if n <= 0x7FF then begin
  52     add (0xC0 lor ((n lsr 6)          ));
  53     add (0x80 lor ((n      ) land 0x3F));
  54   end else if n <= 0x00FFFF then begin
  55     add (0xE0 lor ((n lsr 12)          ));
  56     add (0x80 lor ((n lsr  6) land 0x3F));
  57     add (0x80 lor ((n       ) land 0x3F));
  58   end else if n <= 0x10FFFF then begin
  59     add (0xF0 lor ((n lsr 18)          ));
  60     add (0x80 lor ((n lsr 12) land 0x3F));
  61     add (0x80 lor ((n lsr  6) land 0x3F));
  62     add (0x80 lor ((n       ) land 0x3F));
  63   end else
  64     raise (Invalid_string "UTF-8 codepoint too large")
  65
  66 let parse_int s =
  67   try
  68     int_of_string s
  69   with _ -> raise (Invalid_string "invalid numeric escape")
  70 let parse_numeric_escape ?(trim_to_byte = false) s =
  71   try
  72     let v = parse_int s in
  73     let v =  if trim_to_byte then v land 0xFF else v in
  74     Char.chr v
  75   with _ -> raise (Invalid_string "escaped character too large")
  76
(* The flavour of literal handed to unescape_literal. The only thing it
 * changes is which escaped quote character is turned back into the bare
 * quote; the other escapes behave the same for every kind. *)
type literal_kind =
  (* neither an escaped double quote nor an escaped backtick is unescaped *)
  | Literal_heredoc
  (* an escaped double quote is unescaped *)
  | Literal_double_quote
  (* an escaped backtick is unescaped *)
  | Literal_backtick
  81
  82 let unescape_literal literal_kind s =
  83   let len = String.length s in
  84   let buf = Buffer.create len in
  85   let idx = ref 0 in
  86   let next () =
  87     if !idx >= len then raise (Invalid_string "string ended early") else
  88       let c = s.[!idx] in (incr idx; c)
  89   in
  90
  91   (* Count how many characters, starting at the current string index.
  92    * Will always stop at i=max. *)
  93   let rec count_f f ~max i =
  94     if i >= max || !idx + i >= len || not (f s.[!idx+i]) then i
  95     else count_f f max (i+1)
  96   in
  97
  98   while !idx < len do
  99     let c = next () in
 100     if c <> '\\' then Buffer.add_char buf c else begin
 101       let c = next () in
 102       match c with
 103       | '\'' -> Buffer.add_string buf "\\\'"
 104       | 'n'  -> Buffer.add_char buf '\n'
 105       | 'r'  -> Buffer.add_char buf '\r'
 106       | 't'  -> Buffer.add_char buf '\t'
 107       | 'v'  -> Buffer.add_char buf '\x0b'
 108       | 'e'  -> Buffer.add_char buf '\x1b'
 109       | 'f'  -> Buffer.add_char buf '\x0c'
 110       | '\\' -> Buffer.add_char buf '\\'
 111       | '$'  -> Buffer.add_char buf '$'
 112       | '`' ->
 113         if literal_kind = Literal_backtick
 114         then Buffer.add_char buf '`'
 115         else Buffer.add_string buf "\\`"
 116       | '\"' ->
 117         if literal_kind = Literal_double_quote
 118         then Buffer.add_char buf '\"'
 119         else Buffer.add_string buf "\\\""
 120       | 'u' when !idx < len && s.[!idx] = '{' ->
 121         let _ = next () in
 122         let unicode_count = count_f (fun c -> c <> '}') ~max:6 0 in
 123         let n = parse_int ("0x" ^ String.sub s (!idx) unicode_count) in
 124         codepoint_to_utf8 n buf;
 125         idx := !idx + unicode_count;
 126         if next () <> '}' then
 127           raise (Invalid_string "Invalid UTF-8 escape sequence")
 128       | 'x' ->
 129         let hex_count = count_f is_hex ~max:2 0 in
 130         if hex_count = 0 then
 131           Buffer.add_string buf "\\x"
 132         else
 133           let c = parse_numeric_escape ("0x" ^ String.sub s (!idx) hex_count) in
 134           Buffer.add_char buf c;
 135           idx := !idx + hex_count
 136       | c when is_oct c ->
 137         idx := !idx - 1;
 138         let oct_count = count_f is_oct ~max:3 0 in
 139         let c = parse_numeric_escape
 140           ~trim_to_byte:true ("0o" ^ String.sub s (!idx) oct_count) in
 141         Buffer.add_char buf c;
 142         idx := !idx + oct_count
 143       (* unrecognized escapes are just copied over *)
 144       | c ->
 145         Buffer.add_char buf '\\';
 146         Buffer.add_char buf c
 147     end;
 148
 149   done;
 150
 151   Buffer.contents buf
 152
 153 let unescape_double s =
 154   unescape_literal Literal_double_quote s
 155
 156 let unescape_backtick s =
 157   unescape_literal Literal_backtick s
 158
 159 let unescape_heredoc s =
 160   unescape_literal Literal_heredoc s
 161
 162 let unescape_single_or_nowdoc ~is_nowdoc s =
 163   let len = String.length s in
 164   let buf = Buffer.create len in
 165   let idx = ref 0 in
 166   let next () =
 167     if !idx >= len then raise (Invalid_string "string ended early") else
 168       let c = s.[!idx] in (incr idx; c)
 169   in
 170
 171   while !idx < len do
 172     let c = next () in
 173     if c <> '\\' then Buffer.add_char buf c else begin
 174       let c = next () in
 175       match c with
 176       | '\'' when not is_nowdoc -> Buffer.add_char buf '\''
 177       | '\\' when not is_nowdoc -> Buffer.add_char buf '\\'
 178       (* unrecognized escapes are just copied over *)
 179       | c ->
 180         Buffer.add_char buf '\\';
 181         Buffer.add_char buf c
 182     end;
 183
 184   done;
 185
 186   Buffer.contents buf
 187
 188 let unescape_single s =
 189   unescape_single_or_nowdoc ~is_nowdoc:false s
 190
 191 let unescape_nowdoc s =
 192   unescape_single_or_nowdoc ~is_nowdoc:true s