Fix escaping of single quotation in nowdocs
[hiphop-php.git] / hphp / hack / src / utils / php_escaping.ml
blob364d6422175e2f58129891f37a03aeb0e159c0f3
1 (*
2 * Copyright (c) 2015, Facebook, Inc.
3 * All rights reserved.
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the "hack" directory of this source tree. An additional grant
7 * of patent rights can be found in the PATENTS file in the same directory.
9 *)
(* String escaping and unescaping for PHP/Hack string literals.
 * See http://php.net/manual/en/language.types.string.php *)
(* Raised by the functions of this module when a string literal or escape
 * sequence cannot be decoded. The payload is a human-readable reason. *)
exception Invalid_string of string
(* True for the printable ASCII characters, i.e. space through tilde. *)
let is_printable = function
  | ' ' .. '~' -> true
  | _ -> false
(* True for printable characters that can appear verbatim in an escaped
 * string: everything [is_printable] accepts except the backslash and the
 * double quote. *)
let is_lit_printable = function
  | '\\' | '"' -> false
  | c -> is_printable c
(* True when [c] is a hexadecimal digit, in either case. *)
let is_hex = function
  | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' -> true
  | _ -> false
(* True when [c] is an octal digit. *)
let is_oct = function
  | '0' .. '7' -> true
  | _ -> false
(* Escape a single character using a format meant to be understood by both
 * the assembler and php serialization. The two probably do not share the
 * exact same rules, but this output should safely fit in both.
 *
 * Newline, carriage return, tab, backslash, double quote and question mark
 * get a backslash escape. Every other printable character, including the
 * dollar sign, is emitted unchanged. Anything else becomes a backslash
 * followed by its three-digit octal code. *)
let escape_char c =
  match c with
  | '\n' -> "\\n"
  | '\r' -> "\\r"
  | '\t' -> "\\t"
  | '\\' -> "\\\\"
  | '"' -> "\\\""
  | '?' -> "\\?"
  (* listed explicitly to make clear that the dollar sign is NOT escaped *)
  | '$' -> "$"
  | _ ->
    if is_lit_printable c then String.make 1 c
    else Printf.sprintf "\\%03o" (Char.code c)
(* Escape a whole string: [f] is applied to each character of [s] in order
 * and the resulting fragments are concatenated. [f] defaults to
 * [escape_char]. *)
let escape ?(f = escape_char) s =
  let n = String.length s in
  let out = Buffer.create n in
  for i = 0 to n - 1 do
    Buffer.add_string out (f s.[i])
  done;
  Buffer.contents out
(* Convert the codepoint [n] to UTF-8, appending the bytes to [buf].
 *
 * Codepoints up to 0x7F take one byte, up to 0x7FF two, up to 0xFFFF three
 * and up to 0x10FFFF four.
 *
 * Raises [Invalid_string] when [n] is negative or above 0x10FFFF. Nothing is
 * appended to [buf] in that case. *)
let codepoint_to_utf8 n buf =
  let add i = Buffer.add_char buf (Char.chr i) in
  (* Continuation byte: the marker bits 10 followed by the six bits of [n]
   * that start at bit [shift]. *)
  let add_cont shift = add (0x80 lor ((n lsr shift) land 0x3F)) in
  if n < 0 then
    (* Without this check [Char.chr] would fail with [Invalid_argument]. *)
    raise (Invalid_string "UTF-8 codepoint must not be negative")
  else if n <= 0x7F then
    add n
  else if n <= 0x7FF then begin
    add (0xC0 lor (n lsr 6));
    add_cont 0
  end else if n <= 0x00FFFF then begin
    add (0xE0 lor (n lsr 12));
    add_cont 6;
    add_cont 0
  end else if n <= 0x10FFFF then begin
    add (0xF0 lor (n lsr 18));
    add_cont 12;
    add_cont 6;
    add_cont 0
  end else
    raise (Invalid_string "UTF-8 codepoint too large")
(* Parse [s] with [int_of_string], so the usual 0x / 0o / 0b prefixes are
 * understood.
 *
 * Raises [Invalid_string] when [s] is not a valid integer literal. Only the
 * [Failure] raised by [int_of_string] is translated; unrelated exceptions
 * are left to propagate. *)
let parse_int s =
  try int_of_string s
  with Failure _ -> raise (Invalid_string "invalid numeric escape")
(* Parse the integer literal [s] and return the character with that code.
 *
 * When [trim_to_byte] is true (default false) only the low 8 bits of the
 * parsed value are kept, so any value maps to some character.
 *
 * Raises [Invalid_string]:
 *  - with the message produced by [parse_int] when [s] is not a number;
 *  - with the message below when the value does not fit in a character. *)
let parse_numeric_escape ?(trim_to_byte = false) s =
  (* Parsed outside the handler so that a parse failure keeps its own, more
   * accurate, message. *)
  let v = parse_int s in
  let v = if trim_to_byte then v land 0xFF else v in
  try Char.chr v
  with Invalid_argument _ ->
    raise (Invalid_string "escaped character too large")
(* The kinds of literal [unescape_literal] can decode. The kind decides
 * which quote character may be backslash-escaped inside the literal. *)
type literal_kind =
  (* neither the double quote nor the backtick is unescaped *)
  | Literal_heredoc
  (* a backslash-escaped double quote is unescaped *)
  | Literal_double_quote
  (* a backslash-escaped backtick is unescaped *)
  | Literal_backtick
(* Decode the backslash escapes found in the body [s] of a double-quoted
 * string, heredoc or backtick literal, as selected by [literal_kind].
 *
 * Recognized after a backslash:
 *  - n r t v e f, backslash and dollar: the corresponding single character;
 *  - double quote: unescaped only for [Literal_double_quote];
 *  - backtick: unescaped only for [Literal_backtick];
 *  - u{H..H}: 1 to 6 hex digits, emitted as the UTF-8 encoding of that
 *    codepoint;
 *  - x followed by 1 or 2 hex digits: the byte with that value;
 *  - 1 to 3 octal digits: the byte with that value, truncated to 8 bits.
 * Any other sequence (a backslash followed by a single quote, for instance)
 * is copied through unchanged, backslash included. A backslash that is the
 * very last character of [s] is copied through as well.
 *
 * Raises [Invalid_string] when a u{...} escape is empty, contains anything
 * but hex digits, is not closed, or names a codepoint above 0x10FFFF. *)
let unescape_literal literal_kind s =
  let len = String.length s in
  let buf = Buffer.create len in
  let idx = ref 0 in
  (* Consume and return the character under the cursor. *)
  let next () =
    if !idx >= len then raise (Invalid_string "string ended early")
    else begin
      let c = s.[!idx] in
      incr idx;
      c
    end
  in
  (* Count how many consecutive characters satisfy [f], starting at the
   * cursor. Never looks past the end of [s], never returns more than [max]
   * and does not move the cursor. *)
  let rec count_f f ~max i =
    if i >= max || !idx + i >= len || not (f s.[!idx + i]) then i
    else count_f f ~max (i + 1)
  in
  while !idx < len do
    let c = next () in
    (* A backslash in last position cannot start an escape; keep it as is
     * instead of failing. A heredoc body may legitimately end that way. *)
    if c <> '\\' || !idx = len then Buffer.add_char buf c
    else begin
      match next () with
      | 'n' -> Buffer.add_char buf '\n'
      | 'r' -> Buffer.add_char buf '\r'
      | 't' -> Buffer.add_char buf '\t'
      | 'v' -> Buffer.add_char buf '\x0b'
      | 'e' -> Buffer.add_char buf '\x1b'
      | 'f' -> Buffer.add_char buf '\x0c'
      | '\\' -> Buffer.add_char buf '\\'
      | '$' -> Buffer.add_char buf '$'
      | '`' ->
        if literal_kind = Literal_backtick
        then Buffer.add_char buf '`'
        else Buffer.add_string buf "\\`"
      | '"' ->
        if literal_kind = Literal_double_quote
        then Buffer.add_char buf '"'
        else Buffer.add_string buf "\\\""
      | 'u' when !idx < len && s.[!idx] = '{' ->
        (* skip the opening brace checked by the guard *)
        incr idx;
        (* Only hex digits are counted: [int_of_string] would otherwise
         * accept underscore separators between digits. *)
        let digit_count = count_f is_hex ~max:6 0 in
        let n = parse_int ("0x" ^ String.sub s !idx digit_count) in
        codepoint_to_utf8 n buf;
        idx := !idx + digit_count;
        if next () <> '}' then
          raise (Invalid_string "Invalid UTF-8 escape sequence")
      | 'x' ->
        let hex_count = count_f is_hex ~max:2 0 in
        if hex_count = 0 then
          (* no digits: not an escape, copy it through *)
          Buffer.add_string buf "\\x"
        else begin
          let hex = String.sub s !idx hex_count in
          Buffer.add_char buf (parse_numeric_escape ("0x" ^ hex));
          idx := !idx + hex_count
        end
      | c when is_oct c ->
        (* [c] is itself the first octal digit: step back so it is counted *)
        decr idx;
        let oct_count = count_f is_oct ~max:3 0 in
        let oct = String.sub s !idx oct_count in
        Buffer.add_char buf
          (parse_numeric_escape ~trim_to_byte:true ("0o" ^ oct));
        idx := !idx + oct_count
      (* unrecognized escapes are just copied over *)
      | c ->
        Buffer.add_char buf '\\';
        Buffer.add_char buf c
    end
  done;
  Buffer.contents buf
(* Decode the body of a double-quoted string literal. *)
let unescape_double = unescape_literal Literal_double_quote
(* Decode the body of a backtick literal. *)
let unescape_backtick = unescape_literal Literal_backtick
(* Decode the body of a heredoc literal. *)
let unescape_heredoc = unescape_literal Literal_heredoc
(* Decode the body [s] of a single-quoted string or of a nowdoc.
 *
 * With [is_nowdoc] set, the text is fully literal: every character,
 * backslashes included, is copied through unchanged, so this never raises.
 *
 * Otherwise a backslash followed by a single quote yields the quote, a
 * doubled backslash yields one backslash, and any other backslash sequence
 * is copied through unchanged, backslash included.
 *
 * Raises [Invalid_string] when a single-quoted body ends with a backslash
 * that has nothing to escape. *)
let unescape_single_or_nowdoc ~is_nowdoc s =
  let len = String.length s in
  let buf = Buffer.create len in
  let idx = ref 0 in
  (* Consume and return the character under the cursor. *)
  let next () =
    if !idx >= len then raise (Invalid_string "string ended early")
    else begin
      let c = s.[!idx] in
      incr idx;
      c
    end
  in
  while !idx < len do
    let c = next () in
    (* In a nowdoc a backslash is an ordinary character. Not pairing it with
     * its successor keeps a trailing backslash from being reported as a
     * truncated escape. *)
    if is_nowdoc || c <> '\\' then Buffer.add_char buf c
    else begin
      match next () with
      | '\'' -> Buffer.add_char buf '\''
      | '\\' -> Buffer.add_char buf '\\'
      (* unrecognized escapes are just copied over *)
      | c ->
        Buffer.add_char buf '\\';
        Buffer.add_char buf c
    end
  done;
  Buffer.contents buf
(* Decode the body of a single-quoted string literal. *)
let unescape_single = unescape_single_or_nowdoc ~is_nowdoc:false
(* Decode the body of a nowdoc literal. *)
let unescape_nowdoc = unescape_single_or_nowdoc ~is_nowdoc:true