parsing/lexer.mll

   1 (***********************************************************************)
   2 (*                                                                     *)
   3 (*                           Objective Caml                            *)
   4 (*                                                                     *)
   5 (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
   6 (*                                                                     *)
   7 (*  Copyright 1996 Institut National de Recherche en Informatique et   *)
   8 (*  en Automatique.  All rights reserved.  This file is distributed    *)
   9 (*  under the terms of the Q Public License version 1.0.               *)
  10 (*                                                                     *)
  11 (***********************************************************************)
  12
  13 (* $Id$ *)
  14
  15 (* The lexer definition *)
  16
  17 {
  18 open Lexing
  19 open Misc
  20 open Parser
  21
  22 type error =
  23   | Illegal_character of char
  24   | Illegal_escape of string
  25   | Unterminated_comment
  26   | Unterminated_string
  27   | Unterminated_string_in_comment
  28   | Keyword_as_label of string
  29   | Literal_overflow of string
  30 ;;
  31
  32 exception Error of error * Location.t;;
  33
  34 (* The table of keywords *)
  35
  36 let keyword_table =
  37   create_hashtable 149 [
  38     "and", AND;
  39     "as", AS;
  40     "assert", ASSERT;
  41     "begin", BEGIN;
  42     "class", CLASS;
  43     "constraint", CONSTRAINT;
  44     "do", DO;
  45     "done", DONE;
  46     "downto", DOWNTO;
  47     "else", ELSE;
  48     "end", END;
  49     "exception", EXCEPTION;
  50     "external", EXTERNAL;
  51     "false", FALSE;
  52     "for", FOR;
  53     "fun", FUN;
  54     "function", FUNCTION;
  55     "functor", FUNCTOR;
  56     "if", IF;
  57     "in", IN;
  58     "include", INCLUDE;
  59     "inherit", INHERIT;
  60     "initializer", INITIALIZER;
  61     "lazy", LAZY;
  62     "let", LET;
  63     "match", MATCH;
  64     "method", METHOD;
  65     "module", MODULE;
  66     "mutable", MUTABLE;
  67     "new", NEW;
  68     "object", OBJECT;
  69     "of", OF;
  70     "open", OPEN;
  71     "or", OR;
  72 (*  "parser", PARSER; *)
  73     "private", PRIVATE;
  74     "rec", REC;
  75     "sig", SIG;
  76     "struct", STRUCT;
  77     "then", THEN;
  78     "to", TO;
  79     "true", TRUE;
  80     "try", TRY;
  81     "type", TYPE;
  82     "val", VAL;
  83     "virtual", VIRTUAL;
  84     "when", WHEN;
  85     "while", WHILE;
  86     "with", WITH;
  87
  88     "mod", INFIXOP3("mod");
  89     "land", INFIXOP3("land");
  90     "lor", INFIXOP3("lor");
  91     "lxor", INFIXOP3("lxor");
  92     "lsl", INFIXOP4("lsl");
  93     "lsr", INFIXOP4("lsr");
  94     "asr", INFIXOP4("asr")
  95 ]
  96
  97 (* To buffer string literals *)
  98
  99 let initial_string_buffer = String.create 256
 100 let string_buff = ref initial_string_buffer
 101 let string_index = ref 0
 102
 103 let reset_string_buffer () =
 104   string_buff := initial_string_buffer;
 105   string_index := 0
 106
 107 let store_string_char c =
 108   if !string_index >= String.length (!string_buff) then begin
 109     let new_buff = String.create (String.length (!string_buff) * 2) in
 110       String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff));
 111       string_buff := new_buff
 112   end;
 113   String.unsafe_set (!string_buff) (!string_index) c;
 114   incr string_index
 115
 116 let get_stored_string () =
 117   let s = String.sub (!string_buff) 0 (!string_index) in
 118   string_buff := initial_string_buffer;
 119   s
 120
 121 (* To store the position of the beginning of a string and comment *)
 122 let string_start_loc = ref Location.none;;
 123 let comment_start_loc = ref [];;
 124 let in_comment () = !comment_start_loc <> [];;
 125
 126 (* To translate escape sequences *)
 127
 128 let char_for_backslash = function
 129   | 'n' -> '\010'
 130   | 'r' -> '\013'
 131   | 'b' -> '\008'
 132   | 't' -> '\009'
 133   | c   -> c
 134
 135 let char_for_decimal_code lexbuf i =
 136   let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
 137            10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
 138                 (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in
 139   if (c < 0 || c > 255) && not (in_comment ())
 140   then raise (Error(Illegal_escape (Lexing.lexeme lexbuf),
 141                     Location.curr lexbuf))
 142   else Char.chr c
 143
 144 let char_for_hexadecimal_code lexbuf i =
 145   let d1 = Char.code (Lexing.lexeme_char lexbuf i) in
 146   let val1 = if d1 >= 97 then d1 - 87
 147              else if d1 >= 65 then d1 - 55
 148              else d1 - 48
 149   in
 150   let d2 = Char.code (Lexing.lexeme_char lexbuf (i+1)) in
 151   let val2 = if d2 >= 97 then d2 - 87
 152              else if d2 >= 65 then d2 - 55
 153              else d2 - 48
 154   in
 155   Char.chr (val1 * 16 + val2)
 156
 157 (* Remove underscores from float literals *)
 158
 159 let remove_underscores s =
 160   let l = String.length s in
 161   let rec remove src dst =
 162     if src >= l then
 163       if dst >= l then s else String.sub s 0 dst
 164     else
 165       match s.[src] with
 166         '_' -> remove (src + 1) dst
 167       |  c  -> s.[dst] <- c; remove (src + 1) (dst + 1)
 168   in remove 0 0
 169
 170 (* Update the current location with file name and line number. *)
 171
 172 let update_loc lexbuf file line absolute chars =
 173   let pos = lexbuf.lex_curr_p in
 174   let new_file = match file with
 175                  | None -> pos.pos_fname
 176                  | Some s -> s
 177   in
 178   lexbuf.lex_curr_p <- { pos with
 179     pos_fname = new_file;
 180     pos_lnum = if absolute then line else pos.pos_lnum + line;
 181     pos_bol = pos.pos_cnum - chars;
 182   }
 183 ;;
 184
 185 (* Error report *)
 186
 187 open Format
 188
 189 let report_error ppf = function
 190   | Illegal_character c ->
 191       fprintf ppf "Illegal character (%s)" (Char.escaped c)
 192   | Illegal_escape s ->
 193       fprintf ppf "Illegal backslash escape in string or character (%s)" s
 194   | Unterminated_comment ->
 195       fprintf ppf "Comment not terminated"
 196   | Unterminated_string ->
 197       fprintf ppf "String literal not terminated"
 198   | Unterminated_string_in_comment ->
 199       fprintf ppf "This comment contains an unterminated string literal"
 200   | Keyword_as_label kwd ->
 201       fprintf ppf "`%s' is a keyword, it cannot be used as label name" kwd
 202   | Literal_overflow ty ->
 203       fprintf ppf "Integer literal exceeds the range of representable integers of type %s" ty
 204 ;;
 205
 206 }
 207
 208 let newline = ('\010' | '\013' | "\013\010")
 209 let blank = [' ' '\009' '\012']
 210 let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
 211 let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
 212 let identchar =
 213   ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
 214 let symbolchar =
 215   ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
 216 let decimal_literal =
 217   ['0'-'9'] ['0'-'9' '_']*
 218 let hex_literal =
 219   '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']*
 220 let oct_literal =
 221   '0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
 222 let bin_literal =
 223   '0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
 224 let int_literal =
 225   decimal_literal | hex_literal | oct_literal | bin_literal
 226 let float_literal =
 227   ['0'-'9'] ['0'-'9' '_']*
 228   ('.' ['0'-'9' '_']* )?
 229   (['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)?
 230
 231 rule token = parse
 232   | newline
 233       { update_loc lexbuf None 1 false 0;
 234         token lexbuf
 235       }
 236   | blank +
 237       { token lexbuf }
 238   | "_"
 239       { UNDERSCORE }
 240   | "~"  { TILDE }
 241   | "~" lowercase identchar * ':'
 242       { let s = Lexing.lexeme lexbuf in
 243         let name = String.sub s 1 (String.length s - 2) in
 244         if Hashtbl.mem keyword_table name then
 245           raise (Error(Keyword_as_label name, Location.curr lexbuf));
 246         LABEL name }
 247   | "?"  { QUESTION }
 248   | "??" { QUESTIONQUESTION }
 249   | "?" lowercase identchar * ':'
 250       { let s = Lexing.lexeme lexbuf in
 251         let name = String.sub s 1 (String.length s - 2) in
 252         if Hashtbl.mem keyword_table name then
 253           raise (Error(Keyword_as_label name, Location.curr lexbuf));
 254         OPTLABEL name }
 255   | lowercase identchar *
 256       { let s = Lexing.lexeme lexbuf in
 257           try
 258             Hashtbl.find keyword_table s
 259           with Not_found ->
 260             LIDENT s }
 261   | uppercase identchar *
 262       { UIDENT(Lexing.lexeme lexbuf) }       (* No capitalized keywords *)
 263   | int_literal
 264       { try
 265           INT (int_of_string(Lexing.lexeme lexbuf))
 266         with Failure _ ->
 267           raise (Error(Literal_overflow "int", Location.curr lexbuf))
 268       }
 269   | float_literal
 270       { FLOAT (remove_underscores(Lexing.lexeme lexbuf)) }
 271   | int_literal "l"
 272       { let s = Lexing.lexeme lexbuf in
 273         try
 274           INT32 (Int32.of_string(String.sub s 0 (String.length s - 1)))
 275         with Failure _ ->
 276           raise (Error(Literal_overflow "int32", Location.curr lexbuf)) }
 277   | int_literal "L"
 278       { let s = Lexing.lexeme lexbuf in
 279         try
 280           INT64 (Int64.of_string(String.sub s 0 (String.length s - 1)))
 281         with Failure _ ->
 282           raise (Error(Literal_overflow "int64", Location.curr lexbuf)) }
 283   | int_literal "n"
 284       { let s = Lexing.lexeme lexbuf in
 285         try
 286           NATIVEINT
 287             (Nativeint.of_string(String.sub s 0 (String.length s - 1)))
 288         with Failure _ ->
 289           raise (Error(Literal_overflow "nativeint", Location.curr lexbuf)) }
 290   | "\""
 291       { reset_string_buffer();
 292         let string_start = lexbuf.lex_start_p in
 293         string_start_loc := Location.curr lexbuf;
 294         string lexbuf;
 295         lexbuf.lex_start_p <- string_start;
 296         STRING (get_stored_string()) }
 297   | "'" newline "'"
 298       { update_loc lexbuf None 1 false 1;
 299         CHAR (Lexing.lexeme_char lexbuf 1) }
 300   | "'" [^ '\\' '\'' '\010' '\013'] "'"
 301       { CHAR(Lexing.lexeme_char lexbuf 1) }
 302   | "'\\" ['\\' '\'' '"' 'n' 't' 'b' 'r' ' '] "'"
 303       { CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
 304   | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
 305       { CHAR(char_for_decimal_code lexbuf 2) }
 306   | "'\\" 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "'"
 307       { CHAR(char_for_hexadecimal_code lexbuf 3) }
 308   | "'\\" _
 309       { let l = Lexing.lexeme lexbuf in
 310         let esc = String.sub l 1 (String.length l - 1) in
 311         raise (Error(Illegal_escape esc, Location.curr lexbuf))
 312       }
 313   | "(*"
 314       { comment_start_loc := [Location.curr lexbuf];
 315         comment lexbuf;
 316         token lexbuf }
 317   | "(*)"
 318       { let loc = Location.curr lexbuf in
 319         Location.prerr_warning loc Warnings.Comment_start;
 320         comment_start_loc := [Location.curr lexbuf];
 321         comment lexbuf;
 322         token lexbuf
 323       }
 324   | "*)"
 325       { let loc = Location.curr lexbuf in
 326         Location.prerr_warning loc Warnings.Comment_not_end;
 327         lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1;
 328         let curpos = lexbuf.lex_curr_p in
 329         lexbuf.lex_curr_p <- { curpos with pos_cnum = curpos.pos_cnum - 1 };
 330         STAR
 331       }
 332   | "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']*
 333         ("\"" ([^ '\010' '\013' '"' ] * as name) "\"")?
 334         [^ '\010' '\013'] * newline
 335       { update_loc lexbuf name (int_of_string num) true 0;
 336         token lexbuf
 337       }
 338   | "#"  { SHARP }
 339   | "&"  { AMPERSAND }
 340   | "&&" { AMPERAMPER }
 341   | "`"  { BACKQUOTE }
 342   | "'"  { QUOTE }
 343   | "("  { LPAREN }
 344   | ")"  { RPAREN }
 345   | "*"  { STAR }
 346   | ","  { COMMA }
 347   | "->" { MINUSGREATER }
 348   | "."  { DOT }
 349   | ".." { DOTDOT }
 350   | ":"  { COLON }
 351   | "::" { COLONCOLON }
 352   | ":=" { COLONEQUAL }
 353   | ":>" { COLONGREATER }
 354   | ";"  { SEMI }
 355   | ";;" { SEMISEMI }
 356   | "<"  { LESS }
 357   | "<-" { LESSMINUS }
 358   | "="  { EQUAL }
 359   | "["  { LBRACKET }
 360   | "[|" { LBRACKETBAR }
 361   | "[<" { LBRACKETLESS }
 362   | "[>" { LBRACKETGREATER }
 363   | "]"  { RBRACKET }
 364   | "{"  { LBRACE }
 365   | "{<" { LBRACELESS }
 366   | "|"  { BAR }
 367   | "||" { BARBAR }
 368   | "|]" { BARRBRACKET }
 369   | ">"  { GREATER }
 370   | ">]" { GREATERRBRACKET }
 371   | "}"  { RBRACE }
 372   | ">}" { GREATERRBRACE }
 373
 374   | "!=" { INFIXOP0 "!=" }
 375   | "+"  { PLUS }
 376   | "-"  { MINUS }
 377   | "-." { MINUSDOT }
 378
 379   | "!" symbolchar *
 380             { PREFIXOP(Lexing.lexeme lexbuf) }
 381   | ['~' '?'] symbolchar +
 382             { PREFIXOP(Lexing.lexeme lexbuf) }
 383   | ['=' '<' '>' '|' '&' '$'] symbolchar *
 384             { INFIXOP0(Lexing.lexeme lexbuf) }
 385   | ['@' '^'] symbolchar *
 386             { INFIXOP1(Lexing.lexeme lexbuf) }
 387   | ['+' '-'] symbolchar *
 388             { INFIXOP2(Lexing.lexeme lexbuf) }
 389   | "**" symbolchar *
 390             { INFIXOP4(Lexing.lexeme lexbuf) }
 391   | ['*' '/' '%'] symbolchar *
 392             { INFIXOP3(Lexing.lexeme lexbuf) }
 393   | eof { EOF }
 394   | _
 395       { raise (Error(Illegal_character (Lexing.lexeme_char lexbuf 0),
 396                      Location.curr lexbuf))
 397       }
 398
 399 and comment = parse
 400     "(*"
 401       { comment_start_loc := (Location.curr lexbuf) :: !comment_start_loc;
 402         comment lexbuf;
 403       }
 404   | "*)"
 405       { match !comment_start_loc with
 406         | [] -> assert false
 407         | [x] -> comment_start_loc := [];
 408         | _ :: l -> comment_start_loc := l;
 409                     comment lexbuf;
 410        }
 411   | "\""
 412       { reset_string_buffer();
 413         string_start_loc := Location.curr lexbuf;
 414         begin try string lexbuf
 415         with Error (Unterminated_string, _) ->
 416           match !comment_start_loc with
 417           | [] -> assert false
 418           | loc :: _ -> comment_start_loc := [];
 419                         raise (Error (Unterminated_string_in_comment, loc))
 420         end;
 421         reset_string_buffer ();
 422         comment lexbuf }
 423   | "''"
 424       { comment lexbuf }
 425   | "'" newline "'"
 426       { update_loc lexbuf None 1 false 1;
 427         comment lexbuf
 428       }
 429   | "'" [^ '\\' '\'' '\010' '\013' ] "'"
 430       { comment lexbuf }
 431   | "'\\" ['\\' '"' '\'' 'n' 't' 'b' 'r' ' '] "'"
 432       { comment lexbuf }
 433   | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
 434       { comment lexbuf }
 435   | "'\\" 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "'"
 436       { comment lexbuf }
 437   | eof
 438       { match !comment_start_loc with
 439         | [] -> assert false
 440         | loc :: _ -> comment_start_loc := [];
 441                       raise (Error (Unterminated_comment, loc))
 442       }
 443   | newline
 444       { update_loc lexbuf None 1 false 0;
 445         comment lexbuf
 446       }
 447   | _
 448       { comment lexbuf }
 449
 450 and string = parse
 451     '"'
 452       { () }
 453   | '\\' newline ([' ' '\t'] * as space)
 454       { update_loc lexbuf None 1 false (String.length space);
 455         string lexbuf
 456       }
 457   | '\\' ['\\' '\'' '"' 'n' 't' 'b' 'r' ' ']
 458       { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
 459         string lexbuf }
 460   | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
 461       { store_string_char(char_for_decimal_code lexbuf 1);
 462          string lexbuf }
 463   | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F']
 464       { store_string_char(char_for_hexadecimal_code lexbuf 2);
 465          string lexbuf }
 466   | '\\' _
 467       { if in_comment ()
 468         then string lexbuf
 469         else begin
 470 (*  Should be an error, but we are very lax.
 471           raise (Error (Illegal_escape (Lexing.lexeme lexbuf),
 472                         Location.curr lexbuf))
 473 *)
 474           let loc = Location.curr lexbuf in
 475           Location.prerr_warning loc Warnings.Illegal_backslash;
 476           store_string_char (Lexing.lexeme_char lexbuf 0);
 477           store_string_char (Lexing.lexeme_char lexbuf 1);
 478           string lexbuf
 479         end
 480       }
 481   | newline
 482       { update_loc lexbuf None 1 false 0;
 483         let s = Lexing.lexeme lexbuf in
 484         for i = 0 to String.length s - 1 do
 485           store_string_char s.[i];
 486         done;
 487         string lexbuf
 488       }
 489   | eof
 490       { raise (Error (Unterminated_string, !string_start_loc)) }
 491   | _
 492       { store_string_char(Lexing.lexeme_char lexbuf 0);
 493         string lexbuf }
 494
 495 and skip_sharp_bang = parse
 496   | "#!" [^ '\n']* '\n' [^ '\n']* "\n!#\n"
 497        { update_loc lexbuf None 3 false 0 }
 498   | "#!" [^ '\n']* '\n'
 499        { update_loc lexbuf None 1 false 0 }
 500   | "" { () }