lib/Lex/LiteralSupport.cpp

   1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the NumericLiteralParser, CharLiteralParser, and
  11 // StringLiteralParser interfaces.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "clang/Lex/LiteralSupport.h"
  16 #include "clang/Lex/Preprocessor.h"
  17 #include "clang/Lex/LexDiagnostic.h"
  18 #include "clang/Basic/TargetInfo.h"
  19 #include "llvm/ADT/StringRef.h"
  20 #include "llvm/ADT/StringExtras.h"
  21 using namespace clang;
  22
  23 /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
  24 /// not valid.
  25 static int HexDigitValue(char C) {
  26   if (C >= '0' && C <= '9') return C-'0';
  27   if (C >= 'a' && C <= 'f') return C-'a'+10;
  28   if (C >= 'A' && C <= 'F') return C-'A'+10;
  29   return -1;
  30 }
  31
  32 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
  33 /// either a character or a string literal.
  34 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
  35                                   const char *ThisTokEnd, bool &HadError,
  36                                   FullSourceLoc Loc, bool IsWide,
  37                                   Diagnostic *Diags, const TargetInfo &Target) {
  38   // Skip the '\' char.
  39   ++ThisTokBuf;
  40
  41   // We know that this character can't be off the end of the buffer, because
  42   // that would have been \", which would not have been the end of string.
  43   unsigned ResultChar = *ThisTokBuf++;
  44   switch (ResultChar) {
  45   // These map to themselves.
  46   case '\\': case '\'': case '"': case '?': break;
  47
  48     // These have fixed mappings.
  49   case 'a':
  50     // TODO: K&R: the meaning of '\\a' is different in traditional C
  51     ResultChar = 7;
  52     break;
  53   case 'b':
  54     ResultChar = 8;
  55     break;
  56   case 'e':
  57     if (Diags)
  58       Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
  59     ResultChar = 27;
  60     break;
  61   case 'E':
  62     if (Diags)
  63       Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
  64     ResultChar = 27;
  65     break;
  66   case 'f':
  67     ResultChar = 12;
  68     break;
  69   case 'n':
  70     ResultChar = 10;
  71     break;
  72   case 'r':
  73     ResultChar = 13;
  74     break;
  75   case 't':
  76     ResultChar = 9;
  77     break;
  78   case 'v':
  79     ResultChar = 11;
  80     break;
  81   case 'x': { // Hex escape.
  82     ResultChar = 0;
  83     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
  84       if (Diags)
  85         Diags->Report(Loc, diag::err_hex_escape_no_digits);
  86       HadError = 1;
  87       break;
  88     }
  89
  90     // Hex escapes are a maximal series of hex digits.
  91     bool Overflow = false;
  92     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
  93       int CharVal = HexDigitValue(ThisTokBuf[0]);
  94       if (CharVal == -1) break;
  95       // About to shift out a digit?
  96       Overflow |= (ResultChar & 0xF0000000) ? true : false;
  97       ResultChar <<= 4;
  98       ResultChar |= CharVal;
  99     }
 100
 101     // See if any bits will be truncated when evaluated as a character.
 102     unsigned CharWidth =
 103       IsWide ? Target.getWCharWidth() : Target.getCharWidth();
 104
 105     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 106       Overflow = true;
 107       ResultChar &= ~0U >> (32-CharWidth);
 108     }
 109
 110     // Check for overflow.
 111     if (Overflow && Diags)   // Too many digits to fit in
 112       Diags->Report(Loc, diag::warn_hex_escape_too_large);
 113     break;
 114   }
 115   case '0': case '1': case '2': case '3':
 116   case '4': case '5': case '6': case '7': {
 117     // Octal escapes.
 118     --ThisTokBuf;
 119     ResultChar = 0;
 120
 121     // Octal escapes are a series of octal digits with maximum length 3.
 122     // "\0123" is a two digit sequence equal to "\012" "3".
 123     unsigned NumDigits = 0;
 124     do {
 125       ResultChar <<= 3;
 126       ResultChar |= *ThisTokBuf++ - '0';
 127       ++NumDigits;
 128     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
 129              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 130
 131     // Check for overflow.  Reject '\777', but not L'\777'.
 132     unsigned CharWidth =
 133       IsWide ? Target.getWCharWidth() : Target.getCharWidth();
 134
 135     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 136       if (Diags)
 137         Diags->Report(Loc, diag::warn_octal_escape_too_large);
 138       ResultChar &= ~0U >> (32-CharWidth);
 139     }
 140     break;
 141   }
 142
 143     // Otherwise, these are not valid escapes.
 144   case '(': case '{': case '[': case '%':
 145     // GCC accepts these as extensions.  We warn about them as such though.
 146     if (Diags)
 147       Diags->Report(Loc, diag::ext_nonstandard_escape)
 148         << std::string()+(char)ResultChar;
 149     break;
 150   default:
 151     if (Diags == 0)
 152       break;
 153
 154     if (isgraph(ResultChar))
 155       Diags->Report(Loc, diag::ext_unknown_escape)
 156         << std::string()+(char)ResultChar;
 157     else
 158       Diags->Report(Loc, diag::ext_unknown_escape)
 159         << "x"+llvm::utohexstr(ResultChar);
 160     break;
 161   }
 162
 163   return ResultChar;
 164 }
 165
 166 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
 167 /// return the UTF32.
 168 static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
 169                              uint32_t &UcnVal, unsigned short &UcnLen,
 170                              FullSourceLoc Loc, Diagnostic *Diags,
 171                              const LangOptions &Features) {
 172   if (!Features.CPlusPlus && !Features.C99 && Diags)
 173     Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
 174
 175   // Save the beginning of the string (for error diagnostics).
 176   const char *ThisTokBegin = ThisTokBuf;
 177
 178   // Skip the '\u' char's.
 179   ThisTokBuf += 2;
 180
 181   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
 182     if (Diags)
 183       Diags->Report(Loc, diag::err_ucn_escape_no_digits);
 184     return false;
 185   }
 186   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
 187   unsigned short UcnLenSave = UcnLen;
 188   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
 189     int CharVal = HexDigitValue(ThisTokBuf[0]);
 190     if (CharVal == -1) break;
 191     UcnVal <<= 4;
 192     UcnVal |= CharVal;
 193   }
 194   // If we didn't consume the proper number of digits, there is a problem.
 195   if (UcnLenSave) {
 196     if (Diags) {
 197       SourceLocation L =
 198         Lexer::AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin,
 199                                        Loc.getManager(), Features);
 200       Diags->Report(FullSourceLoc(L, Loc.getManager()),
 201                     diag::err_ucn_escape_incomplete);
 202     }
 203     return false;
 204   }
 205   // Check UCN constraints (C99 6.4.3p2).
 206   if ((UcnVal < 0xa0 &&
 207       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
 208       || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
 209       || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
 210     if (Diags)
 211       Diags->Report(Loc, diag::err_ucn_escape_invalid);
 212     return false;
 213   }
 214   return true;
 215 }
 216
 217 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
 218 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
 219 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
 220 /// we will likely rework our support for UCN's.
 221 static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
 222                             char *&ResultBuf, bool &HadError,
 223                             FullSourceLoc Loc, bool wide, Diagnostic *Diags,
 224                             const LangOptions &Features) {
 225   typedef uint32_t UTF32;
 226   UTF32 UcnVal = 0;
 227   unsigned short UcnLen = 0;
 228   if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags,
 229                         Features)) {
 230     HadError = 1;
 231     return;
 232   }
 233
 234   if (wide) {
 235     (void)UcnLen;
 236     assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
 237
 238     if (!Features.ShortWChar) {
 239       // Note: our internal rep of wide char tokens is always little-endian.
 240       *ResultBuf++ = (UcnVal & 0x000000FF);
 241       *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
 242       *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
 243       *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
 244       return;
 245     }
 246
 247     // Convert to UTF16.
 248     if (UcnVal < (UTF32)0xFFFF) {
 249       *ResultBuf++ = (UcnVal & 0x000000FF);
 250       *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
 251       return;
 252     }
 253     if (Diags) Diags->Report(Loc, diag::warn_ucn_escape_too_large);
 254
 255     typedef uint16_t UTF16;
 256     UcnVal -= 0x10000;
 257     UTF16 surrogate1 = 0xD800 + (UcnVal >> 10);
 258     UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF);
 259     *ResultBuf++ = (surrogate1 & 0x000000FF);
 260     *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8;
 261     *ResultBuf++ = (surrogate2 & 0x000000FF);
 262     *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
 263     return;
 264   }
 265   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
 266   // The conversion below was inspired by:
 267   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
 268   // First, we determine how many bytes the result will require.
 269   typedef uint8_t UTF8;
 270
 271   unsigned short bytesToWrite = 0;
 272   if (UcnVal < (UTF32)0x80)
 273     bytesToWrite = 1;
 274   else if (UcnVal < (UTF32)0x800)
 275     bytesToWrite = 2;
 276   else if (UcnVal < (UTF32)0x10000)
 277     bytesToWrite = 3;
 278   else
 279     bytesToWrite = 4;
 280
 281   const unsigned byteMask = 0xBF;
 282   const unsigned byteMark = 0x80;
 283
 284   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
 285   // into the first byte, depending on how many bytes follow.
 286   static const UTF8 firstByteMark[5] = {
 287     0x00, 0x00, 0xC0, 0xE0, 0xF0
 288   };
 289   // Finally, we write the bytes into ResultBuf.
 290   ResultBuf += bytesToWrite;
 291   switch (bytesToWrite) { // note: everything falls through.
 292     case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 293     case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 294     case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 295     case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
 296   }
 297   // Update the buffer.
 298   ResultBuf += bytesToWrite;
 299 }
 300
 301
 302 ///       integer-constant: [C99 6.4.4.1]
 303 ///         decimal-constant integer-suffix
 304 ///         octal-constant integer-suffix
 305 ///         hexadecimal-constant integer-suffix
 306 ///       decimal-constant:
 307 ///         nonzero-digit
 308 ///         decimal-constant digit
 309 ///       octal-constant:
 310 ///         0
 311 ///         octal-constant octal-digit
 312 ///       hexadecimal-constant:
 313 ///         hexadecimal-prefix hexadecimal-digit
 314 ///         hexadecimal-constant hexadecimal-digit
 315 ///       hexadecimal-prefix: one of
 316 ///         0x 0X
 317 ///       integer-suffix:
 318 ///         unsigned-suffix [long-suffix]
 319 ///         unsigned-suffix [long-long-suffix]
 320 ///         long-suffix [unsigned-suffix]
 321 ///         long-long-suffix [unsigned-sufix]
 322 ///       nonzero-digit:
 323 ///         1 2 3 4 5 6 7 8 9
 324 ///       octal-digit:
 325 ///         0 1 2 3 4 5 6 7
 326 ///       hexadecimal-digit:
 327 ///         0 1 2 3 4 5 6 7 8 9
 328 ///         a b c d e f
 329 ///         A B C D E F
 330 ///       unsigned-suffix: one of
 331 ///         u U
 332 ///       long-suffix: one of
 333 ///         l L
 334 ///       long-long-suffix: one of
 335 ///         ll LL
 336 ///
 337 ///       floating-constant: [C99 6.4.4.2]
 338 ///         TODO: add rules...
 339 ///
 340 NumericLiteralParser::
 341 NumericLiteralParser(const char *begin, const char *end,
 342                      SourceLocation TokLoc, Preprocessor &pp)
 343   : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
 344
 345   // This routine assumes that the range begin/end matches the regex for integer
 346   // and FP constants (specifically, the 'pp-number' regex), and assumes that
 347   // the byte at "*end" is both valid and not part of the regex.  Because of
 348   // this, it doesn't have to check for 'overscan' in various places.
 349   assert(!isalnum(*end) && *end != '.' && *end != '_' &&
 350          "Lexer didn't maximally munch?");
 351
 352   s = DigitsBegin = begin;
 353   saw_exponent = false;
 354   saw_period = false;
 355   isLong = false;
 356   isUnsigned = false;
 357   isLongLong = false;
 358   isFloat = false;
 359   isImaginary = false;
 360   isMicrosoftInteger = false;
 361   hadError = false;
 362
 363   if (*s == '0') { // parse radix
 364     ParseNumberStartingWithZero(TokLoc);
 365     if (hadError)
 366       return;
 367   } else { // the first digit is non-zero
 368     radix = 10;
 369     s = SkipDigits(s);
 370     if (s == ThisTokEnd) {
 371       // Done.
 372     } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
 373       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 374               diag::err_invalid_decimal_digit) << llvm::StringRef(s, 1);
 375       hadError = true;
 376       return;
 377     } else if (*s == '.') {
 378       s++;
 379       saw_period = true;
 380       s = SkipDigits(s);
 381     }
 382     if ((*s == 'e' || *s == 'E')) { // exponent
 383       const char *Exponent = s;
 384       s++;
 385       saw_exponent = true;
 386       if (*s == '+' || *s == '-')  s++; // sign
 387       const char *first_non_digit = SkipDigits(s);
 388       if (first_non_digit != s) {
 389         s = first_non_digit;
 390       } else {
 391         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
 392                 diag::err_exponent_has_no_digits);
 393         hadError = true;
 394         return;
 395       }
 396     }
 397   }
 398
 399   SuffixBegin = s;
 400
 401   // Parse the suffix.  At this point we can classify whether we have an FP or
 402   // integer constant.
 403   bool isFPConstant = isFloatingLiteral();
 404
 405   // Loop over all of the characters of the suffix.  If we see something bad,
 406   // we break out of the loop.
 407   for (; s != ThisTokEnd; ++s) {
 408     switch (*s) {
 409     case 'f':      // FP Suffix for "float"
 410     case 'F':
 411       if (!isFPConstant) break;  // Error for integer constant.
 412       if (isFloat || isLong) break; // FF, LF invalid.
 413       isFloat = true;
 414       continue;  // Success.
 415     case 'u':
 416     case 'U':
 417       if (isFPConstant) break;  // Error for floating constant.
 418       if (isUnsigned) break;    // Cannot be repeated.
 419       isUnsigned = true;
 420       continue;  // Success.
 421     case 'l':
 422     case 'L':
 423       if (isLong || isLongLong) break;  // Cannot be repeated.
 424       if (isFloat) break;               // LF invalid.
 425
 426       // Check for long long.  The L's need to be adjacent and the same case.
 427       if (s+1 != ThisTokEnd && s[1] == s[0]) {
 428         if (isFPConstant) break;        // long long invalid for floats.
 429         isLongLong = true;
 430         ++s;  // Eat both of them.
 431       } else {
 432         isLong = true;
 433       }
 434       continue;  // Success.
 435     case 'i':
 436     case 'I':
 437       if (PP.getLangOptions().Microsoft) {
 438         if (isFPConstant || isLong || isLongLong) break;
 439
 440         // Allow i8, i16, i32, i64, and i128.
 441         if (s + 1 != ThisTokEnd) {
 442           switch (s[1]) {
 443             case '8':
 444               s += 2; // i8 suffix
 445               isMicrosoftInteger = true;
 446               break;
 447             case '1':
 448               if (s + 2 == ThisTokEnd) break;
 449               if (s[2] == '6') s += 3; // i16 suffix
 450               else if (s[2] == '2') {
 451                 if (s + 3 == ThisTokEnd) break;
 452                 if (s[3] == '8') s += 4; // i128 suffix
 453               }
 454               isMicrosoftInteger = true;
 455               break;
 456             case '3':
 457               if (s + 2 == ThisTokEnd) break;
 458               if (s[2] == '2') s += 3; // i32 suffix
 459               isMicrosoftInteger = true;
 460               break;
 461             case '6':
 462               if (s + 2 == ThisTokEnd) break;
 463               if (s[2] == '4') s += 3; // i64 suffix
 464               isMicrosoftInteger = true;
 465               break;
 466             default:
 467               break;
 468           }
 469           break;
 470         }
 471       }
 472       // fall through.
 473     case 'j':
 474     case 'J':
 475       if (isImaginary) break;   // Cannot be repeated.
 476       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 477               diag::ext_imaginary_constant);
 478       isImaginary = true;
 479       continue;  // Success.
 480     }
 481     // If we reached here, there was an error.
 482     break;
 483   }
 484
 485   // Report an error if there are any.
 486   if (s != ThisTokEnd) {
 487     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 488             isFPConstant ? diag::err_invalid_suffix_float_constant :
 489                            diag::err_invalid_suffix_integer_constant)
 490       << llvm::StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
 491     hadError = true;
 492     return;
 493   }
 494 }
 495
 496 /// ParseNumberStartingWithZero - This method is called when the first character
 497 /// of the number is found to be a zero.  This means it is either an octal
 498 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
 499 /// a floating point number (01239.123e4).  Eat the prefix, determining the
 500 /// radix etc.
 501 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
 502   assert(s[0] == '0' && "Invalid method call");
 503   s++;
 504
 505   // Handle a hex number like 0x1234.
 506   if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
 507     s++;
 508     radix = 16;
 509     DigitsBegin = s;
 510     s = SkipHexDigits(s);
 511     if (s == ThisTokEnd) {
 512       // Done.
 513     } else if (*s == '.') {
 514       s++;
 515       saw_period = true;
 516       s = SkipHexDigits(s);
 517     }
 518     // A binary exponent can appear with or with a '.'. If dotted, the
 519     // binary exponent is required.
 520     if ((*s == 'p' || *s == 'P') && !PP.getLangOptions().CPlusPlus0x) {
 521       const char *Exponent = s;
 522       s++;
 523       saw_exponent = true;
 524       if (*s == '+' || *s == '-')  s++; // sign
 525       const char *first_non_digit = SkipDigits(s);
 526       if (first_non_digit == s) {
 527         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
 528                 diag::err_exponent_has_no_digits);
 529         hadError = true;
 530         return;
 531       }
 532       s = first_non_digit;
 533
 534       // In C++0x, we cannot support hexadecmial floating literals because
 535       // they conflict with user-defined literals, so we warn in previous
 536       // versions of C++ by default.
 537       if (PP.getLangOptions().CPlusPlus)
 538         PP.Diag(TokLoc, diag::ext_hexconstant_cplusplus);
 539       else if (!PP.getLangOptions().HexFloats)
 540         PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
 541     } else if (saw_period) {
 542       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 543               diag::err_hexconstant_requires_exponent);
 544       hadError = true;
 545     }
 546     return;
 547   }
 548
 549   // Handle simple binary numbers 0b01010
 550   if (*s == 'b' || *s == 'B') {
 551     // 0b101010 is a GCC extension.
 552     PP.Diag(TokLoc, diag::ext_binary_literal);
 553     ++s;
 554     radix = 2;
 555     DigitsBegin = s;
 556     s = SkipBinaryDigits(s);
 557     if (s == ThisTokEnd) {
 558       // Done.
 559     } else if (isxdigit(*s)) {
 560       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 561               diag::err_invalid_binary_digit) << llvm::StringRef(s, 1);
 562       hadError = true;
 563     }
 564     // Other suffixes will be diagnosed by the caller.
 565     return;
 566   }
 567
 568   // For now, the radix is set to 8. If we discover that we have a
 569   // floating point constant, the radix will change to 10. Octal floating
 570   // point constants are not permitted (only decimal and hexadecimal).
 571   radix = 8;
 572   DigitsBegin = s;
 573   s = SkipOctalDigits(s);
 574   if (s == ThisTokEnd)
 575     return; // Done, simple octal number like 01234
 576
 577   // If we have some other non-octal digit that *is* a decimal digit, see if
 578   // this is part of a floating point number like 094.123 or 09e1.
 579   if (isdigit(*s)) {
 580     const char *EndDecimal = SkipDigits(s);
 581     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
 582       s = EndDecimal;
 583       radix = 10;
 584     }
 585   }
 586
 587   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
 588   // the code is using an incorrect base.
 589   if (isxdigit(*s) && *s != 'e' && *s != 'E') {
 590     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 591             diag::err_invalid_octal_digit) << llvm::StringRef(s, 1);
 592     hadError = true;
 593     return;
 594   }
 595
 596   if (*s == '.') {
 597     s++;
 598     radix = 10;
 599     saw_period = true;
 600     s = SkipDigits(s); // Skip suffix.
 601   }
 602   if (*s == 'e' || *s == 'E') { // exponent
 603     const char *Exponent = s;
 604     s++;
 605     radix = 10;
 606     saw_exponent = true;
 607     if (*s == '+' || *s == '-')  s++; // sign
 608     const char *first_non_digit = SkipDigits(s);
 609     if (first_non_digit != s) {
 610       s = first_non_digit;
 611     } else {
 612       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
 613               diag::err_exponent_has_no_digits);
 614       hadError = true;
 615       return;
 616     }
 617   }
 618 }
 619
 620
 621 /// GetIntegerValue - Convert this numeric literal value to an APInt that
 622 /// matches Val's input width.  If there is an overflow, set Val to the low bits
 623 /// of the result and return true.  Otherwise, return false.
 624 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
 625   // Fast path: Compute a conservative bound on the maximum number of
 626   // bits per digit in this radix. If we can't possibly overflow a
 627   // uint64 based on that bound then do the simple conversion to
 628   // integer. This avoids the expensive overflow checking below, and
 629   // handles the common cases that matter (small decimal integers and
 630   // hex/octal values which don't overflow).
 631   unsigned MaxBitsPerDigit = 1;
 632   while ((1U << MaxBitsPerDigit) < radix)
 633     MaxBitsPerDigit += 1;
 634   if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
 635     uint64_t N = 0;
 636     for (s = DigitsBegin; s != SuffixBegin; ++s)
 637       N = N*radix + HexDigitValue(*s);
 638
 639     // This will truncate the value to Val's input width. Simply check
 640     // for overflow by comparing.
 641     Val = N;
 642     return Val.getZExtValue() != N;
 643   }
 644
 645   Val = 0;
 646   s = DigitsBegin;
 647
 648   llvm::APInt RadixVal(Val.getBitWidth(), radix);
 649   llvm::APInt CharVal(Val.getBitWidth(), 0);
 650   llvm::APInt OldVal = Val;
 651
 652   bool OverflowOccurred = false;
 653   while (s < SuffixBegin) {
 654     unsigned C = HexDigitValue(*s++);
 655
 656     // If this letter is out of bound for this radix, reject it.
 657     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
 658
 659     CharVal = C;
 660
 661     // Add the digit to the value in the appropriate radix.  If adding in digits
 662     // made the value smaller, then this overflowed.
 663     OldVal = Val;
 664
 665     // Multiply by radix, did overflow occur on the multiply?
 666     Val *= RadixVal;
 667     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
 668
 669     // Add value, did overflow occur on the value?
 670     //   (a + b) ult b  <=> overflow
 671     Val += CharVal;
 672     OverflowOccurred |= Val.ult(CharVal);
 673   }
 674   return OverflowOccurred;
 675 }
 676
 677 llvm::APFloat::opStatus
 678 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
 679   using llvm::APFloat;
 680   using llvm::StringRef;
 681
 682   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
 683   return Result.convertFromString(StringRef(ThisTokBegin, n),
 684                                   APFloat::rmNearestTiesToEven);
 685 }
 686
 687
 688 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
 689                                      SourceLocation Loc, Preprocessor &PP) {
 690   // At this point we know that the character matches the regex "L?'.*'".
 691   HadError = false;
 692
 693   // Determine if this is a wide character.
 694   IsWide = begin[0] == 'L';
 695   if (IsWide) ++begin;
 696
 697   // Skip over the entry quote.
 698   assert(begin[0] == '\'' && "Invalid token lexed");
 699   ++begin;
 700
 701   // FIXME: The "Value" is an uint64_t so we can handle char literals of
 702   // upto 64-bits.
 703   // FIXME: This extensively assumes that 'char' is 8-bits.
 704   assert(PP.getTargetInfo().getCharWidth() == 8 &&
 705          "Assumes char is 8 bits");
 706   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
 707          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
 708          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
 709   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
 710          "Assumes sizeof(wchar) on target is <= 64");
 711
 712   // This is what we will use for overflow detection
 713   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
 714
 715   unsigned NumCharsSoFar = 0;
 716   bool Warned = false;
 717   while (begin[0] != '\'') {
 718     uint64_t ResultChar;
 719
 720       // Is this a Universal Character Name escape?
 721     if (begin[0] != '\\')     // If this is a normal character, consume it.
 722       ResultChar = *begin++;
 723     else {                    // Otherwise, this is an escape character.
 724       // Check for UCN.
 725       if (begin[1] == 'u' || begin[1] == 'U') {
 726         uint32_t utf32 = 0;
 727         unsigned short UcnLen = 0;
 728         if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
 729                               FullSourceLoc(Loc, PP.getSourceManager()),
 730                               &PP.getDiagnostics(), PP.getLangOptions())) {
 731           HadError = 1;
 732         }
 733         ResultChar = utf32;
 734       } else {
 735         // Otherwise, this is a non-UCN escape character.  Process it.
 736         ResultChar = ProcessCharEscape(begin, end, HadError,
 737                                        FullSourceLoc(Loc,PP.getSourceManager()),
 738                                        IsWide,
 739                                        &PP.getDiagnostics(), PP.getTargetInfo());
 740       }
 741     }
 742
 743     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
 744     // implementation defined (C99 6.4.4.4p10).
 745     if (NumCharsSoFar) {
 746       if (IsWide) {
 747         // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
 748         LitVal = 0;
 749       } else {
 750         // Narrow character literals act as though their value is concatenated
 751         // in this implementation, but warn on overflow.
 752         if (LitVal.countLeadingZeros() < 8 && !Warned) {
 753           PP.Diag(Loc, diag::warn_char_constant_too_large);
 754           Warned = true;
 755         }
 756         LitVal <<= 8;
 757       }
 758     }
 759
 760     LitVal = LitVal + ResultChar;
 761     ++NumCharsSoFar;
 762   }
 763
 764   // If this is the second character being processed, do special handling.
 765   if (NumCharsSoFar > 1) {
 766     // Warn about discarding the top bits for multi-char wide-character
 767     // constants (L'abcd').
 768     if (IsWide)
 769       PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
 770     else if (NumCharsSoFar != 4)
 771       PP.Diag(Loc, diag::ext_multichar_character_literal);
 772     else
 773       PP.Diag(Loc, diag::ext_four_char_character_literal);
 774     IsMultiChar = true;
 775   } else
 776     IsMultiChar = false;
 777
 778   // Transfer the value from APInt to uint64_t
 779   Value = LitVal.getZExtValue();
 780
 781   if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
 782     PP.Diag(Loc, diag::warn_ucn_escape_too_large);
 783
 784   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
 785   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
 786   // character constants are not sign extended in the this implementation:
 787   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
 788   if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
 789       PP.getLangOptions().CharIsSigned)
 790     Value = (signed char)Value;
 791 }
 792
 793
 794 ///       string-literal: [C99 6.4.5]
 795 ///          " [s-char-sequence] "
 796 ///         L" [s-char-sequence] "
 797 ///       s-char-sequence:
 798 ///         s-char
 799 ///         s-char-sequence s-char
 800 ///       s-char:
 801 ///         any source character except the double quote ",
 802 ///           backslash \, or newline character
 803 ///         escape-character
 804 ///         universal-character-name
 805 ///       escape-character: [C99 6.4.4.4]
 806 ///         \ escape-code
 807 ///         universal-character-name
 808 ///       escape-code:
 809 ///         character-escape-code
 810 ///         octal-escape-code
 811 ///         hex-escape-code
 812 ///       character-escape-code: one of
 813 ///         n t b r f v a
 814 ///         \ ' " ?
 815 ///       octal-escape-code:
 816 ///         octal-digit
 817 ///         octal-digit octal-digit
 818 ///         octal-digit octal-digit octal-digit
 819 ///       hex-escape-code:
 820 ///         x hex-digit
 821 ///         hex-escape-code hex-digit
 822 ///       universal-character-name:
 823 ///         \u hex-quad
 824 ///         \U hex-quad hex-quad
 825 ///       hex-quad:
 826 ///         hex-digit hex-digit hex-digit hex-digit
 827 ///
 828 StringLiteralParser::
 829 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
 830                     Preprocessor &PP, bool Complain)
 831   : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
 832     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0) {
 833   init(StringToks, NumStringToks);
 834 }
 835
 836 void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 837   // Scan all of the string portions, remember the max individual token length,
 838   // computing a bound on the concatenated string length, and see whether any
 839   // piece is a wide-string.  If any of the string portions is a wide-string
 840   // literal, the result is a wide-string literal [C99 6.4.5p4].
 841   MaxTokenLength = StringToks[0].getLength();
 842   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
 843   AnyWide = StringToks[0].is(tok::wide_string_literal);
 844
 845   hadError = false;
 846
 847   // Implement Translation Phase #6: concatenation of string literals
 848   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
 849   for (unsigned i = 1; i != NumStringToks; ++i) {
 850     // The string could be shorter than this if it needs cleaning, but this is a
 851     // reasonable bound, which is all we need.
 852     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
 853
 854     // Remember maximum string piece length.
 855     if (StringToks[i].getLength() > MaxTokenLength)
 856       MaxTokenLength = StringToks[i].getLength();
 857
 858     // Remember if we see any wide strings.
 859     AnyWide |= StringToks[i].is(tok::wide_string_literal);
 860   }
 861
 862   // Include space for the null terminator.
 863   ++SizeBound;
 864
 865   // TODO: K&R warning: "traditional C rejects string constant concatenation"
 866
 867   // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
 868   // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
 869   wchar_tByteWidth = ~0U;
 870   if (AnyWide) {
 871     wchar_tByteWidth = Target.getWCharWidth();
 872     assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
 873     wchar_tByteWidth /= 8;
 874   }
 875
 876   // The output buffer size needs to be large enough to hold wide characters.
 877   // This is a worst-case assumption which basically corresponds to L"" "long".
 878   if (AnyWide)
 879     SizeBound *= wchar_tByteWidth;
 880
 881   // Size the temporary buffer to hold the result string data.
 882   ResultBuf.resize(SizeBound);
 883
 884   // Likewise, but for each string piece.
 885   llvm::SmallString<512> TokenBuf;
 886   TokenBuf.resize(MaxTokenLength);
 887
 888   // Loop over all the strings, getting their spelling, and expanding them to
 889   // wide strings as appropriate.
 890   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
 891
 892   Pascal = false;
 893
 894   for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
 895     const char *ThisTokBuf = &TokenBuf[0];
 896     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
 897     // that ThisTokBuf points to a buffer that is big enough for the whole token
 898     // and 'spelled' tokens can only shrink.
 899     bool StringInvalid = false;
 900     unsigned ThisTokLen =
 901       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
 902                          &StringInvalid);
 903     if (StringInvalid) {
 904       hadError = 1;
 905       continue;
 906     }
 907
 908     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
 909     bool wide = false;
 910     // TODO: Input character set mapping support.
 911
 912     // Skip L marker for wide strings.
 913     if (ThisTokBuf[0] == 'L') {
 914       wide = true;
 915       ++ThisTokBuf;
 916     }
 917
 918     assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
 919     ++ThisTokBuf;
 920
 921     // Check if this is a pascal string
 922     if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
 923         ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
 924
 925       // If the \p sequence is found in the first token, we have a pascal string
 926       // Otherwise, if we already have a pascal string, ignore the first \p
 927       if (i == 0) {
 928         ++ThisTokBuf;
 929         Pascal = true;
 930       } else if (Pascal)
 931         ThisTokBuf += 2;
 932     }
 933
 934     while (ThisTokBuf != ThisTokEnd) {
 935       // Is this a span of non-escape characters?
 936       if (ThisTokBuf[0] != '\\') {
 937         const char *InStart = ThisTokBuf;
 938         do {
 939           ++ThisTokBuf;
 940         } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
 941
 942         // Copy the character span over.
 943         unsigned Len = ThisTokBuf-InStart;
 944         if (!AnyWide) {
 945           memcpy(ResultPtr, InStart, Len);
 946           ResultPtr += Len;
 947         } else {
 948           // Note: our internal rep of wide char tokens is always little-endian.
 949           for (; Len; --Len, ++InStart) {
 950             *ResultPtr++ = InStart[0];
 951             // Add zeros at the end.
 952             for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
 953               *ResultPtr++ = 0;
 954           }
 955         }
 956         continue;
 957       }
 958       // Is this a Universal Character Name escape?
 959       if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
 960         EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
 961                         hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
 962                         wide, Diags, Features);
 963         continue;
 964       }
 965       // Otherwise, this is a non-UCN escape character.  Process it.
 966       unsigned ResultChar =
 967         ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
 968                           FullSourceLoc(StringToks[i].getLocation(), SM),
 969                           AnyWide, Diags, Target);
 970
 971       // Note: our internal rep of wide char tokens is always little-endian.
 972       *ResultPtr++ = ResultChar & 0xFF;
 973
 974       if (AnyWide) {
 975         for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
 976           *ResultPtr++ = ResultChar >> i*8;
 977       }
 978     }
 979   }
 980
 981   if (Pascal) {
 982     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
 983     if (AnyWide)
 984       ResultBuf[0] /= wchar_tByteWidth;
 985
 986     // Verify that pascal strings aren't too large.
 987     if (GetStringLength() > 256) {
 988       if (Diags)
 989         Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
 990                       diag::err_pascal_string_too_long)
 991           << SourceRange(StringToks[0].getLocation(),
 992                          StringToks[NumStringToks-1].getLocation());
 993       hadError = 1;
 994       return;
 995     }
 996   } else if (Diags) {
 997     // Complain if this string literal has too many characters.
 998     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
 999
1000     if (GetNumStringChars() > MaxChars)
1001       Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
1002                     diag::ext_string_too_long)
1003         << GetNumStringChars() << MaxChars
1004         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1005         << SourceRange(StringToks[0].getLocation(),
1006                        StringToks[NumStringToks-1].getLocation());
1007   }
1008 }
1009
1010
1011 /// getOffsetOfStringByte - This function returns the offset of the
1012 /// specified byte of the string data represented by Token.  This handles
1013 /// advancing over escape sequences in the string.
1014 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1015                                                     unsigned ByteNo) const {
1016   // Get the spelling of the token.
1017   llvm::SmallString<32> SpellingBuffer;
1018   SpellingBuffer.resize(Tok.getLength());
1019
1020   bool StringInvalid = false;
1021   const char *SpellingPtr = &SpellingBuffer[0];
1022   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1023                                        &StringInvalid);
1024   if (StringInvalid)
1025     return 0;
1026
1027   assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
1028
1029
1030   const char *SpellingStart = SpellingPtr;
1031   const char *SpellingEnd = SpellingPtr+TokLen;
1032
1033   // Skip over the leading quote.
1034   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1035   ++SpellingPtr;
1036
1037   // Skip over bytes until we find the offset we're looking for.
1038   while (ByteNo) {
1039     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1040
1041     // Step over non-escapes simply.
1042     if (*SpellingPtr != '\\') {
1043       ++SpellingPtr;
1044       --ByteNo;
1045       continue;
1046     }
1047
1048     // Otherwise, this is an escape character.  Advance over it.
1049     bool HadError = false;
1050     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
1051                       FullSourceLoc(Tok.getLocation(), SM),
1052                       false, Diags, Target);
1053     assert(!HadError && "This method isn't valid on erroneous strings");
1054     --ByteNo;
1055   }
1056
1057   return SpellingPtr-SpellingStart;
1058 }