Driver/PrintPreprocessedOutput.cpp

   1 //===--- PrintPreprocessedOutput.cpp - Implement the -E mode --------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This code simply runs the preprocessor on the input file and prints out the
  11 // result.  This is the traditional behavior of the -E option.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "clang.h"
  16 #include "clang/Lex/PPCallbacks.h"
  17 #include "clang/Lex/Preprocessor.h"
  18 #include "clang/Lex/Pragma.h"
  19 #include "clang/Basic/SourceManager.h"
  20 #include "clang/Basic/Diagnostic.h"
  21 #include "llvm/ADT/SmallString.h"
  22 #include "llvm/ADT/StringExtras.h"
  23 #include "llvm/System/Path.h"
  24 #include "llvm/Support/CommandLine.h"
  25 #include "llvm/Config/config.h"
  26 #include "llvm/Support/raw_ostream.h"
  27 #include <cstdio>
  28 using namespace clang;
  29
  30 //===----------------------------------------------------------------------===//
  31 // Preprocessed token printer
  32 //===----------------------------------------------------------------------===//
  33
  34 static llvm::cl::opt<bool>
  35 DisableLineMarkers("P", llvm::cl::desc("Disable linemarker output in -E mode"));
  36 static llvm::cl::opt<bool>
  37 EnableCommentOutput("C", llvm::cl::desc("Enable comment output in -E mode"));
  38 static llvm::cl::opt<bool>
  39 EnableMacroCommentOutput("CC",
  40                          llvm::cl::desc("Enable comment output in -E mode, "
  41                                         "even from macro expansions"));
  42
  43 namespace {
  44 class PrintPPOutputPPCallbacks : public PPCallbacks {
  45   Preprocessor &PP;
  46 public:
  47   llvm::raw_ostream &OS;
  48 private:
  49   unsigned CurLine;
  50   bool EmittedTokensOnThisLine;
  51   SrcMgr::CharacteristicKind FileType;
  52   llvm::SmallString<512> CurFilename;
  53   bool Initialized;
  54 public:
  55   PrintPPOutputPPCallbacks(Preprocessor &pp, llvm::raw_ostream &os)
  56      : PP(pp), OS(os) {
  57     CurLine = 0;
  58     CurFilename += "<uninit>";
  59     EmittedTokensOnThisLine = false;
  60     FileType = SrcMgr::C_User;
  61     Initialized = false;
  62   }
  63
  64   void SetEmittedTokensOnThisLine() { EmittedTokensOnThisLine = true; }
  65   bool hasEmittedTokensOnThisLine() const { return EmittedTokensOnThisLine; }
  66
  67   virtual void FileChanged(SourceLocation Loc, FileChangeReason Reason,
  68                            SrcMgr::CharacteristicKind FileType);
  69   virtual void Ident(SourceLocation Loc, const std::string &str);
  70
  71
  72   bool HandleFirstTokOnLine(Token &Tok);
  73   bool MoveToLine(SourceLocation Loc);
  74   bool AvoidConcat(const Token &PrevTok, const Token &Tok);
  75   void WriteLineInfo(unsigned LineNo, const char *Extra=0, unsigned ExtraLen=0);
  76 };
  77 }  // end anonymous namespace
  78
  79 void PrintPPOutputPPCallbacks::WriteLineInfo(unsigned LineNo,
  80                                              const char *Extra,
  81                                              unsigned ExtraLen) {
  82   if (EmittedTokensOnThisLine) {
  83     OS << '\n';
  84     EmittedTokensOnThisLine = false;
  85   }
  86
  87   OS << '#' << ' ' << LineNo << ' ' << '"';
  88   OS.write(&CurFilename[0], CurFilename.size());
  89   OS << '"';
  90
  91   if (ExtraLen)
  92     OS.write(Extra, ExtraLen);
  93
  94   if (FileType == SrcMgr::C_System)
  95     OS.write(" 3", 2);
  96   else if (FileType == SrcMgr::C_ExternCSystem)
  97     OS.write(" 3 4", 4);
  98   OS << '\n';
  99 }
 100
 101 /// MoveToLine - Move the output to the source line specified by the location
 102 /// object.  We can do this by emitting some number of \n's, or be emitting a
 103 /// #line directive.  This returns false if already at the specified line, true
 104 /// if some newlines were emitted.
 105 bool PrintPPOutputPPCallbacks::MoveToLine(SourceLocation Loc) {
 106   unsigned LineNo = PP.getSourceManager().getLogicalLineNumber(Loc);
 107
 108   if (DisableLineMarkers) {
 109     if (LineNo == CurLine) return false;
 110
 111     CurLine = LineNo;
 112
 113     if (!EmittedTokensOnThisLine)
 114       return true;
 115
 116     OS << '\n';
 117     EmittedTokensOnThisLine = false;
 118     return true;
 119   }
 120
 121   // If this line is "close enough" to the original line, just print newlines,
 122   // otherwise print a #line directive.
 123   if (LineNo-CurLine <= 8) {
 124     if (LineNo-CurLine == 1)
 125       OS << '\n';
 126     else if (LineNo == CurLine)
 127       return false;    // Phys line moved, but logical line didn't.
 128     else {
 129       const char *NewLines = "\n\n\n\n\n\n\n\n";
 130       OS.write(NewLines, LineNo-CurLine);
 131     }
 132   } else {
 133     WriteLineInfo(LineNo, 0, 0);
 134   }
 135
 136   CurLine = LineNo;
 137   return true;
 138 }
 139
 140
 141 /// FileChanged - Whenever the preprocessor enters or exits a #include file
 142 /// it invokes this handler.  Update our conception of the current source
 143 /// position.
 144 void PrintPPOutputPPCallbacks::FileChanged(SourceLocation Loc,
 145                                            FileChangeReason Reason,
 146                                        SrcMgr::CharacteristicKind NewFileType) {
 147   // Unless we are exiting a #include, make sure to skip ahead to the line the
 148   // #include directive was at.
 149   SourceManager &SourceMgr = PP.getSourceManager();
 150   if (Reason == PPCallbacks::EnterFile) {
 151     MoveToLine(SourceMgr.getIncludeLoc(Loc));
 152   } else if (Reason == PPCallbacks::SystemHeaderPragma) {
 153     MoveToLine(Loc);
 154
 155     // TODO GCC emits the # directive for this directive on the line AFTER the
 156     // directive and emits a bunch of spaces that aren't needed.  Emulate this
 157     // strange behavior.
 158   }
 159
 160   Loc = SourceMgr.getLogicalLoc(Loc);
 161   CurLine = SourceMgr.getLineNumber(Loc);
 162
 163   if (DisableLineMarkers) return;
 164
 165   CurFilename.clear();
 166   CurFilename += SourceMgr.getSourceName(Loc);
 167   Lexer::Stringify(CurFilename);
 168   FileType = NewFileType;
 169
 170   if (!Initialized) {
 171     WriteLineInfo(CurLine);
 172     Initialized = true;
 173   }
 174
 175   switch (Reason) {
 176   case PPCallbacks::EnterFile:
 177     WriteLineInfo(CurLine, " 1", 2);
 178     break;
 179   case PPCallbacks::ExitFile:
 180     WriteLineInfo(CurLine, " 2", 2);
 181     break;
 182   case PPCallbacks::SystemHeaderPragma:
 183   case PPCallbacks::RenameFile:
 184     WriteLineInfo(CurLine);
 185     break;
 186   }
 187 }
 188
 189 /// HandleIdent - Handle #ident directives when read by the preprocessor.
 190 ///
 191 void PrintPPOutputPPCallbacks::Ident(SourceLocation Loc, const std::string &S) {
 192   MoveToLine(Loc);
 193
 194   OS.write("#ident ", strlen("#ident "));
 195   OS.write(&S[0], S.size());
 196   EmittedTokensOnThisLine = true;
 197 }
 198
 199 /// HandleFirstTokOnLine - When emitting a preprocessed file in -E mode, this
 200 /// is called for the first token on each new line.  If this really is the start
 201 /// of a new logical line, handle it and return true, otherwise return false.
 202 /// This may not be the start of a logical line because the "start of line"
 203 /// marker is set for physical lines, not logical ones.
 204 bool PrintPPOutputPPCallbacks::HandleFirstTokOnLine(Token &Tok) {
 205   // Figure out what line we went to and insert the appropriate number of
 206   // newline characters.
 207   if (!MoveToLine(Tok.getLocation()))
 208     return false;
 209
 210   // Print out space characters so that the first token on a line is
 211   // indented for easy reading.
 212   const SourceManager &SourceMgr = PP.getSourceManager();
 213   unsigned ColNo = SourceMgr.getLogicalColumnNumber(Tok.getLocation());
 214
 215   // This hack prevents stuff like:
 216   // #define HASH #
 217   // HASH define foo bar
 218   // From having the # character end up at column 1, which makes it so it
 219   // is not handled as a #define next time through the preprocessor if in
 220   // -fpreprocessed mode.
 221   if (ColNo <= 1 && Tok.is(tok::hash))
 222     OS << ' ';
 223
 224   // Otherwise, indent the appropriate number of spaces.
 225   for (; ColNo > 1; --ColNo)
 226     OS << ' ';
 227
 228   return true;
 229 }
 230
 231 namespace {
 232 struct UnknownPragmaHandler : public PragmaHandler {
 233   const char *Prefix;
 234   PrintPPOutputPPCallbacks *Callbacks;
 235
 236   UnknownPragmaHandler(const char *prefix, PrintPPOutputPPCallbacks *callbacks)
 237     : PragmaHandler(0), Prefix(prefix), Callbacks(callbacks) {}
 238   virtual void HandlePragma(Preprocessor &PP, Token &PragmaTok) {
 239     // Figure out what line we went to and insert the appropriate number of
 240     // newline characters.
 241     Callbacks->MoveToLine(PragmaTok.getLocation());
 242     Callbacks->OS.write(Prefix, strlen(Prefix));
 243
 244     // Read and print all of the pragma tokens.
 245     while (PragmaTok.isNot(tok::eom)) {
 246       if (PragmaTok.hasLeadingSpace())
 247         Callbacks->OS << ' ';
 248       std::string TokSpell = PP.getSpelling(PragmaTok);
 249       Callbacks->OS.write(&TokSpell[0], TokSpell.size());
 250       PP.LexUnexpandedToken(PragmaTok);
 251     }
 252     Callbacks->OS << '\n';
 253   }
 254 };
 255 } // end anonymous namespace
 256
 257
 258 enum AvoidConcatInfo {
 259   /// By default, a token never needs to avoid concatenation.  Most tokens (e.g.
 260   /// ',', ')', etc) don't cause a problem when concatenated.
 261   aci_never_avoid_concat = 0,
 262
 263   /// aci_custom_firstchar - AvoidConcat contains custom code to handle this
 264   /// token's requirements, and it needs to know the first character of the
 265   /// token.
 266   aci_custom_firstchar = 1,
 267
 268   /// aci_custom - AvoidConcat contains custom code to handle this token's
 269   /// requirements, but it doesn't need to know the first character of the
 270   /// token.
 271   aci_custom = 2,
 272
 273   /// aci_avoid_equal - Many tokens cannot be safely followed by an '='
 274   /// character.  For example, "<<" turns into "<<=" when followed by an =.
 275   aci_avoid_equal = 4
 276 };
 277
 278 /// This array contains information for each token on what action to take when
 279 /// avoiding concatenation of tokens in the AvoidConcat method.
 280 static char TokenInfo[tok::NUM_TOKENS];
 281
 282 /// InitAvoidConcatTokenInfo - Tokens that must avoid concatenation should be
 283 /// marked by this function.
 284 static void InitAvoidConcatTokenInfo() {
 285   // These tokens have custom code in AvoidConcat.
 286   TokenInfo[tok::identifier      ] |= aci_custom;
 287   TokenInfo[tok::numeric_constant] |= aci_custom_firstchar;
 288   TokenInfo[tok::period          ] |= aci_custom_firstchar;
 289   TokenInfo[tok::amp             ] |= aci_custom_firstchar;
 290   TokenInfo[tok::plus            ] |= aci_custom_firstchar;
 291   TokenInfo[tok::minus           ] |= aci_custom_firstchar;
 292   TokenInfo[tok::slash           ] |= aci_custom_firstchar;
 293   TokenInfo[tok::less            ] |= aci_custom_firstchar;
 294   TokenInfo[tok::greater         ] |= aci_custom_firstchar;
 295   TokenInfo[tok::pipe            ] |= aci_custom_firstchar;
 296   TokenInfo[tok::percent         ] |= aci_custom_firstchar;
 297   TokenInfo[tok::colon           ] |= aci_custom_firstchar;
 298   TokenInfo[tok::hash            ] |= aci_custom_firstchar;
 299   TokenInfo[tok::arrow           ] |= aci_custom_firstchar;
 300
 301   // These tokens change behavior if followed by an '='.
 302   TokenInfo[tok::amp         ] |= aci_avoid_equal;           // &=
 303   TokenInfo[tok::plus        ] |= aci_avoid_equal;           // +=
 304   TokenInfo[tok::minus       ] |= aci_avoid_equal;           // -=
 305   TokenInfo[tok::slash       ] |= aci_avoid_equal;           // /=
 306   TokenInfo[tok::less        ] |= aci_avoid_equal;           // <=
 307   TokenInfo[tok::greater     ] |= aci_avoid_equal;           // >=
 308   TokenInfo[tok::pipe        ] |= aci_avoid_equal;           // |=
 309   TokenInfo[tok::percent     ] |= aci_avoid_equal;           // %=
 310   TokenInfo[tok::star        ] |= aci_avoid_equal;           // *=
 311   TokenInfo[tok::exclaim     ] |= aci_avoid_equal;           // !=
 312   TokenInfo[tok::lessless    ] |= aci_avoid_equal;           // <<=
 313   TokenInfo[tok::greaterequal] |= aci_avoid_equal;           // >>=
 314   TokenInfo[tok::caret       ] |= aci_avoid_equal;           // ^=
 315   TokenInfo[tok::equal       ] |= aci_avoid_equal;           // ==
 316 }
 317
 318 /// StartsWithL - Return true if the spelling of this token starts with 'L'.
 319 static bool StartsWithL(const Token &Tok, Preprocessor &PP) {
 320   if (!Tok.needsCleaning()) {
 321     SourceManager &SrcMgr = PP.getSourceManager();
 322     return *SrcMgr.getCharacterData(SrcMgr.getPhysicalLoc(Tok.getLocation()))
 323                == 'L';
 324   }
 325
 326   if (Tok.getLength() < 256) {
 327     char Buffer[256];
 328     const char *TokPtr = Buffer;
 329     PP.getSpelling(Tok, TokPtr);
 330     return TokPtr[0] == 'L';
 331   }
 332
 333   return PP.getSpelling(Tok)[0] == 'L';
 334 }
 335
 336 /// IsIdentifierL - Return true if the spelling of this token is literally 'L'.
 337 static bool IsIdentifierL(const Token &Tok, Preprocessor &PP) {
 338   if (!Tok.needsCleaning()) {
 339     if (Tok.getLength() != 1)
 340       return false;
 341     SourceManager &SrcMgr = PP.getSourceManager();
 342     return *SrcMgr.getCharacterData(SrcMgr.getPhysicalLoc(Tok.getLocation()))
 343                == 'L';
 344   }
 345
 346   if (Tok.getLength() < 256) {
 347     char Buffer[256];
 348     const char *TokPtr = Buffer;
 349     if (PP.getSpelling(Tok, TokPtr) != 1)
 350       return false;
 351     return TokPtr[0] == 'L';
 352   }
 353
 354   return PP.getSpelling(Tok) == "L";
 355 }
 356
 357
 358 /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause
 359 /// the two individual tokens to be lexed as a single token, return true (which
 360 /// causes a space to be printed between them).  This allows the output of -E
 361 /// mode to be lexed to the same token stream as lexing the input directly
 362 /// would.
 363 ///
 364 /// This code must conservatively return true if it doesn't want to be 100%
 365 /// accurate.  This will cause the output to include extra space characters, but
 366 /// the resulting output won't have incorrect concatenations going on.  Examples
 367 /// include "..", which we print with a space between, because we don't want to
 368 /// track enough to tell "x.." from "...".
 369 bool PrintPPOutputPPCallbacks::AvoidConcat(const Token &PrevTok,
 370                                            const Token &Tok) {
 371   char Buffer[256];
 372
 373   tok::TokenKind PrevKind = PrevTok.getKind();
 374   if (PrevTok.getIdentifierInfo())  // Language keyword or named operator.
 375     PrevKind = tok::identifier;
 376
 377   // Look up information on when we should avoid concatenation with prevtok.
 378   unsigned ConcatInfo = TokenInfo[PrevKind];
 379
 380   // If prevtok never causes a problem for anything after it, return quickly.
 381   if (ConcatInfo == 0) return false;
 382
 383   if (ConcatInfo & aci_avoid_equal) {
 384     // If the next token is '=' or '==', avoid concatenation.
 385     if (Tok.is(tok::equal) || Tok.is(tok::equalequal))
 386       return true;
 387     ConcatInfo &= ~aci_avoid_equal;
 388   }
 389
 390   if (ConcatInfo == 0) return false;
 391
 392
 393
 394   // Basic algorithm: we look at the first character of the second token, and
 395   // determine whether it, if appended to the first token, would form (or would
 396   // contribute) to a larger token if concatenated.
 397   char FirstChar = 0;
 398   if (ConcatInfo & aci_custom) {
 399     // If the token does not need to know the first character, don't get it.
 400   } else if (IdentifierInfo *II = Tok.getIdentifierInfo()) {
 401     // Avoid spelling identifiers, the most common form of token.
 402     FirstChar = II->getName()[0];
 403   } else if (!Tok.needsCleaning()) {
 404     SourceManager &SrcMgr = PP.getSourceManager();
 405     FirstChar =
 406       *SrcMgr.getCharacterData(SrcMgr.getPhysicalLoc(Tok.getLocation()));
 407   } else if (Tok.getLength() < 256) {
 408     const char *TokPtr = Buffer;
 409     PP.getSpelling(Tok, TokPtr);
 410     FirstChar = TokPtr[0];
 411   } else {
 412     FirstChar = PP.getSpelling(Tok)[0];
 413   }
 414
 415   switch (PrevKind) {
 416   default: assert(0 && "InitAvoidConcatTokenInfo built wrong");
 417   case tok::identifier:   // id+id or id+number or id+L"foo".
 418     if (Tok.is(tok::numeric_constant) || Tok.getIdentifierInfo() ||
 419         Tok.is(tok::wide_string_literal) /* ||
 420         Tok.is(tok::wide_char_literal)*/)
 421       return true;
 422
 423     // If this isn't identifier + string, we're done.
 424     if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
 425       return false;
 426
 427     // FIXME: need a wide_char_constant!
 428
 429     // If the string was a wide string L"foo" or wide char L'f', it would concat
 430     // with the previous identifier into fooL"bar".  Avoid this.
 431     if (StartsWithL(Tok, PP))
 432       return true;
 433
 434     // Otherwise, this is a narrow character or string.  If the *identifier* is
 435     // a literal 'L', avoid pasting L "foo" -> L"foo".
 436     return IsIdentifierL(PrevTok, PP);
 437   case tok::numeric_constant:
 438     return isalnum(FirstChar) || Tok.is(tok::numeric_constant) ||
 439            FirstChar == '+' || FirstChar == '-' || FirstChar == '.';
 440   case tok::period:          // ..., .*, .1234
 441     return FirstChar == '.' || FirstChar == '*' || isdigit(FirstChar);
 442   case tok::amp:             // &&
 443     return FirstChar == '&';
 444   case tok::plus:            // ++
 445     return FirstChar == '+';
 446   case tok::minus:           // --, ->, ->*
 447     return FirstChar == '-' || FirstChar == '>';
 448   case tok::slash:           //, /*, //
 449     return FirstChar == '*' || FirstChar == '/';
 450   case tok::less:            // <<, <<=, <:, <%
 451     return FirstChar == '<' || FirstChar == ':' || FirstChar == '%';
 452   case tok::greater:         // >>, >>=
 453     return FirstChar == '>';
 454   case tok::pipe:            // ||
 455     return FirstChar == '|';
 456   case tok::percent:         // %>, %:
 457     return FirstChar == '>' || FirstChar == ':';
 458   case tok::colon:           // ::, :>
 459     return FirstChar == ':' || FirstChar == '>';
 460   case tok::hash:            // ##, #@, %:%:
 461     return FirstChar == '#' || FirstChar == '@' || FirstChar == '%';
 462   case tok::arrow:           // ->*
 463     return FirstChar == '*';
 464   }
 465 }
 466
 467 /// DoPrintPreprocessedInput - This implements -E mode.
 468 ///
 469 void clang::DoPrintPreprocessedInput(Preprocessor &PP,
 470                                      const std::string &OutFile) {
 471   // Inform the preprocessor whether we want it to retain comments or not, due
 472   // to -C or -CC.
 473   PP.SetCommentRetentionState(EnableCommentOutput, EnableMacroCommentOutput);
 474   InitAvoidConcatTokenInfo();
 475
 476
 477   // Open the output buffer.
 478   std::string Err;
 479   llvm::raw_fd_ostream OS(OutFile.empty() ? "-" : OutFile.c_str(), Err);
 480   if (!Err.empty()) {
 481     fprintf(stderr, "%s\n", Err.c_str());
 482     exit(1);
 483   }
 484
 485   OS.SetBufferSize(64*1024);
 486
 487
 488   Token Tok, PrevTok;
 489   char Buffer[256];
 490   PrintPPOutputPPCallbacks *Callbacks = new PrintPPOutputPPCallbacks(PP, OS);
 491   PP.setPPCallbacks(Callbacks);
 492
 493   PP.AddPragmaHandler(0, new UnknownPragmaHandler("#pragma", Callbacks));
 494   PP.AddPragmaHandler("GCC", new UnknownPragmaHandler("#pragma GCC",Callbacks));
 495
 496   // After we have configured the preprocessor, enter the main file.
 497
 498   // Start parsing the specified input file.
 499   PP.EnterMainSourceFile();
 500
 501   // Consume all of the tokens that come from the predefines buffer.  Those
 502   // should not be emitted into the output and are guaranteed to be at the
 503   // start.
 504   const SourceManager &SourceMgr = PP.getSourceManager();
 505   do PP.Lex(Tok);
 506   while (Tok.isNot(tok::eof) && Tok.getLocation().isFileID() &&
 507          !strcmp(SourceMgr.getSourceName(Tok.getLocation()), "<predefines>"));
 508
 509   while (1) {
 510
 511     // If this token is at the start of a line, emit newlines if needed.
 512     if (Tok.isAtStartOfLine() && Callbacks->HandleFirstTokOnLine(Tok)) {
 513       // done.
 514     } else if (Tok.hasLeadingSpace() ||
 515                // If we haven't emitted a token on this line yet, PrevTok isn't
 516                // useful to look at and no concatenation could happen anyway.
 517                (Callbacks->hasEmittedTokensOnThisLine() &&
 518                 // Don't print "-" next to "-", it would form "--".
 519                 Callbacks->AvoidConcat(PrevTok, Tok))) {
 520       OS << ' ';
 521     }
 522
 523     if (IdentifierInfo *II = Tok.getIdentifierInfo()) {
 524       const char *Str = II->getName();
 525       unsigned Len = Tok.needsCleaning() ? strlen(Str) : Tok.getLength();
 526       OS.write(Str, Len);
 527     } else if (Tok.getLength() < 256) {
 528       const char *TokPtr = Buffer;
 529       unsigned Len = PP.getSpelling(Tok, TokPtr);
 530       OS.write(TokPtr, Len);
 531     } else {
 532       std::string S = PP.getSpelling(Tok);
 533       OS.write(&S[0], S.size());
 534     }
 535     Callbacks->SetEmittedTokensOnThisLine();
 536
 537     if (Tok.is(tok::eof)) break;
 538
 539     PrevTok = Tok;
 540     PP.Lex(Tok);
 541   }
 542   OS << '\n';
 543
 544   // Flush the ostream.
 545   OS.flush();
 546
 547   // If an error occurred, remove the output file.
 548   if (PP.getDiagnostics().hasErrorOccurred() && !OutFile.empty())
 549     llvm::sys::Path(OutFile).eraseFromDisk();
 550 }
 551