1 //===--- PrintPreprocessedOutput.cpp - Implement the -E mode --------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This code simply runs the preprocessor on the input file and prints out the
11 // result. This is the traditional behavior of the -E option.
13 //===----------------------------------------------------------------------===//
16 #include "clang/Lex/PPCallbacks.h"
17 #include "clang/Lex/Preprocessor.h"
18 #include "clang/Lex/Pragma.h"
19 #include "clang/Basic/SourceManager.h"
20 #include "clang/Basic/Diagnostic.h"
21 #include "llvm/ADT/SmallString.h"
22 #include "llvm/ADT/StringExtras.h"
23 #include "llvm/System/Path.h"
24 #include "llvm/Support/CommandLine.h"
25 #include "llvm/Config/config.h"
26 #include "llvm/Support/raw_ostream.h"
28 using namespace clang
;
30 //===----------------------------------------------------------------------===//
31 // Preprocessed token printer
32 //===----------------------------------------------------------------------===//
34 static llvm::cl::opt
<bool>
35 DisableLineMarkers("P", llvm::cl::desc("Disable linemarker output in -E mode"));
36 static llvm::cl::opt
<bool>
37 EnableCommentOutput("C", llvm::cl::desc("Enable comment output in -E mode"));
38 static llvm::cl::opt
<bool>
39 EnableMacroCommentOutput("CC",
40 llvm::cl::desc("Enable comment output in -E mode, "
41 "even from macro expansions"));
44 class PrintPPOutputPPCallbacks
: public PPCallbacks
{
47 llvm::raw_ostream
&OS
;
50 bool EmittedTokensOnThisLine
;
51 SrcMgr::CharacteristicKind FileType
;
52 llvm::SmallString
<512> CurFilename
;
55 PrintPPOutputPPCallbacks(Preprocessor
&pp
, llvm::raw_ostream
&os
)
58 CurFilename
+= "<uninit>";
59 EmittedTokensOnThisLine
= false;
60 FileType
= SrcMgr::C_User
;
64 void SetEmittedTokensOnThisLine() { EmittedTokensOnThisLine
= true; }
65 bool hasEmittedTokensOnThisLine() const { return EmittedTokensOnThisLine
; }
67 virtual void FileChanged(SourceLocation Loc
, FileChangeReason Reason
,
68 SrcMgr::CharacteristicKind FileType
);
69 virtual void Ident(SourceLocation Loc
, const std::string
&str
);
72 bool HandleFirstTokOnLine(Token
&Tok
);
73 bool MoveToLine(SourceLocation Loc
);
74 bool AvoidConcat(const Token
&PrevTok
, const Token
&Tok
);
75 void WriteLineInfo(unsigned LineNo
, const char *Extra
=0, unsigned ExtraLen
=0);
77 } // end anonymous namespace
79 void PrintPPOutputPPCallbacks::WriteLineInfo(unsigned LineNo
,
82 if (EmittedTokensOnThisLine
) {
84 EmittedTokensOnThisLine
= false;
87 OS
<< '#' << ' ' << LineNo
<< ' ' << '"';
88 OS
.write(&CurFilename
[0], CurFilename
.size());
92 OS
.write(Extra
, ExtraLen
);
94 if (FileType
== SrcMgr::C_System
)
96 else if (FileType
== SrcMgr::C_ExternCSystem
)
/// MoveToLine - Move the output to the source line specified by the location
/// object.  We can do this by emitting some number of \n's, or by emitting a
/// #line directive.  This returns false if already at the specified line, true
/// if some newlines were emitted.
105 bool PrintPPOutputPPCallbacks::MoveToLine(SourceLocation Loc
) {
106 unsigned LineNo
= PP
.getSourceManager().getLogicalLineNumber(Loc
);
108 if (DisableLineMarkers
) {
109 if (LineNo
== CurLine
) return false;
113 if (!EmittedTokensOnThisLine
)
117 EmittedTokensOnThisLine
= false;
121 // If this line is "close enough" to the original line, just print newlines,
122 // otherwise print a #line directive.
123 if (LineNo
-CurLine
<= 8) {
124 if (LineNo
-CurLine
== 1)
126 else if (LineNo
== CurLine
)
127 return false; // Phys line moved, but logical line didn't.
129 const char *NewLines
= "\n\n\n\n\n\n\n\n";
130 OS
.write(NewLines
, LineNo
-CurLine
);
133 WriteLineInfo(LineNo
, 0, 0);
/// FileChanged - Whenever the preprocessor enters or exits a #include file
/// it invokes this handler.  Update our conception of the current source
/// position.
144 void PrintPPOutputPPCallbacks::FileChanged(SourceLocation Loc
,
145 FileChangeReason Reason
,
146 SrcMgr::CharacteristicKind NewFileType
) {
147 // Unless we are exiting a #include, make sure to skip ahead to the line the
148 // #include directive was at.
149 SourceManager
&SourceMgr
= PP
.getSourceManager();
150 if (Reason
== PPCallbacks::EnterFile
) {
151 MoveToLine(SourceMgr
.getIncludeLoc(Loc
));
152 } else if (Reason
== PPCallbacks::SystemHeaderPragma
) {
155 // TODO GCC emits the # directive for this directive on the line AFTER the
156 // directive and emits a bunch of spaces that aren't needed. Emulate this
160 Loc
= SourceMgr
.getLogicalLoc(Loc
);
161 CurLine
= SourceMgr
.getLineNumber(Loc
);
163 if (DisableLineMarkers
) return;
166 CurFilename
+= SourceMgr
.getSourceName(Loc
);
167 Lexer::Stringify(CurFilename
);
168 FileType
= NewFileType
;
171 WriteLineInfo(CurLine
);
176 case PPCallbacks::EnterFile
:
177 WriteLineInfo(CurLine
, " 1", 2);
179 case PPCallbacks::ExitFile
:
180 WriteLineInfo(CurLine
, " 2", 2);
182 case PPCallbacks::SystemHeaderPragma
:
183 case PPCallbacks::RenameFile
:
184 WriteLineInfo(CurLine
);
/// Ident - Handle #ident directives when read by the preprocessor.
191 void PrintPPOutputPPCallbacks::Ident(SourceLocation Loc
, const std::string
&S
) {
194 OS
.write("#ident ", strlen("#ident "));
195 OS
.write(&S
[0], S
.size());
196 EmittedTokensOnThisLine
= true;
199 /// HandleFirstTokOnLine - When emitting a preprocessed file in -E mode, this
200 /// is called for the first token on each new line. If this really is the start
201 /// of a new logical line, handle it and return true, otherwise return false.
202 /// This may not be the start of a logical line because the "start of line"
203 /// marker is set for physical lines, not logical ones.
204 bool PrintPPOutputPPCallbacks::HandleFirstTokOnLine(Token
&Tok
) {
205 // Figure out what line we went to and insert the appropriate number of
206 // newline characters.
207 if (!MoveToLine(Tok
.getLocation()))
210 // Print out space characters so that the first token on a line is
211 // indented for easy reading.
212 const SourceManager
&SourceMgr
= PP
.getSourceManager();
213 unsigned ColNo
= SourceMgr
.getLogicalColumnNumber(Tok
.getLocation());
215 // This hack prevents stuff like:
217 // HASH define foo bar
218 // From having the # character end up at column 1, which makes it so it
219 // is not handled as a #define next time through the preprocessor if in
220 // -fpreprocessed mode.
221 if (ColNo
<= 1 && Tok
.is(tok::hash
))
224 // Otherwise, indent the appropriate number of spaces.
225 for (; ColNo
> 1; --ColNo
)
232 struct UnknownPragmaHandler
: public PragmaHandler
{
234 PrintPPOutputPPCallbacks
*Callbacks
;
236 UnknownPragmaHandler(const char *prefix
, PrintPPOutputPPCallbacks
*callbacks
)
237 : PragmaHandler(0), Prefix(prefix
), Callbacks(callbacks
) {}
238 virtual void HandlePragma(Preprocessor
&PP
, Token
&PragmaTok
) {
239 // Figure out what line we went to and insert the appropriate number of
240 // newline characters.
241 Callbacks
->MoveToLine(PragmaTok
.getLocation());
242 Callbacks
->OS
.write(Prefix
, strlen(Prefix
));
244 // Read and print all of the pragma tokens.
245 while (PragmaTok
.isNot(tok::eom
)) {
246 if (PragmaTok
.hasLeadingSpace())
247 Callbacks
->OS
<< ' ';
248 std::string TokSpell
= PP
.getSpelling(PragmaTok
);
249 Callbacks
->OS
.write(&TokSpell
[0], TokSpell
.size());
250 PP
.LexUnexpandedToken(PragmaTok
);
252 Callbacks
->OS
<< '\n';
255 } // end anonymous namespace
258 enum AvoidConcatInfo
{
259 /// By default, a token never needs to avoid concatenation. Most tokens (e.g.
260 /// ',', ')', etc) don't cause a problem when concatenated.
261 aci_never_avoid_concat
= 0,
263 /// aci_custom_firstchar - AvoidConcat contains custom code to handle this
264 /// token's requirements, and it needs to know the first character of the
266 aci_custom_firstchar
= 1,
268 /// aci_custom - AvoidConcat contains custom code to handle this token's
269 /// requirements, but it doesn't need to know the first character of the
273 /// aci_avoid_equal - Many tokens cannot be safely followed by an '='
274 /// character. For example, "<<" turns into "<<=" when followed by an =.
278 /// This array contains information for each token on what action to take when
279 /// avoiding concatenation of tokens in the AvoidConcat method.
280 static char TokenInfo
[tok::NUM_TOKENS
];
282 /// InitAvoidConcatTokenInfo - Tokens that must avoid concatenation should be
283 /// marked by this function.
284 static void InitAvoidConcatTokenInfo() {
285 // These tokens have custom code in AvoidConcat.
286 TokenInfo
[tok::identifier
] |= aci_custom
;
287 TokenInfo
[tok::numeric_constant
] |= aci_custom_firstchar
;
288 TokenInfo
[tok::period
] |= aci_custom_firstchar
;
289 TokenInfo
[tok::amp
] |= aci_custom_firstchar
;
290 TokenInfo
[tok::plus
] |= aci_custom_firstchar
;
291 TokenInfo
[tok::minus
] |= aci_custom_firstchar
;
292 TokenInfo
[tok::slash
] |= aci_custom_firstchar
;
293 TokenInfo
[tok::less
] |= aci_custom_firstchar
;
294 TokenInfo
[tok::greater
] |= aci_custom_firstchar
;
295 TokenInfo
[tok::pipe
] |= aci_custom_firstchar
;
296 TokenInfo
[tok::percent
] |= aci_custom_firstchar
;
297 TokenInfo
[tok::colon
] |= aci_custom_firstchar
;
298 TokenInfo
[tok::hash
] |= aci_custom_firstchar
;
299 TokenInfo
[tok::arrow
] |= aci_custom_firstchar
;
301 // These tokens change behavior if followed by an '='.
302 TokenInfo
[tok::amp
] |= aci_avoid_equal
; // &=
303 TokenInfo
[tok::plus
] |= aci_avoid_equal
; // +=
304 TokenInfo
[tok::minus
] |= aci_avoid_equal
; // -=
305 TokenInfo
[tok::slash
] |= aci_avoid_equal
; // /=
306 TokenInfo
[tok::less
] |= aci_avoid_equal
; // <=
307 TokenInfo
[tok::greater
] |= aci_avoid_equal
; // >=
308 TokenInfo
[tok::pipe
] |= aci_avoid_equal
; // |=
309 TokenInfo
[tok::percent
] |= aci_avoid_equal
; // %=
310 TokenInfo
[tok::star
] |= aci_avoid_equal
; // *=
311 TokenInfo
[tok::exclaim
] |= aci_avoid_equal
; // !=
312 TokenInfo
[tok::lessless
] |= aci_avoid_equal
; // <<=
313 TokenInfo
[tok::greaterequal
] |= aci_avoid_equal
; // >>=
314 TokenInfo
[tok::caret
] |= aci_avoid_equal
; // ^=
315 TokenInfo
[tok::equal
] |= aci_avoid_equal
; // ==
318 /// StartsWithL - Return true if the spelling of this token starts with 'L'.
319 static bool StartsWithL(const Token
&Tok
, Preprocessor
&PP
) {
320 if (!Tok
.needsCleaning()) {
321 SourceManager
&SrcMgr
= PP
.getSourceManager();
322 return *SrcMgr
.getCharacterData(SrcMgr
.getPhysicalLoc(Tok
.getLocation()))
326 if (Tok
.getLength() < 256) {
328 const char *TokPtr
= Buffer
;
329 PP
.getSpelling(Tok
, TokPtr
);
330 return TokPtr
[0] == 'L';
333 return PP
.getSpelling(Tok
)[0] == 'L';
336 /// IsIdentifierL - Return true if the spelling of this token is literally 'L'.
337 static bool IsIdentifierL(const Token
&Tok
, Preprocessor
&PP
) {
338 if (!Tok
.needsCleaning()) {
339 if (Tok
.getLength() != 1)
341 SourceManager
&SrcMgr
= PP
.getSourceManager();
342 return *SrcMgr
.getCharacterData(SrcMgr
.getPhysicalLoc(Tok
.getLocation()))
346 if (Tok
.getLength() < 256) {
348 const char *TokPtr
= Buffer
;
349 if (PP
.getSpelling(Tok
, TokPtr
) != 1)
351 return TokPtr
[0] == 'L';
354 return PP
.getSpelling(Tok
) == "L";
/// AvoidConcat - If printing PrevTok immediately followed by Tok would cause
/// the two individual tokens to be lexed as a single token, return true (which
/// causes a space to be printed between them).  This allows the output of -E
/// mode to be lexed to the same token stream as lexing the input directly
/// would.
///
364 /// This code must conservatively return true if it doesn't want to be 100%
365 /// accurate. This will cause the output to include extra space characters, but
366 /// the resulting output won't have incorrect concatenations going on. Examples
367 /// include "..", which we print with a space between, because we don't want to
368 /// track enough to tell "x.." from "...".
369 bool PrintPPOutputPPCallbacks::AvoidConcat(const Token
&PrevTok
,
373 tok::TokenKind PrevKind
= PrevTok
.getKind();
374 if (PrevTok
.getIdentifierInfo()) // Language keyword or named operator.
375 PrevKind
= tok::identifier
;
377 // Look up information on when we should avoid concatenation with prevtok.
378 unsigned ConcatInfo
= TokenInfo
[PrevKind
];
380 // If prevtok never causes a problem for anything after it, return quickly.
381 if (ConcatInfo
== 0) return false;
383 if (ConcatInfo
& aci_avoid_equal
) {
384 // If the next token is '=' or '==', avoid concatenation.
385 if (Tok
.is(tok::equal
) || Tok
.is(tok::equalequal
))
387 ConcatInfo
&= ~aci_avoid_equal
;
390 if (ConcatInfo
== 0) return false;
394 // Basic algorithm: we look at the first character of the second token, and
395 // determine whether it, if appended to the first token, would form (or would
396 // contribute) to a larger token if concatenated.
398 if (ConcatInfo
& aci_custom
) {
399 // If the token does not need to know the first character, don't get it.
400 } else if (IdentifierInfo
*II
= Tok
.getIdentifierInfo()) {
401 // Avoid spelling identifiers, the most common form of token.
402 FirstChar
= II
->getName()[0];
403 } else if (!Tok
.needsCleaning()) {
404 SourceManager
&SrcMgr
= PP
.getSourceManager();
406 *SrcMgr
.getCharacterData(SrcMgr
.getPhysicalLoc(Tok
.getLocation()));
407 } else if (Tok
.getLength() < 256) {
408 const char *TokPtr
= Buffer
;
409 PP
.getSpelling(Tok
, TokPtr
);
410 FirstChar
= TokPtr
[0];
412 FirstChar
= PP
.getSpelling(Tok
)[0];
416 default: assert(0 && "InitAvoidConcatTokenInfo built wrong");
417 case tok::identifier
: // id+id or id+number or id+L"foo".
418 if (Tok
.is(tok::numeric_constant
) || Tok
.getIdentifierInfo() ||
419 Tok
.is(tok::wide_string_literal
) /* ||
420 Tok.is(tok::wide_char_literal)*/)
423 // If this isn't identifier + string, we're done.
424 if (Tok
.isNot(tok::char_constant
) && Tok
.isNot(tok::string_literal
))
427 // FIXME: need a wide_char_constant!
429 // If the string was a wide string L"foo" or wide char L'f', it would concat
430 // with the previous identifier into fooL"bar". Avoid this.
431 if (StartsWithL(Tok
, PP
))
434 // Otherwise, this is a narrow character or string. If the *identifier* is
435 // a literal 'L', avoid pasting L "foo" -> L"foo".
436 return IsIdentifierL(PrevTok
, PP
);
437 case tok::numeric_constant
:
438 return isalnum(FirstChar
) || Tok
.is(tok::numeric_constant
) ||
439 FirstChar
== '+' || FirstChar
== '-' || FirstChar
== '.';
440 case tok::period
: // ..., .*, .1234
441 return FirstChar
== '.' || FirstChar
== '*' || isdigit(FirstChar
);
443 return FirstChar
== '&';
444 case tok::plus
: // ++
445 return FirstChar
== '+';
446 case tok::minus
: // --, ->, ->*
447 return FirstChar
== '-' || FirstChar
== '>';
448 case tok::slash
: //, /*, //
449 return FirstChar
== '*' || FirstChar
== '/';
450 case tok::less
: // <<, <<=, <:, <%
451 return FirstChar
== '<' || FirstChar
== ':' || FirstChar
== '%';
452 case tok::greater
: // >>, >>=
453 return FirstChar
== '>';
454 case tok::pipe
: // ||
455 return FirstChar
== '|';
456 case tok::percent
: // %>, %:
457 return FirstChar
== '>' || FirstChar
== ':';
458 case tok::colon
: // ::, :>
459 return FirstChar
== ':' || FirstChar
== '>';
460 case tok::hash
: // ##, #@, %:%:
461 return FirstChar
== '#' || FirstChar
== '@' || FirstChar
== '%';
462 case tok::arrow
: // ->*
463 return FirstChar
== '*';
467 /// DoPrintPreprocessedInput - This implements -E mode.
469 void clang::DoPrintPreprocessedInput(Preprocessor
&PP
,
470 const std::string
&OutFile
) {
471 // Inform the preprocessor whether we want it to retain comments or not, due
473 PP
.SetCommentRetentionState(EnableCommentOutput
, EnableMacroCommentOutput
);
474 InitAvoidConcatTokenInfo();
477 // Open the output buffer.
479 llvm::raw_fd_ostream
OS(OutFile
.empty() ? "-" : OutFile
.c_str(), Err
);
481 fprintf(stderr
, "%s\n", Err
.c_str());
485 OS
.SetBufferSize(64*1024);
490 PrintPPOutputPPCallbacks
*Callbacks
= new PrintPPOutputPPCallbacks(PP
, OS
);
491 PP
.setPPCallbacks(Callbacks
);
493 PP
.AddPragmaHandler(0, new UnknownPragmaHandler("#pragma", Callbacks
));
494 PP
.AddPragmaHandler("GCC", new UnknownPragmaHandler("#pragma GCC",Callbacks
));
496 // After we have configured the preprocessor, enter the main file.
498 // Start parsing the specified input file.
499 PP
.EnterMainSourceFile();
501 // Consume all of the tokens that come from the predefines buffer. Those
502 // should not be emitted into the output and are guaranteed to be at the
504 const SourceManager
&SourceMgr
= PP
.getSourceManager();
506 while (Tok
.isNot(tok::eof
) && Tok
.getLocation().isFileID() &&
507 !strcmp(SourceMgr
.getSourceName(Tok
.getLocation()), "<predefines>"));
511 // If this token is at the start of a line, emit newlines if needed.
512 if (Tok
.isAtStartOfLine() && Callbacks
->HandleFirstTokOnLine(Tok
)) {
514 } else if (Tok
.hasLeadingSpace() ||
515 // If we haven't emitted a token on this line yet, PrevTok isn't
516 // useful to look at and no concatenation could happen anyway.
517 (Callbacks
->hasEmittedTokensOnThisLine() &&
518 // Don't print "-" next to "-", it would form "--".
519 Callbacks
->AvoidConcat(PrevTok
, Tok
))) {
523 if (IdentifierInfo
*II
= Tok
.getIdentifierInfo()) {
524 const char *Str
= II
->getName();
525 unsigned Len
= Tok
.needsCleaning() ? strlen(Str
) : Tok
.getLength();
527 } else if (Tok
.getLength() < 256) {
528 const char *TokPtr
= Buffer
;
529 unsigned Len
= PP
.getSpelling(Tok
, TokPtr
);
530 OS
.write(TokPtr
, Len
);
532 std::string S
= PP
.getSpelling(Tok
);
533 OS
.write(&S
[0], S
.size());
535 Callbacks
->SetEmittedTokensOnThisLine();
537 if (Tok
.is(tok::eof
)) break;
544 // Flush the ostream.
547 // If an error occurred, remove the output file.
548 if (PP
.getDiagnostics().hasErrorOccurred() && !OutFile
.empty())
549 llvm::sys::Path(OutFile
).eraseFromDisk();