Fix indenting a snippet when there is whitespace after the
[geany-mirror.git] / scintilla / LexBash.cxx
blob5801278becb9f7d5f7860f62fb230b368c3aab5c
1 // Scintilla source code edit control
2 /** @file LexBash.cxx
3 ** Lexer for Bash.
4 **/
5 // Copyright 2004-2008 by Neil Hodgson <neilh@scintilla.org>
6 // Adapted from LexPerl by Kein-Hong Man 2004
7 // The License.txt file describes the conditions under which this software may be distributed.
9 #include <stdlib.h>
10 #include <string.h>
11 #include <ctype.h>
12 #include <stdio.h>
13 #include <stdarg.h>
15 #include "Platform.h"
17 #include "PropSet.h"
18 #include "Accessor.h"
19 #include "StyleContext.h"
20 #include "KeyWords.h"
21 #include "Scintilla.h"
22 #include "SciLexer.h"
23 #include "CharacterSet.h"
25 #ifdef SCI_NAMESPACE
26 using namespace Scintilla;
27 #endif
// Size of the buffer that holds a here-document delimiter (including the NUL).
#define HERE_DELIM_MAX 256

// define this if you want 'invalid octals' to be marked as errors
// usually, this is not a good idea, permissive lexing is better
#undef PEDANTIC_OCTAL

// Pseudo number bases used by the number lexing state. They sit above 64,
// the largest base accepted by getBashNumberBase(), so they cannot collide
// with a literal NN# base.
#define BASH_BASE_ERROR		65
#define BASH_BASE_DECIMAL	66
#define BASH_BASE_HEX		67
#ifdef PEDANTIC_OCTAL
#define BASH_BASE_OCTAL		68
#define BASH_BASE_OCTAL_ERROR	69
#endif
43 static inline int translateBashDigit(int ch) {
44 if (ch >= '0' && ch <= '9') {
45 return ch - '0';
46 } else if (ch >= 'a' && ch <= 'z') {
47 return ch - 'a' + 10;
48 } else if (ch >= 'A' && ch <= 'Z') {
49 return ch - 'A' + 36;
50 } else if (ch == '@') {
51 return 62;
52 } else if (ch == '_') {
53 return 63;
55 return BASH_BASE_ERROR;
58 static inline int getBashNumberBase(char *s) {
59 int i = 0;
60 int base = 0;
61 while (*s) {
62 base = base * 10 + (*s++ - '0');
63 i++;
65 if (base > 64 || i > 2) {
66 return BASH_BASE_ERROR;
68 return base;
/**
 * Return the closing delimiter that pairs with an opening delimiter.
 *
 * @param ch opening delimiter
 * @return ')', ']', '}' or '>' for the matching bracket; any other character
 *         (e.g. a quote) is its own closer and is returned unchanged
 */
static int opposite(int ch) {
	if (ch == '(') return ')';
	if (ch == '[') return ']';
	if (ch == '{') return '}';
	if (ch == '<') return '>';
	return ch;
}
79 static void ColouriseBashDoc(unsigned int startPos, int length, int initStyle,
80 WordList *keywordlists[], Accessor &styler) {
82 WordList &keywords = *keywordlists[0];
84 CharacterSet setWordStart(CharacterSet::setAlpha, "_");
85 // note that [+-] are often parts of identifiers in shell scripts
86 CharacterSet setWord(CharacterSet::setAlphaNum, "._+-");
87 CharacterSet setBashOperator(CharacterSet::setNone, "^&\\%()-+=|{}[]:;>,*/<?!.~@");
88 CharacterSet setSingleCharOp(CharacterSet::setNone, "rwxoRWXOezsfdlpSbctugkTBMACahGLNn");
89 CharacterSet setParam(CharacterSet::setAlphaNum, "$_");
90 CharacterSet setHereDoc(CharacterSet::setAlpha, "_\\-+!");
91 CharacterSet setHereDoc2(CharacterSet::setAlphaNum, "_-+!");
92 CharacterSet setLeftShift(CharacterSet::setDigits, "=$");
94 class HereDocCls { // Class to manage HERE document elements
95 public:
96 int State; // 0: '<<' encountered
97 // 1: collect the delimiter
98 // 2: here doc text (lines after the delimiter)
99 int Quote; // the char after '<<'
100 bool Quoted; // true if Quote in ('\'','"','`')
101 bool Indent; // indented delimiter (for <<-)
102 int DelimiterLength; // strlen(Delimiter)
103 char *Delimiter; // the Delimiter, 256: sizeof PL_tokenbuf
104 HereDocCls() {
105 State = 0;
106 Quote = 0;
107 Quoted = false;
108 Indent = 0;
109 DelimiterLength = 0;
110 Delimiter = new char[HERE_DELIM_MAX];
111 Delimiter[0] = '\0';
113 void Append(int ch) {
114 Delimiter[DelimiterLength++] = static_cast<char>(ch);
115 Delimiter[DelimiterLength] = '\0';
117 ~HereDocCls() {
118 delete []Delimiter;
121 HereDocCls HereDoc;
123 class QuoteCls { // Class to manage quote pairs (simplified vs LexPerl)
124 public:
125 int Count;
126 int Up, Down;
127 QuoteCls() {
128 Count = 0;
129 Up = '\0';
130 Down = '\0';
132 void Open(int u) {
133 Count++;
134 Up = u;
135 Down = opposite(Up);
137 void Start(int u) {
138 Count = 0;
139 Open(u);
142 QuoteCls Quote;
144 int numBase = 0;
145 int digit;
146 unsigned int endPos = startPos + length;
148 // Backtrack to beginning of style if required...
149 // If in a long distance lexical state, backtrack to find quote characters
150 if (initStyle == SCE_SH_HERE_Q) {
151 while ((startPos > 1) && (styler.StyleAt(startPos) != SCE_SH_HERE_DELIM)) {
152 startPos--;
154 startPos = styler.LineStart(styler.GetLine(startPos));
155 initStyle = styler.StyleAt(startPos - 1);
157 // Bash strings can be multi-line with embedded newlines, so backtrack.
158 // Bash numbers have additional state during lexing, so backtrack too.
159 if (initStyle == SCE_SH_STRING
160 || initStyle == SCE_SH_BACKTICKS
161 || initStyle == SCE_SH_CHARACTER
162 || initStyle == SCE_SH_NUMBER
163 || initStyle == SCE_SH_IDENTIFIER
164 || initStyle == SCE_SH_COMMENTLINE) {
165 while ((startPos > 1) && (styler.StyleAt(startPos - 1) == initStyle)) {
166 startPos--;
168 initStyle = SCE_SH_DEFAULT;
171 StyleContext sc(startPos, endPos - startPos, initStyle, styler);
173 for (; sc.More(); sc.Forward()) {
175 // Determine if the current state should terminate.
176 switch (sc.state) {
177 case SCE_SH_OPERATOR:
178 sc.SetState(SCE_SH_DEFAULT);
179 break;
180 case SCE_SH_WORD:
181 // "." never used in Bash variable names but used in file names
182 if (!setWord.Contains(sc.ch)) {
183 char s[1000];
184 sc.GetCurrent(s, sizeof(s));
185 if (s[0] != '-' && // for file operators
186 !keywords.InList(s)) {
187 sc.ChangeState(SCE_SH_IDENTIFIER);
189 sc.SetState(SCE_SH_DEFAULT);
191 break;
192 case SCE_SH_IDENTIFIER:
193 if (sc.chPrev == '\\') { // for escaped chars
194 sc.ForwardSetState(SCE_SH_DEFAULT);
195 } else if (!setWord.Contains(sc.ch)) {
196 sc.SetState(SCE_SH_DEFAULT);
198 break;
199 case SCE_SH_NUMBER:
200 digit = translateBashDigit(sc.ch);
201 if (numBase == BASH_BASE_DECIMAL) {
202 if (sc.ch == '#') {
203 char s[10];
204 sc.GetCurrent(s, sizeof(s));
205 numBase = getBashNumberBase(s);
206 if (numBase != BASH_BASE_ERROR)
207 break;
208 } else if (IsADigit(sc.ch))
209 break;
210 } else if (numBase == BASH_BASE_HEX) {
211 if (IsADigit(sc.ch, 16))
212 break;
213 #ifdef PEDANTIC_OCTAL
214 } else if (numBase == BASH_BASE_OCTAL ||
215 numBase == BASH_BASE_OCTAL_ERROR) {
216 if (digit <= 7)
217 break;
218 if (digit <= 9) {
219 numBase = BASH_BASE_OCTAL_ERROR;
220 break;
222 #endif
223 } else if (numBase == BASH_BASE_ERROR) {
224 if (digit <= 9)
225 break;
226 } else { // DD#DDDD number style handling
227 if (digit != BASH_BASE_ERROR) {
228 if (numBase <= 36) {
229 // case-insensitive if base<=36
230 if (digit >= 36) digit -= 26;
232 if (digit < numBase)
233 break;
234 if (digit <= 9) {
235 numBase = BASH_BASE_ERROR;
236 break;
240 // fallthrough when number is at an end or error
241 if (numBase == BASH_BASE_ERROR
242 #ifdef PEDANTIC_OCTAL
243 || numBase == BASH_BASE_OCTAL_ERROR
244 #endif
246 sc.ChangeState(SCE_SH_ERROR);
248 sc.SetState(SCE_SH_DEFAULT);
249 break;
250 case SCE_SH_COMMENTLINE:
251 if (sc.ch == '\\' && (sc.chNext == '\r' || sc.chNext == '\n')) {
252 // comment continuation
253 sc.Forward();
254 if (sc.ch == '\r' && sc.chNext == '\n') {
255 sc.Forward();
257 } else if (sc.atLineEnd) {
258 sc.ForwardSetState(SCE_SH_DEFAULT);
260 break;
261 case SCE_SH_HERE_DELIM:
262 // From Bash info:
263 // ---------------
264 // Specifier format is: <<[-]WORD
265 // Optional '-' is for removal of leading tabs from here-doc.
266 // Whitespace acceptable after <<[-] operator
268 if (HereDoc.State == 0) { // '<<' encountered
269 HereDoc.Quote = sc.chNext;
270 HereDoc.Quoted = false;
271 HereDoc.DelimiterLength = 0;
272 HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0';
273 if (sc.chNext == '\'' || sc.chNext == '\"') { // a quoted here-doc delimiter (' or ")
274 sc.Forward();
275 HereDoc.Quoted = true;
276 HereDoc.State = 1;
277 } else if (!HereDoc.Indent && sc.chNext == '-') { // <<- indent case
278 HereDoc.Indent = true;
279 } else if (setHereDoc.Contains(sc.chNext)) {
280 // an unquoted here-doc delimiter, no special handling
281 // TODO check what exactly bash considers part of the delim
282 HereDoc.State = 1;
283 } else if (sc.chNext == '<') { // HERE string <<<
284 sc.Forward();
285 sc.ForwardSetState(SCE_SH_DEFAULT);
286 } else if (IsASpace(sc.chNext)) {
287 // eat whitespace
288 } else if (setLeftShift.Contains(sc.chNext)) {
289 // left shift << or <<= operator cases
290 sc.ChangeState(SCE_SH_OPERATOR);
291 sc.ForwardSetState(SCE_SH_DEFAULT);
292 } else {
293 // symbols terminates; deprecated zero-length delimiter
294 HereDoc.State = 1;
296 } else if (HereDoc.State == 1) { // collect the delimiter
297 if (HereDoc.Quoted) { // a quoted here-doc delimiter
298 if (sc.ch == HereDoc.Quote) { // closing quote => end of delimiter
299 sc.ForwardSetState(SCE_SH_DEFAULT);
300 } else {
301 if (sc.ch == '\\' && sc.chNext == HereDoc.Quote) { // escaped quote
302 sc.Forward();
304 HereDoc.Append(sc.ch);
306 } else { // an unquoted here-doc delimiter
307 if (setHereDoc2.Contains(sc.ch)) {
308 HereDoc.Append(sc.ch);
309 } else if (sc.ch == '\\') {
310 // skip escape prefix
311 } else {
312 sc.SetState(SCE_SH_DEFAULT);
315 if (HereDoc.DelimiterLength >= HERE_DELIM_MAX - 1) { // force blowup
316 sc.SetState(SCE_SH_ERROR);
317 HereDoc.State = 0;
320 break;
321 case SCE_SH_HERE_Q:
322 // HereDoc.State == 2
323 if (sc.atLineStart) {
324 sc.SetState(SCE_SH_HERE_Q);
325 int prefixws = 0;
326 while (IsASpace(sc.ch) && !sc.atLineEnd) { // whitespace prefix
327 sc.Forward();
328 prefixws++;
330 if (prefixws > 0)
331 sc.SetState(SCE_SH_HERE_Q);
332 while (!sc.atLineEnd) {
333 sc.Forward();
335 char s[HERE_DELIM_MAX];
336 sc.GetCurrent(s, sizeof(s));
337 if (sc.LengthCurrent() == 0)
338 break;
339 if (s[strlen(s) - 1] == '\r')
340 s[strlen(s) - 1] = '\0';
341 if (strcmp(HereDoc.Delimiter, s) == 0) {
342 if ((prefixws > 0 && HereDoc.Indent) || // indentation rule
343 (prefixws == 0 && !HereDoc.Indent)) {
344 sc.SetState(SCE_SH_DEFAULT);
345 break;
349 break;
350 case SCE_SH_SCALAR: // variable names
351 if (!setParam.Contains(sc.ch)) {
352 if (sc.LengthCurrent() == 1) {
353 // Special variable: $(, $_ etc.
354 sc.ForwardSetState(SCE_SH_DEFAULT);
355 } else {
356 sc.SetState(SCE_SH_DEFAULT);
359 break;
360 case SCE_SH_STRING: // delimited styles
361 case SCE_SH_CHARACTER:
362 case SCE_SH_BACKTICKS:
363 case SCE_SH_PARAM:
364 if (sc.ch == '\\' && Quote.Up != '\\') {
365 sc.Forward();
366 } else if (sc.ch == Quote.Down) {
367 Quote.Count--;
368 if (Quote.Count == 0) {
369 sc.ForwardSetState(SCE_SH_DEFAULT);
371 } else if (sc.ch == Quote.Up) {
372 Quote.Count++;
374 break;
377 // Must check end of HereDoc state 1 before default state is handled
378 if (HereDoc.State == 1 && sc.atLineEnd) {
379 // Begin of here-doc (the line after the here-doc delimiter):
380 // Lexically, the here-doc starts from the next line after the >>, but the
381 // first line of here-doc seem to follow the style of the last EOL sequence
382 HereDoc.State = 2;
383 if (HereDoc.Quoted) {
384 if (sc.state == SCE_SH_HERE_DELIM) {
385 // Missing quote at end of string! We are stricter than bash.
386 // Colour here-doc anyway while marking this bit as an error.
387 sc.ChangeState(SCE_SH_ERROR);
389 // HereDoc.Quote always == '\''
391 sc.SetState(SCE_SH_HERE_Q);
394 // Determine if a new state should be entered.
395 if (sc.state == SCE_SH_DEFAULT) {
396 if (sc.ch == '\\') { // escaped character
397 sc.SetState(SCE_SH_IDENTIFIER);
398 } else if (IsADigit(sc.ch)) {
399 sc.SetState(SCE_SH_NUMBER);
400 numBase = BASH_BASE_DECIMAL;
401 if (sc.ch == '0') { // hex,octal
402 if (sc.chNext == 'x' || sc.chNext == 'X') {
403 numBase = BASH_BASE_HEX;
404 sc.Forward();
405 } else if (IsADigit(sc.chNext)) {
406 #ifdef PEDANTIC_OCTAL
407 numBase = BASH_BASE_OCTAL;
408 #else
409 numBase = BASH_BASE_HEX;
410 #endif
413 } else if (setWordStart.Contains(sc.ch)) {
414 sc.SetState(SCE_SH_WORD);
415 } else if (sc.ch == '#') {
416 sc.SetState(SCE_SH_COMMENTLINE);
417 } else if (sc.ch == '\"') {
418 sc.SetState(SCE_SH_STRING);
419 Quote.Start(sc.ch);
420 } else if (sc.ch == '\'') {
421 sc.SetState(SCE_SH_CHARACTER);
422 Quote.Start(sc.ch);
423 } else if (sc.ch == '`') {
424 sc.SetState(SCE_SH_BACKTICKS);
425 Quote.Start(sc.ch);
426 } else if (sc.ch == '$') {
427 sc.SetState(SCE_SH_SCALAR);
428 sc.Forward();
429 if (sc.ch == '{') {
430 sc.ChangeState(SCE_SH_PARAM);
431 } else if (sc.ch == '\'') {
432 sc.ChangeState(SCE_SH_CHARACTER);
433 } else if (sc.ch == '"') {
434 sc.ChangeState(SCE_SH_STRING);
435 } else if (sc.ch == '(' || sc.ch == '`') {
436 sc.ChangeState(SCE_SH_BACKTICKS);
437 if (sc.chNext == '(') { // $(( is lexed as operator
438 sc.ChangeState(SCE_SH_OPERATOR);
440 } else {
441 continue; // scalar has no delimiter pair
443 // fallthrough, open delim for $[{'"(`]
444 Quote.Start(sc.ch);
445 } else if (sc.Match('<', '<')) {
446 sc.SetState(SCE_SH_HERE_DELIM);
447 HereDoc.State = 0;
448 HereDoc.Indent = false;
449 } else if (sc.ch == '-' && // one-char file test operators
450 setSingleCharOp.Contains(sc.chNext) &&
451 !setWord.Contains(sc.GetRelative(2)) &&
452 IsASpace(sc.chPrev)) {
453 sc.SetState(SCE_SH_WORD);
454 sc.Forward();
455 } else if (setBashOperator.Contains(sc.ch)) {
456 sc.SetState(SCE_SH_OPERATOR);
460 sc.Complete();
463 static bool IsCommentLine(int line, Accessor &styler) {
464 int pos = styler.LineStart(line);
465 int eol_pos = styler.LineStart(line + 1) - 1;
466 for (int i = pos; i < eol_pos; i++) {
467 char ch = styler[i];
468 if (ch == '#')
469 return true;
470 else if (ch != ' ' && ch != '\t')
471 return false;
473 return false;
476 static void FoldBashDoc(unsigned int startPos, int length, int, WordList *[],
477 Accessor &styler) {
478 bool foldComment = styler.GetPropertyInt("fold.comment") != 0;
479 bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0;
480 unsigned int endPos = startPos + length;
481 int visibleChars = 0;
482 int lineCurrent = styler.GetLine(startPos);
483 int levelPrev = styler.LevelAt(lineCurrent) & SC_FOLDLEVELNUMBERMASK;
484 int levelCurrent = levelPrev;
485 char chNext = styler[startPos];
486 int styleNext = styler.StyleAt(startPos);
487 for (unsigned int i = startPos; i < endPos; i++) {
488 char ch = chNext;
489 chNext = styler.SafeGetCharAt(i + 1);
490 int style = styleNext;
491 styleNext = styler.StyleAt(i + 1);
492 bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n');
493 // Comment folding
494 if (foldComment && atEOL && IsCommentLine(lineCurrent, styler))
496 if (!IsCommentLine(lineCurrent - 1, styler)
497 && IsCommentLine(lineCurrent + 1, styler))
498 levelCurrent++;
499 else if (IsCommentLine(lineCurrent - 1, styler)
500 && !IsCommentLine(lineCurrent + 1, styler))
501 levelCurrent--;
503 if (style == SCE_SH_OPERATOR) {
504 if (ch == '{') {
505 levelCurrent++;
506 } else if (ch == '}') {
507 levelCurrent--;
510 if (atEOL) {
511 int lev = levelPrev;
512 if (visibleChars == 0 && foldCompact)
513 lev |= SC_FOLDLEVELWHITEFLAG;
514 if ((levelCurrent > levelPrev) && (visibleChars > 0))
515 lev |= SC_FOLDLEVELHEADERFLAG;
516 if (lev != styler.LevelAt(lineCurrent)) {
517 styler.SetLevel(lineCurrent, lev);
519 lineCurrent++;
520 levelPrev = levelCurrent;
521 visibleChars = 0;
523 if (!isspacechar(ch))
524 visibleChars++;
526 // Fill in the real level of the next line, keeping the current flags as they will be filled in later
527 int flagsNext = styler.LevelAt(lineCurrent) & ~SC_FOLDLEVELNUMBERMASK;
528 styler.SetLevel(lineCurrent, levelPrev | flagsNext);
// Descriptions of the keyword lists used by this lexer; the array is
// terminated by a null pointer.
static const char * const bashWordListDesc[] = {
	"Keywords",
	0
};
536 LexerModule lmBash(SCLEX_BASH, ColouriseBashDoc, "bash", FoldBashDoc, bashWordListDesc);