scintilla/lexers/LexRuby.cxx

   1 // Scintilla source code edit control
   2 /** @file LexRuby.cxx
   3  ** Lexer for Ruby.
   4  **/
   5 // Copyright 2001- by Clemens Wyss <wys@helbling.ch>
   6 // The License.txt file describes the conditions under which this software may be distributed.
   7
   8 #include <stdlib.h>
   9 #include <string.h>
  10 #include <stdio.h>
  11 #include <stdarg.h>
  12 #include <assert.h>
  13 #include <ctype.h>
  14
  15 #include "ILexer.h"
  16 #include "Scintilla.h"
  17 #include "SciLexer.h"
  18
  19 #include "WordList.h"
  20 #include "LexAccessor.h"
  21 #include "Accessor.h"
  22 #include "StyleContext.h"
  23 #include "CharacterSet.h"
  24 #include "LexerModule.h"
  25
  26 #ifdef SCI_NAMESPACE
  27 using namespace Scintilla;
  28 #endif
  29
  30 //XXX Identical to Perl, put in common area
  31 static inline bool isEOLChar(char ch) {
  32         return (ch == '\r') || (ch == '\n');
  33 }
  34
  35 #define isSafeASCII(ch) ((unsigned int)(ch) <= 127)
  36 // This one's redundant, but makes for more readable code
  37 #define isHighBitChar(ch) ((unsigned int)(ch) > 127)
  38
  39 static inline bool isSafeAlpha(char ch) {
  40     return (isSafeASCII(ch) && isalpha(ch)) || ch == '_';
  41 }
  42
  43 static inline bool isSafeAlnum(char ch) {
  44     return (isSafeASCII(ch) && isalnum(ch)) || ch == '_';
  45 }
  46
  47 static inline bool isSafeAlnumOrHigh(char ch) {
  48     return isHighBitChar(ch) || isalnum(ch) || ch == '_';
  49 }
  50
  51 static inline bool isSafeDigit(char ch) {
  52     return isSafeASCII(ch) && isdigit(ch);
  53 }
  54
  55 static inline bool isSafeWordcharOrHigh(char ch) {
  56     // Error: scintilla's KeyWords.h includes '.' as a word-char
  57     // we want to separate things that can take methods from the
  58     // methods.
  59     return isHighBitChar(ch) || isalnum(ch) || ch == '_';
  60 }
  61
  62 static bool inline iswhitespace(char ch) {
  63         return ch == ' ' || ch == '\t';
  64 }
  65
  66 #define MAX_KEYWORD_LENGTH 200
  67
  68 #define STYLE_MASK 63
  69 #define actual_style(style) (style & STYLE_MASK)
  70
  71 static bool followsDot(unsigned int pos, Accessor &styler) {
  72     styler.Flush();
  73     for (; pos >= 1; --pos) {
  74         int style = actual_style(styler.StyleAt(pos));
  75         char ch;
  76         switch (style) {
  77             case SCE_RB_DEFAULT:
  78                 ch = styler[pos];
  79                 if (ch == ' ' || ch == '\t') {
  80                     //continue
  81                 } else {
  82                     return false;
  83                 }
  84                 break;
  85
  86             case SCE_RB_OPERATOR:
  87                 return styler[pos] == '.';
  88
  89             default:
  90                 return false;
  91         }
  92     }
  93     return false;
  94 }
  95
  96 // Forward declarations
  97 static bool keywordIsAmbiguous(const char *prevWord);
  98 static bool keywordDoStartsLoop(int pos,
  99                                 Accessor &styler);
 100 static bool keywordIsModifier(const char *word,
 101                               int pos,
 102                               Accessor &styler);
 103
 104 static int ClassifyWordRb(unsigned int start, unsigned int end, WordList &keywords, Accessor &styler, char *prevWord) {
 105         char s[MAX_KEYWORD_LENGTH];
 106     unsigned int i, j;
 107         unsigned int lim = end - start + 1; // num chars to copy
 108         if (lim >= MAX_KEYWORD_LENGTH) {
 109                 lim = MAX_KEYWORD_LENGTH - 1;
 110         }
 111         for (i = start, j = 0; j < lim; i++, j++) {
 112                 s[j] = styler[i];
 113         }
 114     s[j] = '\0';
 115         int chAttr;
 116         if (0 == strcmp(prevWord, "class"))
 117                 chAttr = SCE_RB_CLASSNAME;
 118         else if (0 == strcmp(prevWord, "module"))
 119                 chAttr = SCE_RB_MODULE_NAME;
 120         else if (0 == strcmp(prevWord, "def"))
 121                 chAttr = SCE_RB_DEFNAME;
 122     else if (keywords.InList(s) && ((start == 0) || !followsDot(start - 1, styler))) {
 123         if (keywordIsAmbiguous(s)
 124             && keywordIsModifier(s, start, styler)) {
 125
 126             // Demoted keywords are colored as keywords,
 127             // but do not affect changes in indentation.
 128             //
 129             // Consider the word 'if':
 130             // 1. <<if test ...>> : normal
 131             // 2. <<stmt if test>> : demoted
 132             // 3. <<lhs = if ...>> : normal: start a new indent level
 133             // 4. <<obj.if = 10>> : color as identifer, since it follows '.'
 134
 135             chAttr = SCE_RB_WORD_DEMOTED;
 136         } else {
 137             chAttr = SCE_RB_WORD;
 138         }
 139         } else
 140         chAttr = SCE_RB_IDENTIFIER;
 141         styler.ColourTo(end, chAttr);
 142         if (chAttr == SCE_RB_WORD) {
 143                 strcpy(prevWord, s);
 144         } else {
 145                 prevWord[0] = 0;
 146         }
 147     return chAttr;
 148 }
 149
 150
 151 //XXX Identical to Perl, put in common area
 152 static bool isMatch(Accessor &styler, int lengthDoc, int pos, const char *val) {
 153         if ((pos + static_cast<int>(strlen(val))) >= lengthDoc) {
 154                 return false;
 155         }
 156         while (*val) {
 157                 if (*val != styler[pos++]) {
 158                         return false;
 159                 }
 160                 val++;
 161         }
 162         return true;
 163 }
 164
 165 // Do Ruby better -- find the end of the line, work back,
 166 // and then check for leading white space
 167
 168 // Precondition: the here-doc target can be indented
 169 static bool lookingAtHereDocDelim(Accessor         &styler,
 170                                   int                   pos,
 171                                   int                   lengthDoc,
 172                                   const char   *HereDocDelim)
 173 {
 174     if (!isMatch(styler, lengthDoc, pos, HereDocDelim)) {
 175         return false;
 176     }
 177     while (--pos > 0) {
 178         char ch = styler[pos];
 179         if (isEOLChar(ch)) {
 180             return true;
 181         } else if (ch != ' ' && ch != '\t') {
 182             return false;
 183         }
 184     }
 185     return false;
 186 }
 187
 188 //XXX Identical to Perl, put in common area
 189 static char opposite(char ch) {
 190         if (ch == '(')
 191                 return ')';
 192         if (ch == '[')
 193                 return ']';
 194         if (ch == '{')
 195                 return '}';
 196         if (ch == '<')
 197                 return '>';
 198         return ch;
 199 }
 200
 201 // Null transitions when we see we've reached the end
 202 // and need to relex the curr char.
 203
 204 static void redo_char(int &i, char &ch, char &chNext, char &chNext2,
 205                       int &state) {
 206     i--;
 207     chNext2 = chNext;
 208     chNext = ch;
 209     state = SCE_RB_DEFAULT;
 210 }
 211
 212 static void advance_char(int &i, char &ch, char &chNext, char &chNext2) {
 213     i++;
 214     ch = chNext;
 215     chNext = chNext2;
 216 }
 217
 218 // precondition: startPos points to one after the EOL char
 219 static bool currLineContainsHereDelims(int& startPos,
 220                                        Accessor &styler) {
 221     if (startPos <= 1)
 222         return false;
 223
 224     int pos;
 225     for (pos = startPos - 1; pos > 0; pos--) {
 226         char ch = styler.SafeGetCharAt(pos);
 227         if (isEOLChar(ch)) {
 228             // Leave the pointers where they are -- there are no
 229             // here doc delims on the current line, even if
 230             // the EOL isn't default style
 231
 232             return false;
 233         } else {
 234             styler.Flush();
 235             if (actual_style(styler.StyleAt(pos)) == SCE_RB_HERE_DELIM) {
 236                 break;
 237             }
 238         }
 239     }
 240     if (pos == 0) {
 241         return false;
 242     }
 243     // Update the pointers so we don't have to re-analyze the string
 244     startPos = pos;
 245     return true;
 246 }
 247
 248 // This class is used by the enter and exit methods, so it needs
 249 // to be hoisted out of the function.
 250
 251 class QuoteCls {
 252     public:
 253     int  Count;
 254     char Up;
 255     char Down;
 256     QuoteCls() {
 257         New();
 258     }
 259     void New() {
 260         Count = 0;
 261         Up    = '\0';
 262         Down  = '\0';
 263     }
 264     void Open(char u) {
 265         Count++;
 266         Up    = u;
 267         Down  = opposite(Up);
 268     }
 269     QuoteCls(const QuoteCls& q) {
 270         // copy constructor -- use this for copying in
 271         Count = q.Count;
 272         Up    = q.Up;
 273         Down  = q.Down;
 274     }
 275     QuoteCls& operator=(const QuoteCls& q) { // assignment constructor
 276         if (this != &q) {
 277             Count = q.Count;
 278             Up    = q.Up;
 279             Down  = q.Down;
 280         }
 281                 return *this;
 282     }
 283
 284 };
 285
 286
 287 static void enterInnerExpression(int  *p_inner_string_types,
 288                                  int  *p_inner_expn_brace_counts,
 289                                  QuoteCls *p_inner_quotes,
 290                                  int&  inner_string_count,
 291                                  int&  state,
 292                                  int&  brace_counts,
 293                                  QuoteCls curr_quote
 294                                  ) {
 295     p_inner_string_types[inner_string_count] = state;
 296     state = SCE_RB_DEFAULT;
 297     p_inner_expn_brace_counts[inner_string_count] = brace_counts;
 298     brace_counts = 0;
 299     p_inner_quotes[inner_string_count] = curr_quote;
 300     ++inner_string_count;
 301 }
 302
 303 static void exitInnerExpression(int *p_inner_string_types,
 304                                  int *p_inner_expn_brace_counts,
 305                                  QuoteCls *p_inner_quotes,
 306                                  int& inner_string_count,
 307                                  int& state,
 308                                  int&  brace_counts,
 309                                  QuoteCls& curr_quote
 310                                 ) {
 311     --inner_string_count;
 312     state = p_inner_string_types[inner_string_count];
 313     brace_counts = p_inner_expn_brace_counts[inner_string_count];
 314     curr_quote = p_inner_quotes[inner_string_count];
 315 }
 316
 317 static bool isEmptyLine(int pos,
 318                         Accessor &styler) {
 319         int spaceFlags = 0;
 320         int lineCurrent = styler.GetLine(pos);
 321         int indentCurrent = styler.IndentAmount(lineCurrent, &spaceFlags, NULL);
 322     return (indentCurrent & SC_FOLDLEVELWHITEFLAG) != 0;
 323 }
 324
 325 static bool RE_CanFollowKeyword(const char *keyword) {
 326     if (!strcmp(keyword, "and")
 327         || !strcmp(keyword, "begin")
 328         || !strcmp(keyword, "break")
 329         || !strcmp(keyword, "case")
 330         || !strcmp(keyword, "do")
 331         || !strcmp(keyword, "else")
 332         || !strcmp(keyword, "elsif")
 333         || !strcmp(keyword, "if")
 334         || !strcmp(keyword, "next")
 335         || !strcmp(keyword, "return")
 336         || !strcmp(keyword, "when")
 337         || !strcmp(keyword, "unless")
 338         || !strcmp(keyword, "until")
 339         || !strcmp(keyword, "not")
 340         || !strcmp(keyword, "or")) {
 341         return true;
 342     }
 343     return false;
 344 }
 345
 346 // Look at chars up to but not including endPos
 347 // Don't look at styles in case we're looking forward
 348
 349 static int skipWhitespace(int startPos,
 350                            int endPos,
 351                            Accessor &styler) {
 352     for (int i = startPos; i < endPos; i++) {
 353         if (!iswhitespace(styler[i])) {
 354             return i;
 355         }
 356     }
 357     return endPos;
 358 }
 359
 360 // This routine looks for false positives like
 361 // undef foo, <<
 362 // There aren't too many.
 363 //
 364 // iPrev points to the start of <<
 365
 366 static bool sureThisIsHeredoc(int iPrev,
 367                               Accessor &styler,
 368                               char *prevWord) {
 369
 370     // Not so fast, since Ruby's so dynamic.  Check the context
 371     // to make sure we're OK.
 372     int prevStyle;
 373     int lineStart = styler.GetLine(iPrev);
 374     int lineStartPosn = styler.LineStart(lineStart);
 375     styler.Flush();
 376
 377     // Find the first word after some whitespace
 378     int firstWordPosn = skipWhitespace(lineStartPosn, iPrev, styler);
 379     if (firstWordPosn >= iPrev) {
 380         // Have something like {^     <<}
 381                 //XXX Look at the first previous non-comment non-white line
 382                 // to establish the context.  Not too likely though.
 383         return true;
 384     } else {
 385         switch (prevStyle = styler.StyleAt(firstWordPosn)) {
 386         case SCE_RB_WORD:
 387         case SCE_RB_WORD_DEMOTED:
 388         case SCE_RB_IDENTIFIER:
 389             break;
 390         default:
 391             return true;
 392         }
 393     }
 394     int firstWordEndPosn = firstWordPosn;
 395     char *dst = prevWord;
 396     for (;;) {
 397         if (firstWordEndPosn >= iPrev ||
 398             styler.StyleAt(firstWordEndPosn) != prevStyle) {
 399             *dst = 0;
 400             break;
 401         }
 402         *dst++ = styler[firstWordEndPosn];
 403         firstWordEndPosn += 1;
 404     }
 405     //XXX Write a style-aware thing to regex scintilla buffer objects
 406     if (!strcmp(prevWord, "undef")
 407         || !strcmp(prevWord, "def")
 408         || !strcmp(prevWord, "alias")) {
 409         // These keywords are what we were looking for
 410         return false;
 411     }
 412     return true;
 413 }
 414
 415 // Routine that saves us from allocating a buffer for the here-doc target
 416 // targetEndPos points one past the end of the current target
 417 static bool haveTargetMatch(int currPos,
 418                             int lengthDoc,
 419                             int targetStartPos,
 420                             int targetEndPos,
 421                             Accessor &styler) {
 422     if (lengthDoc - currPos < targetEndPos - targetStartPos) {
 423         return false;
 424     }
 425     int i, j;
 426     for (i = targetStartPos, j = currPos;
 427          i < targetEndPos && j < lengthDoc;
 428          i++, j++) {
 429         if (styler[i] != styler[j]) {
 430             return false;
 431         }
 432     }
 433     return true;
 434 }
 435
  436 // We need a check because the form
  437 // [identifier] <<[target]
  438 // is ambiguous.  The Ruby lexer/parser resolves it by
  439 // looking to see if [identifier] names a variable or a
  440 // function.  If it's a function, it's the start of a here-doc.
  441 // If it's a var, it's an operator.  This lexer doesn't
  442 // maintain a symbol table, so it looks ahead to see what's
  443 // going on, in cases where we have
  444 // ^[white-space]*[identifier([.|::]identifier)*][white-space]*<<[target]
  445 //
  446 // If [target] doesn't start one of the following lines, assume we don't have a here-doc.
 447
 448 // return true == yes, we have no heredocs
 449
  450 static bool sureThisIsNotHeredoc(int lt2StartPos,
  451                                  Accessor &styler) {
          // Decides whether the '<<' starting at lt2StartPos is NOT a here-doc.
          // Returns true ("definitely not") or false ("looks like one").
          // Styles are read for text before the '<<'; from the target onwards
          // only characters are read.
  452     int prevStyle;
  453      // Use full document, not just part we're styling
  454     int lengthDoc = styler.Length();
  455     int lineStart = styler.GetLine(lt2StartPos);
  456     int lineStartPosn = styler.LineStart(lineStart);
  457     styler.Flush();
          // Named return values, to keep the many exits below readable.
  458     const bool definitely_not_a_here_doc = true;
  459     const bool looks_like_a_here_doc = false;
  460
  461     // Find the first word after some whitespace
  462     int firstWordPosn = skipWhitespace(lineStartPosn, lt2StartPos, styler);
  463     if (firstWordPosn >= lt2StartPos) {
              // Nothing but blanks precedes the '<<' on this line.
  464         return definitely_not_a_here_doc;
  465     }
  466     prevStyle = styler.StyleAt(firstWordPosn);
  467     // The line must start with an identifier, instance variable or
          // class variable; anything else (e.g. a keyword) means no here-doc
  468     if (prevStyle != SCE_RB_IDENTIFIER
  469         && prevStyle != SCE_RB_INSTANCE_VAR
  470         && prevStyle != SCE_RB_CLASS_VAR) {
  471         return definitely_not_a_here_doc;
  472     }
  473     int newStyle = prevStyle;
  474     // Some compilers incorrectly warn about uninit newStyle
          // Outer loop: one pass per name segment of name(.name|::name)*
  475     for (firstWordPosn += 1; firstWordPosn <= lt2StartPos; firstWordPosn += 1) {
  476         // Inner loop looks at the name: advance while the style is unchanged
  477         for (; firstWordPosn <= lt2StartPos; firstWordPosn += 1) {
  478             newStyle = styler.StyleAt(firstWordPosn);
  479             if (newStyle != prevStyle) {
  480                 break;
  481             }
  482         }
  483         // Do we have '::' or '.'?
  484         if (firstWordPosn < lt2StartPos && newStyle == SCE_RB_OPERATOR) {
  485             char ch = styler[firstWordPosn];
  486             if (ch == '.') {
  487                 // yes
  488             } else if (ch == ':') {
                      // A single ':' is not a separator: the next character
                      // must also be an operator-styled ':'
  489                 if (styler.StyleAt(++firstWordPosn) != SCE_RB_OPERATOR) {
  490                     return definitely_not_a_here_doc;
  491                 } else if (styler[firstWordPosn] != ':') {
  492                     return definitely_not_a_here_doc;
  493                 }
  494             } else {
  495                 break;
  496             }
  497         } else {
  498             break;
  499         }
  500         // on second and next passes, only identifiers may appear since
  501         // class and instance variable are private
  502         prevStyle = SCE_RB_IDENTIFIER;
  503     }
  504     // Skip next batch of white-space
  505     firstWordPosn = skipWhitespace(firstWordPosn, lt2StartPos, styler);
  506     if (firstWordPosn != lt2StartPos) {
  507         // Have [[^ws[identifier]ws[*something_else*]ws<<
  508         return definitely_not_a_here_doc;
  509     }
  510     // OK, now 'j' will point to the current spot moving ahead.
          // firstWordPosn equals lt2StartPos here, so j starts one past it.
  511         int j = firstWordPosn + 1;
  512     if (styler.StyleAt(j) != SCE_RB_OPERATOR || styler[j] != '<') {
  513         // This shouldn't happen
  514         return definitely_not_a_here_doc;
  515     }
  516     int nextLineStartPosn = styler.LineStart(lineStart + 1);
  517     if (nextLineStartPosn >= lengthDoc) {
              // There is no following line that could hold the terminator.
  518         return definitely_not_a_here_doc;
  519     }
  520     j = skipWhitespace(j + 1, nextLineStartPosn, styler);
  521     if (j >= lengthDoc) {
  522         return definitely_not_a_here_doc;
  523     }
  524     bool allow_indent;
  525     int target_start, target_end;
  526     // From this point on no more styling, since we're looking ahead
          // A leading '-' (the '<<-' form) permits an indented terminator.
  527     if (styler[j] == '-') {
  528         allow_indent = true;
  529         j++;
  530     } else {
  531         allow_indent = false;
  532     }
  533
  534     // Allow for quoted targets.
  535     char target_quote = 0;
  536     switch (styler[j]) {
  537     case '\'':
  538     case '"':
  539     case '`':
  540         target_quote = styler[j];
  541         j += 1;
  542     }
  543
          // The target must begin with an ASCII letter, digit or underscore.
  544     if (isSafeAlnum(styler[j])) {
  545         // Init target_end because some compilers think it won't
  546         // be initialized by the time it's used
  547         target_start = target_end = j;
  548         j++;
  549     } else {
  550         return definitely_not_a_here_doc;
  551     }
          // Scan to the end of the target, then require that only blanks and
          // an optional '#' comment follow it on the line.
  552     for (; j < lengthDoc; j++) {
  553         if (!isSafeAlnum(styler[j])) {
  554             if (target_quote && styler[j] != target_quote) {
  555                 // unquoted end
  556                 return definitely_not_a_here_doc;
  557             }
  558
  559             // And for now make sure that it's a newline
  560             // don't handle arbitrary expressions yet
  561
  562             target_end = j;
  563                         if (target_quote) {
  564                                 // Now we can move to the character after the string delimiter.
  565                                 j += 1;
  566                         }
  567             j = skipWhitespace(j, lengthDoc, styler);
  568             if (j >= lengthDoc) {
  569                 return definitely_not_a_here_doc;
  570             } else {
  571                 char ch = styler[j];
  572                 if (ch == '#' || isEOLChar(ch)) {
  573                     // This is OK, so break and continue;
  574                     break;
  575                 } else {
  576                     return definitely_not_a_here_doc;
  577                 }
  578             }
  579         }
  580     }
  581
  582     // Just look at the start of each line
  583     int last_line = styler.GetLine(lengthDoc - 1);
  584     // But don't go too far: search at most the next 50 lines
  585     if (last_line > lineStart + 50) {
  586         last_line = lineStart + 50;
  587     }
  588     for (int line_num = lineStart + 1; line_num <= last_line; line_num++) {
  589         if (allow_indent) {
                  // '<<-' form: the terminator may be preceded by blanks
  590             j = skipWhitespace(styler.LineStart(line_num), lengthDoc, styler);
  591         } else {
  592             j = styler.LineStart(line_num);
  593         }
  594         // target_end is one past the end
  595         if (haveTargetMatch(j, lengthDoc, target_start, target_end, styler)) {
  596             // We got it
  597             return looks_like_a_here_doc;
  598         }
  599     }
  600     return definitely_not_a_here_doc;
  601 }
 602
 603 //todo: if we aren't looking at a stdio character,
 604 // move to the start of the first line that is not in a
 605 // multi-line construct
 606
 607 static void synchronizeDocStart(unsigned int& startPos,
 608                                 int &length,
 609                                 int &initStyle,
 610                                 Accessor &styler,
 611                                 bool skipWhiteSpace=false) {
 612
 613     styler.Flush();
 614     int style = actual_style(styler.StyleAt(startPos));
 615     switch (style) {
 616         case SCE_RB_STDIN:
 617         case SCE_RB_STDOUT:
 618         case SCE_RB_STDERR:
 619             // Don't do anything else with these.
 620             return;
 621     }
 622
 623     int pos = startPos;
 624     // Quick way to characterize each line
 625     int lineStart;
 626     for (lineStart = styler.GetLine(pos); lineStart > 0; lineStart--) {
 627         // Now look at the style before the previous line's EOL
 628         pos = styler.LineStart(lineStart) - 1;
 629         if (pos <= 10) {
 630             lineStart = 0;
 631             break;
 632         }
 633         char ch = styler.SafeGetCharAt(pos);
 634         char chPrev = styler.SafeGetCharAt(pos - 1);
 635         if (ch == '\n' && chPrev == '\r') {
 636             pos--;
 637         }
 638         if (styler.SafeGetCharAt(pos - 1) == '\\') {
 639             // Continuation line -- keep going
 640         } else if (actual_style(styler.StyleAt(pos)) != SCE_RB_DEFAULT) {
 641             // Part of multi-line construct -- keep going
 642         } else if (currLineContainsHereDelims(pos, styler)) {
 643             // Keep going, with pos and length now pointing
 644             // at the end of the here-doc delimiter
 645         } else if (skipWhiteSpace && isEmptyLine(pos, styler)) {
 646             // Keep going
 647         } else {
 648             break;
 649         }
 650     }
 651     pos = styler.LineStart(lineStart);
 652     length += (startPos - pos);
 653     startPos = pos;
 654     initStyle = SCE_RB_DEFAULT;
 655 }
 656
 657 static void ColouriseRbDoc(unsigned int startPos, int length, int initStyle,
 658                                                    WordList *keywordlists[], Accessor &styler) {
 659
 660         // Lexer for Ruby often has to backtrack to start of current style to determine
 661         // which characters are being used as quotes, how deeply nested is the
 662         // start position and what the termination string is for here documents
 663
 664         WordList &keywords = *keywordlists[0];
 665
 666         class HereDocCls {
 667         public:
 668                 int State;
 669         // States
 670         // 0: '<<' encountered
 671                 // 1: collect the delimiter
 672         // 1b: text between the end of the delimiter and the EOL
 673                 // 2: here doc text (lines after the delimiter)
 674                 char Quote;             // the char after '<<'
 675                 bool Quoted;            // true if Quote in ('\'','"','`')
 676                 int DelimiterLength;    // strlen(Delimiter)
 677                 char Delimiter[256];    // the Delimiter, limit of 256: from Perl
 678         bool CanBeIndented;
 679                 HereDocCls() {
 680                         State = 0;
 681                         DelimiterLength = 0;
 682                         Delimiter[0] = '\0';
 683             CanBeIndented = false;
 684                 }
 685         };
 686         HereDocCls HereDoc;
 687
 688         QuoteCls Quote;
 689
 690     int numDots = 0;  // For numbers --
 691                       // Don't start lexing in the middle of a num
 692
 693     synchronizeDocStart(startPos, length, initStyle, styler, // ref args
 694                         false);
 695
 696         bool preferRE = true;
 697     int state = initStyle;
 698         int lengthDoc = startPos + length;
 699
 700         char prevWord[MAX_KEYWORD_LENGTH + 1]; // 1 byte for zero
 701         prevWord[0] = '\0';
 702         if (length == 0)
 703                 return;
 704
 705         char chPrev = styler.SafeGetCharAt(startPos - 1);
 706         char chNext = styler.SafeGetCharAt(startPos);
 707         bool is_real_number = true;   // Differentiate between constants and ?-sequences.
 708         styler.StartAt(startPos);
 709         styler.StartSegment(startPos);
 710
 711     static int q_states[] = {SCE_RB_STRING_Q,
 712                              SCE_RB_STRING_QQ,
 713                              SCE_RB_STRING_QR,
 714                              SCE_RB_STRING_QW,
 715                              SCE_RB_STRING_QW,
 716                              SCE_RB_STRING_QX};
 717     static const char* q_chars = "qQrwWx";
 718
 719     // In most cases a value of 2 should be ample for the code in the
 720     // Ruby library, and the code the user is likely to enter.
 721     // For example,
 722     // fu_output_message "mkdir #{options[:mode] ? ('-m %03o ' % options[:mode]) : ''}#{list.join ' '}"
 723     //     if options[:verbose]
 724     // from fileutils.rb nests to a level of 2
 725     // If the user actually hits a 6th occurrence of '#{' in a double-quoted
 726     // string (including regex'es, %Q, %<sym>, %w, and other strings
 727     // that interpolate), it will stay as a string.  The problem with this
 728     // is that quotes might flip, a 7th '#{' will look like a comment,
 729     // and code-folding might be wrong.
 730
 731     // If anyone runs into this problem, I recommend raising this
 732     // value slightly higher to replacing the fixed array with a linked
 733     // list.  Keep in mind this code will be called every time the lexer
 734     // is invoked.
 735
 736 #define INNER_STRINGS_MAX_COUNT 5
 737     // These vars track our instances of "...#{,,,%Q<..#{,,,}...>,,,}..."
 738     int inner_string_types[INNER_STRINGS_MAX_COUNT];
 739     // Track # braces when we push a new #{ thing
 740     int inner_expn_brace_counts[INNER_STRINGS_MAX_COUNT];
 741     QuoteCls inner_quotes[INNER_STRINGS_MAX_COUNT];
 742     int inner_string_count = 0;
 743     int brace_counts = 0;   // Number of #{ ... } things within an expression
 744
 745     int i;
 746         for (i = 0; i < INNER_STRINGS_MAX_COUNT; i++) {
 747         inner_string_types[i] = 0;
 748         inner_expn_brace_counts[i] = 0;
 749     }
 750         for (i = startPos; i < lengthDoc; i++) {
 751                 char ch = chNext;
 752                 chNext = styler.SafeGetCharAt(i + 1);
 753                 char chNext2 = styler.SafeGetCharAt(i + 2);
 754
 755         if (styler.IsLeadByte(ch)) {
 756                         chNext = chNext2;
 757                         chPrev = ' ';
 758                         i += 1;
 759                         continue;
 760                 }
 761
 762         // skip on DOS/Windows
 763         //No, don't, because some things will get tagged on,
 764         // so we won't recognize keywords, for example
 765 #if 0
 766                 if (ch == '\r' && chNext == '\n') {
 767                 continue;
 768         }
 769 #endif
 770
 771         if (HereDoc.State == 1 && isEOLChar(ch)) {
 772                         // Begin of here-doc (the line after the here-doc delimiter):
 773                         HereDoc.State = 2;
 774                         styler.ColourTo(i-1, state);
 775             // Don't check for a missing quote, just jump into
 776             // the here-doc state
 777             state = SCE_RB_HERE_Q;
 778         }
 779
 780         // Regular transitions
 781                 if (state == SCE_RB_DEFAULT) {
 782             if (isSafeDigit(ch)) {
 783                 styler.ColourTo(i - 1, state);
 784                                 state = SCE_RB_NUMBER;
 785                 is_real_number = true;
 786                 numDots = 0;
 787             } else if (isHighBitChar(ch) || iswordstart(ch)) {
 788                 styler.ColourTo(i - 1, state);
 789                                 state = SCE_RB_WORD;
 790                         } else if (ch == '#') {
 791                                 styler.ColourTo(i - 1, state);
 792                                 state = SCE_RB_COMMENTLINE;
 793                         } else if (ch == '=') {
 794                                 // =begin indicates the start of a comment (doc) block
 795                 if ((i == 0 || isEOLChar(chPrev))
 796                     && chNext == 'b'
 797                     && styler.SafeGetCharAt(i + 2) == 'e'
 798                     && styler.SafeGetCharAt(i + 3) == 'g'
 799                     && styler.SafeGetCharAt(i + 4) == 'i'
 800                     && styler.SafeGetCharAt(i + 5) == 'n'
 801                     && !isSafeWordcharOrHigh(styler.SafeGetCharAt(i + 6))) {
 802                     styler.ColourTo(i - 1, state);
 803                     state = SCE_RB_POD;
 804                                 } else {
 805                                         styler.ColourTo(i - 1, state);
 806                                         styler.ColourTo(i, SCE_RB_OPERATOR);
 807                                         preferRE = true;
 808                                 }
 809                         } else if (ch == '"') {
 810                                 styler.ColourTo(i - 1, state);
 811                                 state = SCE_RB_STRING;
 812                                 Quote.New();
 813                                 Quote.Open(ch);
 814                         } else if (ch == '\'') {
 815                 styler.ColourTo(i - 1, state);
 816                 state = SCE_RB_CHARACTER;
 817                 Quote.New();
 818                 Quote.Open(ch);
 819                         } else if (ch == '`') {
 820                                 styler.ColourTo(i - 1, state);
 821                                 state = SCE_RB_BACKTICKS;
 822                                 Quote.New();
 823                                 Quote.Open(ch);
 824                         } else if (ch == '@') {
 825                 // Instance or class var
 826                                 styler.ColourTo(i - 1, state);
 827                 if (chNext == '@') {
 828                     state = SCE_RB_CLASS_VAR;
 829                     advance_char(i, ch, chNext, chNext2); // pass by ref
 830                 } else {
 831                     state = SCE_RB_INSTANCE_VAR;
 832                 }
 833                         } else if (ch == '$') {
 834                 // Check for a builtin global
 835                                 styler.ColourTo(i - 1, state);
 836                 // Recognize it bit by bit
 837                 state = SCE_RB_GLOBAL;
 838             } else if (ch == '/' && preferRE) {
 839                 // Ambigous operator
 840                                 styler.ColourTo(i - 1, state);
 841                                 state = SCE_RB_REGEX;
 842                 Quote.New();
 843                 Quote.Open(ch);
 844                         } else if (ch == '<' && chNext == '<' && chNext2 != '=') {
 845
 846                 // Recognise the '<<' symbol - either a here document or a binary op
 847                                 styler.ColourTo(i - 1, state);
 848                 i++;
 849                 chNext = chNext2;
 850                                 styler.ColourTo(i, SCE_RB_OPERATOR);
 851
 852                 if (! (strchr("\"\'`_-", chNext2) || isSafeAlpha(chNext2))) {
 853                     // It's definitely not a here-doc,
 854                     // based on Ruby's lexer/parser in the
 855                     // heredoc_identifier routine.
 856                     // Nothing else to do.
 857                 } else if (preferRE) {
 858                     if (sureThisIsHeredoc(i - 1, styler, prevWord)) {
 859                         state = SCE_RB_HERE_DELIM;
 860                         HereDoc.State = 0;
 861                     }
 862                     // else leave it in default state
 863                 } else {
 864                     if (sureThisIsNotHeredoc(i - 1, styler)) {
 865                         // leave state as default
 866                         // We don't have all the heuristics Perl has for indications
 867                         // of a here-doc, because '<<' is overloadable and used
 868                         // for so many other classes.
 869                     } else {
 870                         state = SCE_RB_HERE_DELIM;
 871                         HereDoc.State = 0;
 872                     }
 873                 }
 874                 preferRE = (state != SCE_RB_HERE_DELIM);
 875             } else if (ch == ':') {
 876                                 styler.ColourTo(i - 1, state);
 877                 if (chNext == ':') {
 878                     // Mark "::" as an operator, not symbol start
 879                     styler.ColourTo(i + 1, SCE_RB_OPERATOR);
 880                     advance_char(i, ch, chNext, chNext2); // pass by ref
 881                     state = SCE_RB_DEFAULT;
 882                                         preferRE = false;
 883                 } else if (isSafeWordcharOrHigh(chNext)) {
 884                                         state = SCE_RB_SYMBOL;
 885                 } else if ((chNext == '@' || chNext == '$') &&
 886                             isSafeWordcharOrHigh(chNext2)) {
 887                     // instance and global variable followed by an identifier
 888                     advance_char(i, ch, chNext, chNext2);
 889                     state = SCE_RB_SYMBOL;
 890                 } else if (((chNext == '@' && chNext2 == '@')  ||
 891                             (chNext == '$' && chNext2 == '-')) &&
 892                            isSafeWordcharOrHigh(styler.SafeGetCharAt(i+3))) {
 893                     // class variables and special global variable "$-IDENTCHAR"
 894                     state = SCE_RB_SYMBOL;
 895                     // $-IDENTCHAR doesn't continue past the IDENTCHAR
 896                     if (chNext == '$') {
 897                         styler.ColourTo(i+3, SCE_RB_SYMBOL);
 898                         state = SCE_RB_DEFAULT;
 899                     }
 900                     i += 3;
 901                     ch = styler.SafeGetCharAt(i);
 902                     chNext = styler.SafeGetCharAt(i+1);
 903                 } else if (chNext == '$' && strchr("_~*$?!@/\\;,.=:<>\"&`'+", chNext2)) {
 904                     // single-character special global variables
 905                     i += 2;
 906                     ch = chNext2;
 907                     chNext = styler.SafeGetCharAt(i+1);
 908                     styler.ColourTo(i, SCE_RB_SYMBOL);
 909                     state = SCE_RB_DEFAULT;
 910                 } else if (strchr("[*!~+-*/%=<>&^|", chNext)) {
 911                     // Do the operator analysis in-line, looking ahead
 912                     // Based on the table in pickaxe 2nd ed., page 339
 913                     bool doColoring = true;
 914                     switch (chNext) {
 915                     case '[':
 916                         if (chNext2 == ']' ) {
 917                             char ch_tmp = styler.SafeGetCharAt(i + 3);
 918                             if (ch_tmp == '=') {
 919                                 i += 3;
 920                                 ch = ch_tmp;
 921                                 chNext = styler.SafeGetCharAt(i + 1);
 922                             } else {
 923                                 i += 2;
 924                                 ch = chNext2;
 925                                 chNext = ch_tmp;
 926                             }
 927                         } else {
 928                             doColoring = false;
 929                         }
 930                         break;
 931
 932                     case '*':
 933                         if (chNext2 == '*') {
 934                             i += 2;
 935                             ch = chNext2;
 936                             chNext = styler.SafeGetCharAt(i + 1);
 937                         } else {
 938                             advance_char(i, ch, chNext, chNext2);
 939                         }
 940                         break;
 941
 942                     case '!':
 943                         if (chNext2 == '=' || chNext2 == '~') {
 944                             i += 2;
 945                             ch = chNext2;
 946                             chNext = styler.SafeGetCharAt(i + 1);
 947                         } else {
 948                             advance_char(i, ch, chNext, chNext2);
 949                         }
 950                         break;
 951
 952                     case '<':
 953                         if (chNext2 == '<') {
 954                             i += 2;
 955                             ch = chNext2;
 956                             chNext = styler.SafeGetCharAt(i + 1);
 957                         } else if (chNext2 == '=') {
 958                             char ch_tmp = styler.SafeGetCharAt(i + 3);
 959                             if (ch_tmp == '>') {  // <=> operator
 960                                 i += 3;
 961                                 ch = ch_tmp;
 962                                 chNext = styler.SafeGetCharAt(i + 1);
 963                             } else {
 964                                 i += 2;
 965                                 ch = chNext2;
 966                                 chNext = ch_tmp;
 967                             }
 968                         } else {
 969                             advance_char(i, ch, chNext, chNext2);
 970                         }
 971                         break;
 972
 973                     default:
 974                         // Simple one-character operators
 975                         advance_char(i, ch, chNext, chNext2);
 976                         break;
 977                     }
 978                     if (doColoring) {
 979                         styler.ColourTo(i, SCE_RB_SYMBOL);
 980                         state = SCE_RB_DEFAULT;
 981                     }
 982                                 } else if (!preferRE) {
 983                                         // Don't color symbol strings (yet)
 984                                         // Just color the ":" and color rest as string
 985                                         styler.ColourTo(i, SCE_RB_SYMBOL);
 986                                         state = SCE_RB_DEFAULT;
 987                 } else {
 988                     styler.ColourTo(i, SCE_RB_OPERATOR);
 989                     state = SCE_RB_DEFAULT;
 990                     preferRE = true;
 991                 }
 992             } else if (ch == '%') {
 993                 styler.ColourTo(i - 1, state);
 994                 bool have_string = false;
 995                 if (strchr(q_chars, chNext) && !isSafeWordcharOrHigh(chNext2)) {
 996                     Quote.New();
 997                     const char *hit = strchr(q_chars, chNext);
 998                     if (hit != NULL) {
 999                         state = q_states[hit - q_chars];
1000                         Quote.Open(chNext2);
1001                         i += 2;
1002                         ch = chNext2;
1003                                                 chNext = styler.SafeGetCharAt(i + 1);
1004                         have_string = true;
1005                     }
1006                 } else if (preferRE && !isSafeWordcharOrHigh(chNext)) {
1007                     // Ruby doesn't allow high bit chars here,
1008                     // but the editor host might
1009                     Quote.New();
1010                     state = SCE_RB_STRING_QQ;
1011                     Quote.Open(chNext);
1012                     advance_char(i, ch, chNext, chNext2); // pass by ref
1013                     have_string = true;
1014                 } else if (!isSafeWordcharOrHigh(chNext) && !iswhitespace(chNext) && !isEOLChar(chNext)) {
1015                     // Ruby doesn't allow high bit chars here,
1016                     // but the editor host might
1017                     Quote.New();
1018                     state = SCE_RB_STRING_QQ;
1019                     Quote.Open(chNext);
1020                     advance_char(i, ch, chNext, chNext2); // pass by ref
1021                     have_string = true;
1022                 }
1023                 if (!have_string) {
1024                     styler.ColourTo(i, SCE_RB_OPERATOR);
1025                     // stay in default
1026                     preferRE = true;
1027                 }
1028             } else if (ch == '?') {
1029                 styler.ColourTo(i - 1, state);
1030                 if (iswhitespace(chNext) || chNext == '\n' || chNext == '\r') {
1031                     styler.ColourTo(i, SCE_RB_OPERATOR);
1032                 } else {
1033                     // It's the start of a character code escape sequence
1034                     // Color it as a number.
1035                     state = SCE_RB_NUMBER;
1036                     is_real_number = false;
1037                 }
1038             } else if (isoperator(ch) || ch == '.') {
1039                                 styler.ColourTo(i - 1, state);
1040                                 styler.ColourTo(i, SCE_RB_OPERATOR);
1041                 // If we're ending an expression or block,
1042                 // assume it ends an object, and the ambivalent
1043                 // constructs are binary operators
1044                 //
1045                 // So if we don't have one of these chars,
1046                 // we aren't ending an object exp'n, and ops
1047                 // like : << / are unary operators.
1048
1049                 if (ch == '{') {
1050                     ++brace_counts;
1051                     preferRE = true;
1052                 } else if (ch == '}' && --brace_counts < 0
1053                            && inner_string_count > 0) {
1054                     styler.ColourTo(i, SCE_RB_OPERATOR);
1055                     exitInnerExpression(inner_string_types,
1056                                         inner_expn_brace_counts,
1057                                         inner_quotes,
1058                                         inner_string_count,
1059                                         state, brace_counts, Quote);
1060                 } else {
1061                     preferRE = (strchr(")}].", ch) == NULL);
1062                 }
1063                 // Stay in default state
1064             } else if (isEOLChar(ch)) {
1065                 // Make sure it's a true line-end, with no backslash
1066                 if ((ch == '\r' || (ch == '\n' && chPrev != '\r'))
1067                     && chPrev != '\\') {
1068                     // Assume we've hit the end of the statement.
1069                     preferRE = true;
1070                 }
1071             }
1072         } else if (state == SCE_RB_WORD) {
1073             if (ch == '.' || !isSafeWordcharOrHigh(ch)) {
1074                 // Words include x? in all contexts,
1075                 // and <letters>= after either 'def' or a dot
1076                 // Move along until a complete word is on our left
1077
1078                 // Default accessor treats '.' as word-chars,
1079                 // but we don't for now.
1080
1081                 if (ch == '='
1082                     && isSafeWordcharOrHigh(chPrev)
1083                     && (chNext == '('
1084                         || strchr(" \t\n\r", chNext) != NULL)
1085                     && (!strcmp(prevWord, "def")
1086                         || followsDot(styler.GetStartSegment(), styler))) {
1087                     // <name>= is a name only when being def'd -- Get it the next time
1088                     // This means that <name>=<name> is always lexed as
1089                     // <name>, (op, =), <name>
1090                 } else if ((ch == '?' || ch == '!')
1091                            && isSafeWordcharOrHigh(chPrev)
1092                            && !isSafeWordcharOrHigh(chNext)) {
1093                     // <name>? is a name -- Get it the next time
1094                     // But <name>?<name> is always lexed as
1095                     // <name>, (op, ?), <name>
1096                     // Same with <name>! to indicate a method that
1097                     // modifies its target
1098                 } else if (isEOLChar(ch)
1099                            && isMatch(styler, lengthDoc, i - 7, "__END__")) {
1100                     styler.ColourTo(i, SCE_RB_DATASECTION);
1101                     state = SCE_RB_DATASECTION;
1102                     // No need to handle this state -- we'll just move to the end
1103                     preferRE = false;
1104                 } else {
1105                                         int wordStartPos = styler.GetStartSegment();
1106                     int word_style = ClassifyWordRb(wordStartPos, i - 1, keywords, styler, prevWord);
1107                     switch (word_style) {
1108                         case SCE_RB_WORD:
1109                             preferRE = RE_CanFollowKeyword(prevWord);
1110                                                         break;
1111
1112                         case SCE_RB_WORD_DEMOTED:
1113                             preferRE = true;
1114                                                         break;
1115
1116                         case SCE_RB_IDENTIFIER:
1117                             if (isMatch(styler, lengthDoc, wordStartPos, "print")) {
1118                                 preferRE = true;
1119                             } else if (isEOLChar(ch)) {
1120                                 preferRE = true;
1121                             } else {
1122                                 preferRE = false;
1123                             }
1124                                                         break;
1125                         default:
1126                             preferRE = false;
1127                     }
1128                     if (ch == '.') {
1129                         // We might be redefining an operator-method
1130                         preferRE = false;
1131                     }
1132                     // And if it's the first
1133                     redo_char(i, ch, chNext, chNext2, state); // pass by ref
1134                 }
1135             }
1136         } else if (state == SCE_RB_NUMBER) {
1137             if (!is_real_number) {
1138                 if (ch != '\\') {
1139                     styler.ColourTo(i, state);
1140                     state = SCE_RB_DEFAULT;
1141                     preferRE = false;
1142                 } else if (strchr("\\ntrfvaebs", chNext)) {
1143                     // Terminal escape sequence -- handle it next time
1144                     // Nothing more to do this time through the loop
1145                 } else if (chNext == 'C' || chNext == 'M') {
1146                     if (chNext2 != '-') {
1147                         // \C or \M ends the sequence -- handle it next time
1148                     } else {
1149                         // Move from abc?\C-x
1150                         //               ^
1151                         // to
1152                         //                 ^
1153                         i += 2;
1154                         ch = chNext2;
1155                         chNext = styler.SafeGetCharAt(i + 1);
1156                     }
1157                 } else if (chNext == 'c') {
1158                     // Stay here, \c is a combining sequence
1159                     advance_char(i, ch, chNext, chNext2); // pass by ref
1160                 } else {
1161                     // ?\x, including ?\\ is final.
1162                     styler.ColourTo(i + 1, state);
1163                     state = SCE_RB_DEFAULT;
1164                     preferRE = false;
1165                     advance_char(i, ch, chNext, chNext2);
1166                 }
1167             } else if (isSafeAlnumOrHigh(ch) || ch == '_') {
1168                 // Keep going
1169             } else if (ch == '.' && chNext == '.') {
1170                 ++numDots;
1171                 styler.ColourTo(i - 1, state);
1172                 redo_char(i, ch, chNext, chNext2, state); // pass by ref
1173             } else if (ch == '.' && ++numDots == 1) {
1174                 // Keep going
1175             } else {
1176                 styler.ColourTo(i - 1, state);
1177                 redo_char(i, ch, chNext, chNext2, state); // pass by ref
1178                 preferRE = false;
1179             }
1180         } else if (state == SCE_RB_COMMENTLINE) {
1181                         if (isEOLChar(ch)) {
1182                 styler.ColourTo(i - 1, state);
1183                 state = SCE_RB_DEFAULT;
1184                 // Use whatever setting we had going into the comment
1185             }
1186         } else if (state == SCE_RB_HERE_DELIM) {
1187             // See the comment for SCE_RB_HERE_DELIM in LexPerl.cxx
1188             // Slightly different: if we find an immediate '-',
1189             // the target can appear indented.
1190
1191                         if (HereDoc.State == 0) { // '<<' encountered
1192                                 HereDoc.State = 1;
1193                 HereDoc.DelimiterLength = 0;
1194                 if (ch == '-') {
1195                     HereDoc.CanBeIndented = true;
1196                     advance_char(i, ch, chNext, chNext2); // pass by ref
1197                 } else {
1198                     HereDoc.CanBeIndented = false;
1199                 }
1200                 if (isEOLChar(ch)) {
1201                     // Bail out of doing a here doc if there's no target
1202                     state = SCE_RB_DEFAULT;
1203                     preferRE = false;
1204                 } else {
1205                     HereDoc.Quote = ch;
1206
1207                     if (ch == '\'' || ch == '"' || ch == '`') {
1208                         HereDoc.Quoted = true;
1209                         HereDoc.Delimiter[0] = '\0';
1210                     } else {
1211                         HereDoc.Quoted = false;
1212                         HereDoc.Delimiter[0] = ch;
1213                         HereDoc.Delimiter[1] = '\0';
1214                         HereDoc.DelimiterLength = 1;
1215                     }
1216                 }
1217                         } else if (HereDoc.State == 1) { // collect the delimiter
1218                 if (isEOLChar(ch)) {
1219                     // End the quote now, and go back for more
1220                     styler.ColourTo(i - 1, state);
1221                     state = SCE_RB_DEFAULT;
1222                     i--;
1223                     chNext = ch;
1224                     preferRE = false;
1225                 } else if (HereDoc.Quoted) {
1226                                         if (ch == HereDoc.Quote) { // closing quote => end of delimiter
1227                                                 styler.ColourTo(i, state);
1228                                                 state = SCE_RB_DEFAULT;
1229                         preferRE = false;
1230                     } else {
1231                                                 if (ch == '\\' && !isEOLChar(chNext)) {
1232                             advance_char(i, ch, chNext, chNext2);
1233                                                 }
1234                                                 HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch;
1235                                                 HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0';
1236                     }
1237                 } else { // an unquoted here-doc delimiter
1238                                         if (isSafeAlnumOrHigh(ch) || ch == '_') {
1239                                                 HereDoc.Delimiter[HereDoc.DelimiterLength++] = ch;
1240                                                 HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0';
1241                                         } else {
1242                                                 styler.ColourTo(i - 1, state);
1243                         redo_char(i, ch, chNext, chNext2, state);
1244                         preferRE = false;
1245                                         }
1246                 }
1247                                 if (HereDoc.DelimiterLength >= static_cast<int>(sizeof(HereDoc.Delimiter)) - 1) {
1248                                         styler.ColourTo(i - 1, state);
1249                                         state = SCE_RB_ERROR;
1250                     preferRE = false;
1251                                 }
1252             }
1253         } else if (state == SCE_RB_HERE_Q) {
1254             // Not needed: HereDoc.State == 2
1255             // Indentable here docs: look backwards
1256             // Non-indentable: look forwards, like in Perl
1257             //
1258             // Why: so we can quickly resolve things like <<-" abc"
1259
1260             if (!HereDoc.CanBeIndented) {
1261                 if (isEOLChar(chPrev)
1262                     && isMatch(styler, lengthDoc, i, HereDoc.Delimiter)) {
1263                     styler.ColourTo(i - 1, state);
1264                     i += HereDoc.DelimiterLength - 1;
1265                     chNext = styler.SafeGetCharAt(i + 1);
1266                     if (isEOLChar(chNext)) {
1267                         styler.ColourTo(i, SCE_RB_HERE_DELIM);
1268                         state = SCE_RB_DEFAULT;
1269                         HereDoc.State = 0;
1270                         preferRE = false;
1271                     }
1272                     // Otherwise we skipped through the here doc faster.
1273                 }
1274             } else if (isEOLChar(chNext)
1275                        && lookingAtHereDocDelim(styler,
1276                                                 i - HereDoc.DelimiterLength + 1,
1277                                                 lengthDoc,
1278                                                 HereDoc.Delimiter)) {
1279                 styler.ColourTo(i - 1 - HereDoc.DelimiterLength, state);
1280                 styler.ColourTo(i, SCE_RB_HERE_DELIM);
1281                 state = SCE_RB_DEFAULT;
1282                 preferRE = false;
1283                 HereDoc.State = 0;
1284             }
1285         } else if (state == SCE_RB_CLASS_VAR
1286                    || state == SCE_RB_INSTANCE_VAR
1287                    || state == SCE_RB_SYMBOL) {
1288             if (state == SCE_RB_SYMBOL &&
1289                  // FIDs suffices '?' and '!'
1290                 (((ch == '!' || ch == '?') && chNext != '=') ||
1291                  // identifier suffix '='
1292                  (ch == '=' && (chNext != '~' && chNext != '>' &&
1293                                (chNext != '=' || chNext2 == '>'))))) {
1294                 styler.ColourTo(i, state);
1295                 state = SCE_RB_DEFAULT;
1296                 preferRE = false;
1297             } else if (!isSafeWordcharOrHigh(ch)) {
1298                 styler.ColourTo(i - 1, state);
1299                 redo_char(i, ch, chNext, chNext2, state); // pass by ref
1300                 preferRE = false;
1301             }
1302         } else if (state == SCE_RB_GLOBAL) {
1303             if (!isSafeWordcharOrHigh(ch)) {
1304                 // handle special globals here as well
1305                 if (chPrev == '$') {
1306                     if (ch == '-') {
1307                         // Include the next char, like $-a
1308                         advance_char(i, ch, chNext, chNext2);
1309                     }
1310                     styler.ColourTo(i, state);
1311                     state = SCE_RB_DEFAULT;
1312                 } else {
1313                     styler.ColourTo(i - 1, state);
1314                     redo_char(i, ch, chNext, chNext2, state); // pass by ref
1315                 }
1316                 preferRE = false;
1317             }
1318         } else if (state == SCE_RB_POD) {
1319             // PODs end with ^=end\s, -- any whitespace can follow =end
1320             if (strchr(" \t\n\r", ch) != NULL
1321                 && i > 5
1322                 && isEOLChar(styler[i - 5])
1323                 && isMatch(styler, lengthDoc, i - 4, "=end")) {
1324                 styler.ColourTo(i - 1, state);
1325                 state = SCE_RB_DEFAULT;
1326                 preferRE = false;
1327             }
1328         } else if (state == SCE_RB_REGEX || state == SCE_RB_STRING_QR) {
1329             if (ch == '\\' && Quote.Up != '\\') {
1330                 // Skip one
1331                 advance_char(i, ch, chNext, chNext2);
1332             } else if (ch == Quote.Down) {
1333                 Quote.Count--;
1334                 if (Quote.Count == 0) {
1335                     // Include the options
1336                     while (isSafeAlpha(chNext)) {
1337                         i++;
1338                                                 ch = chNext;
1339                         chNext = styler.SafeGetCharAt(i + 1);
1340                     }
1341                     styler.ColourTo(i, state);
1342                     state = SCE_RB_DEFAULT;
1343                     preferRE = false;
1344                 }
1345             } else if (ch == Quote.Up) {
1346                 // Only if close quoter != open quoter
1347                 Quote.Count++;
1348
1349             } else if (ch == '#' ) {
1350                 if (chNext == '{'
1351                     && inner_string_count < INNER_STRINGS_MAX_COUNT) {
1352                     // process #{ ... }
1353                     styler.ColourTo(i - 1, state);
1354                     styler.ColourTo(i + 1, SCE_RB_OPERATOR);
1355                     enterInnerExpression(inner_string_types,
1356                                          inner_expn_brace_counts,
1357                                          inner_quotes,
1358                                          inner_string_count,
1359                                          state,
1360                                          brace_counts,
1361                                          Quote);
1362                     preferRE = true;
1363                     // Skip one
1364                     advance_char(i, ch, chNext, chNext2);
1365                 } else {
1366                     //todo: distinguish comments from pound chars
1367                     // for now, handle as comment
1368                     styler.ColourTo(i - 1, state);
1369                     bool inEscape = false;
1370                     while (++i < lengthDoc) {
1371                         ch = styler.SafeGetCharAt(i);
1372                         if (ch == '\\') {
1373                             inEscape = true;
1374                         } else if (isEOLChar(ch)) {
1375                             // Comment inside a regex
1376                             styler.ColourTo(i - 1, SCE_RB_COMMENTLINE);
1377                             break;
1378                         } else if (inEscape) {
1379                             inEscape = false;  // don't look at char
1380                         } else if (ch == Quote.Down) {
1381                             // Have the regular handler deal with this
1382                             // to get trailing modifiers.
1383                             i--;
1384                             ch = styler[i];
1385                             break;
1386                         }
1387                     }
1388                     chNext = styler.SafeGetCharAt(i + 1);
1389                 }
1390             }
1391         // Quotes of all kinds...
1392         } else if (state == SCE_RB_STRING_Q || state == SCE_RB_STRING_QQ ||
1393                    state == SCE_RB_STRING_QX || state == SCE_RB_STRING_QW ||
1394                    state == SCE_RB_STRING || state == SCE_RB_CHARACTER ||
1395                    state == SCE_RB_BACKTICKS) {
1396             if (!Quote.Down && !isspacechar(ch)) {
1397                 Quote.Open(ch);
1398             } else if (ch == '\\' && Quote.Up != '\\') {
1399                 //Riddle me this: Is it safe to skip *every* escaped char?
1400                 advance_char(i, ch, chNext, chNext2);
1401             } else if (ch == Quote.Down) {
1402                 Quote.Count--;
1403                 if (Quote.Count == 0) {
1404                     styler.ColourTo(i, state);
1405                     state = SCE_RB_DEFAULT;
1406                     preferRE = false;
1407                 }
1408             } else if (ch == Quote.Up) {
1409                 Quote.Count++;
1410             } else if (ch == '#' && chNext == '{'
1411                        && inner_string_count < INNER_STRINGS_MAX_COUNT
1412                        && state != SCE_RB_CHARACTER
1413                        && state != SCE_RB_STRING_Q) {
1414                 // process #{ ... }
1415                 styler.ColourTo(i - 1, state);
1416                 styler.ColourTo(i + 1, SCE_RB_OPERATOR);
1417                 enterInnerExpression(inner_string_types,
1418                                      inner_expn_brace_counts,
1419                                      inner_quotes,
1420                                      inner_string_count,
1421                                      state,
1422                                      brace_counts,
1423                                      Quote);
1424                 preferRE = true;
1425                 // Skip one
1426                 advance_char(i, ch, chNext, chNext2);
1427             }
1428         }
1429
1430         if (state == SCE_RB_ERROR) {
1431             break;
1432         }
1433         chPrev = ch;
1434     }
1435     if (state == SCE_RB_WORD) {
1436         // We've ended on a word, possibly at EOF, and need to
1437         // classify it.
1438         (void) ClassifyWordRb(styler.GetStartSegment(), lengthDoc - 1, keywords, styler, prevWord);
1439     } else {
1440         styler.ColourTo(lengthDoc - 1, state);
1441     }
1442 }
1443
1444 // Helper functions for folding, disambiguation keywords
1445 // Assert that there are no high-bit chars
1446
1447 static void getPrevWord(int pos,
1448                         char *prevWord,
1449                         Accessor &styler,
1450                         int word_state)
1451 {
1452     int i;
1453     styler.Flush();
1454     for (i = pos - 1; i > 0; i--) {
1455         if (actual_style(styler.StyleAt(i)) != word_state) {
1456             i++;
1457             break;
1458         }
1459     }
1460     if (i < pos - MAX_KEYWORD_LENGTH) // overflow
1461         i = pos - MAX_KEYWORD_LENGTH;
1462     char *dst = prevWord;
1463     for (; i <= pos; i++) {
1464         *dst++ = styler[i];
1465     }
1466         *dst = 0;
1467 }
1468
// Return true when prevWord is exactly one of the keywords listed below
// (case-sensitive, whole-string match), false for anything else.
static bool keywordIsAmbiguous(const char *prevWord)
{
    // Kept in most-likely-first order so common keywords match early.
    static const char * const ambiguousWords[] = {
        "if", "do", "while", "unless", "until", "for"
    };
    const size_t wordCount = sizeof(ambiguousWords) / sizeof(ambiguousWords[0]);

    for (size_t k = 0; k < wordCount; k++) {
        if (strcmp(prevWord, ambiguousWords[k]) == 0) {
            return true;
        }
    }
    return false;
}
1484
// Demote keywords in the following conditions:
// if, while, unless, until modify a statement
// do after a while, until or for, as a noise word (like then after if)
1488
1489 static bool keywordIsModifier(const char *word,
1490                               int pos,
1491                               Accessor &styler)
1492 {
1493     if (word[0] == 'd' && word[1] == 'o' && !word[2]) {
1494         return keywordDoStartsLoop(pos, styler);
1495     }
1496     char ch, chPrev, chPrev2;
1497     int style = SCE_RB_DEFAULT;
1498         int lineStart = styler.GetLine(pos);
1499     int lineStartPosn = styler.LineStart(lineStart);
1500     // We want to step backwards until we don't care about the current
1501     // position. But first move lineStartPosn back behind any
1502     // continuations immediately above word.
1503     while (lineStartPosn > 0) {
1504         ch = styler[lineStartPosn-1];
1505         if (ch == '\n' || ch == '\r') {
1506             chPrev  = styler.SafeGetCharAt(lineStartPosn-2);
1507             chPrev2 = styler.SafeGetCharAt(lineStartPosn-3);
1508             lineStart = styler.GetLine(lineStartPosn-1);
1509             // If we find a continuation line, include it in our analysis.
1510             if (chPrev == '\\') {
1511                 lineStartPosn = styler.LineStart(lineStart);
1512             } else if (ch == '\n' && chPrev == '\r' && chPrev2 == '\\') {
1513                 lineStartPosn = styler.LineStart(lineStart);
1514             } else {
1515                 break;
1516             }
1517         } else {
1518           break;
1519         }
1520     }
1521
1522     styler.Flush();
1523     while (--pos >= lineStartPosn) {
1524         style = actual_style(styler.StyleAt(pos));
1525                 if (style == SCE_RB_DEFAULT) {
1526                         if (iswhitespace(ch = styler[pos])) {
1527                                 //continue
1528                         } else if (ch == '\r' || ch == '\n') {
1529                                 // Scintilla's LineStart() and GetLine() routines aren't
1530                                 // platform-independent, so if we have text prepared with
1531                                 // a different system we can't rely on it.
1532
1533                 // Also, lineStartPosn may have been moved to more than one
1534                 // line above word's line while pushing past continuations.
1535                 chPrev = styler.SafeGetCharAt(pos - 1);
1536                 chPrev2 = styler.SafeGetCharAt(pos - 2);
1537                 if (chPrev == '\\') {
1538                     pos-=1;  // gloss over the "\\"
1539                     //continue
1540                 } else if (ch == '\n' && chPrev == '\r' && chPrev2 == '\\') {
1541                     pos-=2;  // gloss over the "\\\r"
1542                     //continue
1543                 } else {
1544                                     return false;
1545                 }
1546                         }
1547                 } else {
1548             break;
1549                 }
1550     }
1551     if (pos < lineStartPosn) {
1552         return false;
1553     }
1554     // First things where the action is unambiguous
1555     switch (style) {
1556         case SCE_RB_DEFAULT:
1557         case SCE_RB_COMMENTLINE:
1558         case SCE_RB_POD:
1559         case SCE_RB_CLASSNAME:
1560         case SCE_RB_DEFNAME:
1561         case SCE_RB_MODULE_NAME:
1562             return false;
1563         case SCE_RB_OPERATOR:
1564             break;
1565         case SCE_RB_WORD:
1566             // Watch out for uses of 'else if'
1567             //XXX: Make a list of other keywords where 'if' isn't a modifier
1568             //     and can appear legitimately
1569             // Formulate this to avoid warnings from most compilers
1570             if (strcmp(word, "if") == 0) {
1571                 char prevWord[MAX_KEYWORD_LENGTH + 1];
1572                 getPrevWord(pos, prevWord, styler, SCE_RB_WORD);
1573                 return strcmp(prevWord, "else") != 0;
1574             }
1575             return true;
1576         default:
1577             return true;
1578     }
1579     // Assume that if the keyword follows an operator,
1580     // usually it's a block assignment, like
1581     // a << if x then y else z
1582
1583     ch = styler[pos];
1584     switch (ch) {
1585         case ')':
1586         case ']':
1587         case '}':
1588             return true;
1589         default:
1590             return false;
1591     }
1592 }
1593
// The loop keywords "while", "until" and "for" spelled backwards.
// keywordDoStartsLoop() gathers a word's characters while stepping from
// its last character towards its first, so it compares the result
// against these reversed spellings.
#define WHILE_BACKWARDS "elihw"
#define UNTIL_BACKWARDS "litnu"
#define FOR_BACKWARDS "rof"
1597
// Nothing fancy -- look to see if we follow a while/until/for somewhere
// on the current line
1600
1601 static bool keywordDoStartsLoop(int pos,
1602                                 Accessor &styler)
1603 {
1604     char ch;
1605     int style;
1606         int lineStart = styler.GetLine(pos);
1607     int lineStartPosn = styler.LineStart(lineStart);
1608     styler.Flush();
1609     while (--pos >= lineStartPosn) {
1610         style = actual_style(styler.StyleAt(pos));
1611                 if (style == SCE_RB_DEFAULT) {
1612                         if ((ch = styler[pos]) == '\r' || ch == '\n') {
1613                                 // Scintilla's LineStart() and GetLine() routines aren't
1614                                 // platform-independent, so if we have text prepared with
1615                                 // a different system we can't rely on it.
1616                                 return false;
1617                         }
1618                 } else if (style == SCE_RB_WORD) {
1619             // Check for while or until, but write the word in backwards
1620             char prevWord[MAX_KEYWORD_LENGTH + 1]; // 1 byte for zero
1621             char *dst = prevWord;
1622             int wordLen = 0;
1623             int start_word;
1624             for (start_word = pos;
1625                  start_word >= lineStartPosn && actual_style(styler.StyleAt(start_word)) == SCE_RB_WORD;
1626                  start_word--) {
1627                 if (++wordLen < MAX_KEYWORD_LENGTH) {
1628                     *dst++ = styler[start_word];
1629                 }
1630             }
1631             *dst = 0;
1632             // Did we see our keyword?
1633             if (!strcmp(prevWord, WHILE_BACKWARDS)
1634                 || !strcmp(prevWord, UNTIL_BACKWARDS)
1635                 || !strcmp(prevWord, FOR_BACKWARDS)) {
1636                 return true;
1637             }
1638             // We can move pos to the beginning of the keyword, and then
1639             // accept another decrement, as we can never have two contiguous
1640             // keywords:
1641             // word1 word2
1642             //           ^
1643             //        <-  move to start_word
1644             //      ^
1645             //      <- loop decrement
1646             //     ^  # pointing to end of word1 is fine
1647             pos = start_word;
1648         }
1649     }
1650     return false;
1651 }
1652
1653 /*
1654  *  Folding Ruby
1655  *
1656  *  The language is quite complex to analyze without a full parse.
1657  *  For example, this line shouldn't affect fold level:
1658  *
1659  *   print "hello" if feeling_friendly?
1660  *
1661  *  Neither should this:
1662  *
1663  *   print "hello" \
1664  *      if feeling_friendly?
1665  *
1666  *
1667  *  But this should:
1668  *
1669  *   if feeling_friendly?  #++
1670  *     print "hello" \
1671  *     print "goodbye"
1672  *   end                   #--
1673  *
1674  *  So we cheat, by actually looking at the existing indentation
1675  *  levels for each line, and just echoing it back.  Like Python.
1676  *  Then if we get better at it, we'll take braces into consideration,
1677  *  which always affect folding levels.
1678
1679  *  How the keywords should work:
1680  *  No effect:
1681  *  __FILE__ __LINE__ BEGIN END alias and
1682  *  defined? false in nil not or self super then
1683  *  true undef
1684
1685  *  Always increment:
1686  *  begin  class def do for module when {
1687  *
1688  *  Always decrement:
1689  *  end }
1690  *
1691  *  Increment if these start a statement
1692  *  if unless until while -- do nothing if they're modifiers
1693
1694  *  These end a block if there's no modifier, but don't bother
1695  *  break next redo retry return yield
1696  *
1697  *  These temporarily de-indent, but re-indent
1698  *  case else elsif ensure rescue
1699  *
 *  This means that the folder reflects indentation rather
 *  than setting it.  The language-service updates indentation
 *  when users type return and finish entering de-denters.
1703  *
1704  *  Later offer to fold POD, here-docs, strings, and blocks of comments
1705  */
1706
1707 static void FoldRbDoc(unsigned int startPos, int length, int initStyle,
1708                       WordList *[], Accessor &styler) {
1709         const bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0;
1710         bool foldComment = styler.GetPropertyInt("fold.comment") != 0;
1711
1712     synchronizeDocStart(startPos, length, initStyle, styler, // ref args
1713                         false);
1714         unsigned int endPos = startPos + length;
1715         int visibleChars = 0;
1716         int lineCurrent = styler.GetLine(startPos);
1717         int levelPrev = startPos == 0 ? 0 : (styler.LevelAt(lineCurrent)
1718                                          & SC_FOLDLEVELNUMBERMASK
1719                                          & ~SC_FOLDLEVELBASE);
1720         int levelCurrent = levelPrev;
1721         char chNext = styler[startPos];
1722         int styleNext = styler.StyleAt(startPos);
1723         int stylePrev = startPos <= 1 ? SCE_RB_DEFAULT : styler.StyleAt(startPos - 1);
1724     bool buffer_ends_with_eol = false;
1725         for (unsigned int i = startPos; i < endPos; i++) {
1726                 char ch = chNext;
1727                 chNext = styler.SafeGetCharAt(i + 1);
1728                 int style = styleNext;
1729                 styleNext = styler.StyleAt(i + 1);
1730                 bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n');
1731         if (style == SCE_RB_COMMENTLINE) {
1732             if (foldComment && stylePrev != SCE_RB_COMMENTLINE) {
1733                 if (chNext == '{') {
1734                                         levelCurrent++;
1735                                 } else if (chNext == '}' && levelCurrent > 0) {
1736                                         levelCurrent--;
1737                                 }
1738             }
1739         } else if (style == SCE_RB_OPERATOR) {
1740                         if (strchr("[{(", ch)) {
1741                                 levelCurrent++;
1742                         } else if (strchr(")}]", ch)) {
1743                 // Don't decrement below 0
1744                 if (levelCurrent > 0)
1745                     levelCurrent--;
1746                         }
1747         } else if (style == SCE_RB_WORD && styleNext != SCE_RB_WORD) {
1748             // Look at the keyword on the left and decide what to do
1749             char prevWord[MAX_KEYWORD_LENGTH + 1]; // 1 byte for zero
1750             prevWord[0] = 0;
1751             getPrevWord(i, prevWord, styler, SCE_RB_WORD);
1752             if (!strcmp(prevWord, "end")) {
1753                 // Don't decrement below 0
1754                 if (levelCurrent > 0)
1755                     levelCurrent--;
1756             } else if (   !strcmp(prevWord, "if")
1757                        || !strcmp(prevWord, "def")
1758                        || !strcmp(prevWord, "class")
1759                        || !strcmp(prevWord, "module")
1760                        || !strcmp(prevWord, "begin")
1761                        || !strcmp(prevWord, "case")
1762                        || !strcmp(prevWord, "do")
1763                        || !strcmp(prevWord, "while")
1764                        || !strcmp(prevWord, "unless")
1765                        || !strcmp(prevWord, "until")
1766                        || !strcmp(prevWord, "for")
1767                           ) {
1768                                 levelCurrent++;
1769             }
1770                 } else if (style == SCE_RB_HERE_DELIM) {
1771                         if (styler.SafeGetCharAt(i-2) == '<' && styler.SafeGetCharAt(i-1) == '<') {
1772                                 levelCurrent++;
1773                         } else if (styleNext == SCE_RB_DEFAULT) {
1774                                 levelCurrent--;
1775                         }
1776                 }
1777                 if (atEOL) {
1778                         int lev = levelPrev;
1779                         if (visibleChars == 0 && foldCompact)
1780                                 lev |= SC_FOLDLEVELWHITEFLAG;
1781                         if ((levelCurrent > levelPrev) && (visibleChars > 0))
1782                                 lev |= SC_FOLDLEVELHEADERFLAG;
1783             styler.SetLevel(lineCurrent, lev|SC_FOLDLEVELBASE);
1784                         lineCurrent++;
1785                         levelPrev = levelCurrent;
1786                         visibleChars = 0;
1787             buffer_ends_with_eol = true;
1788                 } else if (!isspacechar(ch)) {
1789                         visibleChars++;
1790             buffer_ends_with_eol = false;
1791         }
1792                 stylePrev = style;
1793     }
1794         // Fill in the real level of the next line, keeping the current flags as they will be filled in later
1795     if (!buffer_ends_with_eol) {
1796         lineCurrent++;
1797         int new_lev = levelCurrent;
1798         if (visibleChars == 0 && foldCompact)
1799             new_lev |= SC_FOLDLEVELWHITEFLAG;
1800                         if ((levelCurrent > levelPrev) && (visibleChars > 0))
1801                                 new_lev |= SC_FOLDLEVELHEADERFLAG;
1802             levelCurrent = new_lev;
1803     }
1804         styler.SetLevel(lineCurrent, levelCurrent|SC_FOLDLEVELBASE);
1805 }
1806
// Descriptions of the word lists used by the Ruby lexer; there is a
// single list, "Keywords", and the array ends with a 0 entry.
static const char * const rubyWordListDesc[] = {
        "Keywords",
        0
};

// Lexer module object for Ruby: id SCLEX_RUBY, name "ruby", with
// ColouriseRbDoc as the colouriser, FoldRbDoc as the folder and the
// word-list descriptions above.
LexerModule lmRuby(SCLEX_RUBY, ColouriseRbDoc, "ruby", FoldRbDoc, rubyWordListDesc);