Update Scintilla to version 3.6.3
[geany-mirror.git] / scintilla / lexers / LexPerl.cxx
blob3b10b47e6b4d4e8d92f490370206847ff2aad50a
1 // Scintilla source code edit control
2 /** @file LexPerl.cxx
3 ** Lexer for Perl.
4 ** Converted to lexer object by "Udo Lechner" <dlchnr(at)gmx(dot)net>
5 **/
6 // Copyright 1998-2008 by Neil Hodgson <neilh@scintilla.org>
7 // Lexical analysis fixes by Kein-Hong Man <mkh@pl.jaring.my>
8 // The License.txt file describes the conditions under which this software may be distributed.
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdio.h>
13 #include <stdarg.h>
14 #include <assert.h>
15 #include <ctype.h>
17 #include <string>
18 #include <map>
20 #include "ILexer.h"
21 #include "Scintilla.h"
22 #include "SciLexer.h"
24 #include "WordList.h"
25 #include "LexAccessor.h"
26 #include "StyleContext.h"
27 #include "CharacterSet.h"
28 #include "LexerModule.h"
29 #include "OptionSet.h"
31 #ifdef SCI_NAMESPACE
32 using namespace Scintilla;
33 #endif
35 // Info for HERE document handling from perldata.pod (reformatted):
36 // ----------------------------------------------------------------
37 // A line-oriented form of quoting is based on the shell ``here-doc'' syntax.
38 // Following a << you specify a string to terminate the quoted material, and
39 // all lines following the current line down to the terminating string are
40 // the value of the item.
41 // * The terminating string may be either an identifier (a word), or some
42 // quoted text.
43 // * If quoted, the type of quotes you use determines the treatment of the
44 // text, just as in regular quoting.
45 // * An unquoted identifier works like double quotes.
46 // * There must be no space between the << and the identifier.
47 // (If you put a space it will be treated as a null identifier,
48 // which is valid, and matches the first empty line.)
49 // (This is deprecated, -w warns of this syntax)
50 // * The terminating string must appear by itself (unquoted and
51 // with no surrounding whitespace) on the terminating line.
// Size of the fixed buffer that stores a HERE document delimiter.
#define HERE_DELIM_MAX 256		// maximum length of HERE doc delimiter

// Number sub-states tracked while lexing a numeric literal.
#define PERLNUM_BINARY		1	// order is significant: 1-3 cannot have a dot
#define PERLNUM_OCTAL		2
#define PERLNUM_FLOAT_EXP	3	// exponent part only
#define PERLNUM_HEX			4	// may be a hex float
#define PERLNUM_DECIMAL		5	// 1-5 are numbers; 6-7 are strings
#define PERLNUM_VECTOR		6
#define PERLNUM_V_VECTOR	7
#define PERLNUM_BAD			8

#define BACK_NONE		0	// lookback state for bareword disambiguation:
#define BACK_OPERATOR	1	// whitespace/comments are insignificant
#define BACK_KEYWORD	2	// operators/keywords are needed for disambiguation

#define SUB_BEGIN		0	// states for subroutine prototype scan:
#define SUB_HAS_PROTO	1	// only 'prototype' attribute allows prototypes
#define SUB_HAS_ATTRIB	2	// other attributes can exist leftward
#define SUB_HAS_MODULE	3	// sub name can have a ::identifier part
#define SUB_HAS_SUB		4	// 'sub' keyword

// all interpolated styles are different from their parent styles by a constant difference
// we also assume SCE_PL_STRING_VAR is the interpolated style with the smallest value
#define INTERPOLATE_SHIFT	(SCE_PL_STRING_VAR - SCE_PL_STRING)
78 static bool isPerlKeyword(Sci_PositionU start, Sci_PositionU end, WordList &keywords, LexAccessor &styler) {
79 // old-style keyword matcher; needed because GetCurrent() needs
80 // current segment to be committed, but we may abandon early...
81 char s[100];
82 Sci_PositionU i, len = end - start;
83 if (len > 30) { len = 30; }
84 for (i = 0; i < len; i++, start++) s[i] = styler[start];
85 s[i] = '\0';
86 return keywords.InList(s);
89 static int disambiguateBareword(LexAccessor &styler, Sci_PositionU bk, Sci_PositionU fw,
90 int backFlag, Sci_PositionU backPos, Sci_PositionU endPos) {
91 // identifiers are recognized by Perl as barewords under some
92 // conditions, the following attempts to do the disambiguation
93 // by looking backward and forward; result in 2 LSB
94 int result = 0;
95 bool moreback = false; // true if passed newline/comments
96 bool brace = false; // true if opening brace found
97 // if BACK_NONE, neither operator nor keyword, so skip test
98 if (backFlag == BACK_NONE)
99 return result;
100 // first look backwards past whitespace/comments to set EOL flag
101 // (some disambiguation patterns must be on a single line)
102 if (backPos <= static_cast<Sci_PositionU>(styler.LineStart(styler.GetLine(bk))))
103 moreback = true;
104 // look backwards at last significant lexed item for disambiguation
105 bk = backPos - 1;
106 int ch = static_cast<unsigned char>(styler.SafeGetCharAt(bk));
107 if (ch == '{' && !moreback) {
108 // {bareword: possible variable spec
109 brace = true;
110 } else if ((ch == '&' && styler.SafeGetCharAt(bk - 1) != '&')
111 // &bareword: subroutine call
112 || styler.Match(bk - 1, "->")
113 // ->bareword: part of variable spec
114 || styler.Match(bk - 1, "::")
115 // ::bareword: part of module spec
116 || styler.Match(bk - 2, "sub")) {
117 // sub bareword: subroutine declaration
118 // (implied BACK_KEYWORD, no keywords end in 'sub'!)
119 result |= 1;
121 // next, scan forward after word past tab/spaces only;
122 // if ch isn't one of '[{(,' we can skip the test
123 if ((ch == '{' || ch == '(' || ch == '['|| ch == ',')
124 && fw < endPos) {
125 while (ch = static_cast<unsigned char>(styler.SafeGetCharAt(fw)),
126 IsASpaceOrTab(ch) && fw < endPos) {
127 fw++;
129 if ((ch == '}' && brace)
130 // {bareword}: variable spec
131 || styler.Match(fw, "=>")) {
132 // [{(, bareword=>: hash literal
133 result |= 2;
136 return result;
139 static void skipWhitespaceComment(LexAccessor &styler, Sci_PositionU &p) {
140 // when backtracking, we need to skip whitespace and comments
141 int style;
142 while ((p > 0) && (style = styler.StyleAt(p),
143 style == SCE_PL_DEFAULT || style == SCE_PL_COMMENTLINE))
144 p--;
147 static int findPrevLexeme(LexAccessor &styler, Sci_PositionU &bk, int &style) {
148 // scan backward past whitespace and comments to find a lexeme
149 skipWhitespaceComment(styler, bk);
150 if (bk == 0)
151 return 0;
152 int sz = 1;
153 style = styler.StyleAt(bk);
154 while (bk > 0) { // find extent of lexeme
155 if (styler.StyleAt(bk - 1) == style) {
156 bk--; sz++;
157 } else
158 break;
160 return sz;
163 static int styleBeforeBracePair(LexAccessor &styler, Sci_PositionU bk) {
164 // backtrack to find open '{' corresponding to a '}', balanced
165 // return significant style to be tested for '/' disambiguation
166 int braceCount = 1;
167 if (bk == 0)
168 return SCE_PL_DEFAULT;
169 while (--bk > 0) {
170 if (styler.StyleAt(bk) == SCE_PL_OPERATOR) {
171 int bkch = static_cast<unsigned char>(styler.SafeGetCharAt(bk));
172 if (bkch == ';') { // early out
173 break;
174 } else if (bkch == '}') {
175 braceCount++;
176 } else if (bkch == '{') {
177 if (--braceCount == 0) break;
181 if (bk > 0 && braceCount == 0) {
182 // balanced { found, bk > 0, skip more whitespace/comments
183 bk--;
184 skipWhitespaceComment(styler, bk);
185 return styler.StyleAt(bk);
187 return SCE_PL_DEFAULT;
190 static int styleCheckIdentifier(LexAccessor &styler, Sci_PositionU bk) {
191 // backtrack to classify sub-styles of identifier under test
192 // return sub-style to be tested for '/' disambiguation
193 if (styler.SafeGetCharAt(bk) == '>') // inputsymbol, like <foo>
194 return 1;
195 // backtrack to check for possible "->" or "::" before identifier
196 while (bk > 0 && styler.StyleAt(bk) == SCE_PL_IDENTIFIER) {
197 bk--;
199 while (bk > 0) {
200 int bkstyle = styler.StyleAt(bk);
201 if (bkstyle == SCE_PL_DEFAULT
202 || bkstyle == SCE_PL_COMMENTLINE) {
203 // skip whitespace, comments
204 } else if (bkstyle == SCE_PL_OPERATOR) {
205 // test for "->" and "::"
206 if (styler.Match(bk - 1, "->") || styler.Match(bk - 1, "::"))
207 return 2;
208 } else
209 return 3; // bare identifier
210 bk--;
212 return 0;
215 static int podLineScan(LexAccessor &styler, Sci_PositionU &pos, Sci_PositionU endPos) {
216 // forward scan the current line to classify line for POD style
217 int state = -1;
218 while (pos < endPos) {
219 int ch = static_cast<unsigned char>(styler.SafeGetCharAt(pos));
220 if (ch == '\n' || ch == '\r') {
221 if (ch == '\r' && styler.SafeGetCharAt(pos + 1) == '\n') pos++;
222 break;
224 if (IsASpaceOrTab(ch)) { // whitespace, take note
225 if (state == -1)
226 state = SCE_PL_DEFAULT;
227 } else if (state == SCE_PL_DEFAULT) { // verbatim POD line
228 state = SCE_PL_POD_VERB;
229 } else if (state != SCE_PL_POD_VERB) { // regular POD line
230 state = SCE_PL_POD;
232 pos++;
234 if (state == -1)
235 state = SCE_PL_DEFAULT;
236 return state;
239 static bool styleCheckSubPrototype(LexAccessor &styler, Sci_PositionU bk) {
240 // backtrack to identify if we're starting a subroutine prototype
241 // we also need to ignore whitespace/comments, format is like:
242 // sub abc::pqr :const :prototype(...)
243 // lexemes are tested in pairs, e.g. '::'+'pqr', ':'+'const', etc.
244 // and a state machine generates legal subroutine syntax matches
245 styler.Flush();
246 int state = SUB_BEGIN;
247 do {
248 // find two lexemes, lexeme 2 follows lexeme 1
249 int style2 = SCE_PL_DEFAULT;
250 Sci_PositionU pos2 = bk;
251 int len2 = findPrevLexeme(styler, pos2, style2);
252 int style1 = SCE_PL_DEFAULT;
253 Sci_PositionU pos1 = pos2;
254 if (pos1 > 0) pos1--;
255 int len1 = findPrevLexeme(styler, pos1, style1);
256 if (len1 == 0 || len2 == 0) // lexeme pair must exist
257 break;
259 // match parts of syntax, if invalid subroutine syntax, break off
260 if (style1 == SCE_PL_OPERATOR && len1 == 1 &&
261 styler.SafeGetCharAt(pos1) == ':') { // ':'
262 if (style2 == SCE_PL_IDENTIFIER || style2 == SCE_PL_WORD) {
263 if (len2 == 9 && styler.Match(pos2, "prototype")) { // ':' 'prototype'
264 if (state == SUB_BEGIN) {
265 state = SUB_HAS_PROTO;
266 } else
267 break;
268 } else { // ':' <attribute>
269 if (state == SUB_HAS_PROTO || state == SUB_HAS_ATTRIB) {
270 state = SUB_HAS_ATTRIB;
271 } else
272 break;
274 } else
275 break;
276 } else if (style1 == SCE_PL_OPERATOR && len1 == 2 &&
277 styler.Match(pos1, "::")) { // '::'
278 if (style2 == SCE_PL_IDENTIFIER) { // '::' <identifier>
279 state = SUB_HAS_MODULE;
280 } else
281 break;
282 } else if (style1 == SCE_PL_WORD && len1 == 3 &&
283 styler.Match(pos1, "sub")) { // 'sub'
284 if (style2 == SCE_PL_IDENTIFIER) { // 'sub' <identifier>
285 state = SUB_HAS_SUB;
286 } else
287 break;
288 } else
289 break;
290 bk = pos1; // set position for finding next lexeme pair
291 if (bk > 0) bk--;
292 } while (state != SUB_HAS_SUB);
293 return (state == SUB_HAS_SUB);
296 static int actualNumStyle(int numberStyle) {
297 if (numberStyle == PERLNUM_VECTOR || numberStyle == PERLNUM_V_VECTOR) {
298 return SCE_PL_STRING;
299 } else if (numberStyle == PERLNUM_BAD) {
300 return SCE_PL_ERROR;
302 return SCE_PL_NUMBER;
/**
 * Return the closing delimiter that pairs with an opening delimiter.
 *
 * @param ch an opening delimiter character
 * @return ')', ']', '}' or '>' for '(', '[', '{' or '<' respectively;
 *         any other character is returned unchanged
 */
static int opposite(int ch) {
	if (ch == '(') return ')';
	if (ch == '[') return ']';
	if (ch == '{') return '}';
	if (ch == '<') return '>';
	return ch;
}
313 static bool IsCommentLine(Sci_Position line, LexAccessor &styler) {
314 Sci_Position pos = styler.LineStart(line);
315 Sci_Position eol_pos = styler.LineStart(line + 1) - 1;
316 for (Sci_Position i = pos; i < eol_pos; i++) {
317 char ch = styler[i];
318 int style = styler.StyleAt(i);
319 if (ch == '#' && style == SCE_PL_COMMENTLINE)
320 return true;
321 else if (!IsASpaceOrTab(ch))
322 return false;
324 return false;
327 static bool IsPackageLine(Sci_Position line, LexAccessor &styler) {
328 Sci_Position pos = styler.LineStart(line);
329 int style = styler.StyleAt(pos);
330 if (style == SCE_PL_WORD && styler.Match(pos, "package")) {
331 return true;
333 return false;
336 static int PodHeadingLevel(Sci_Position pos, LexAccessor &styler) {
337 int lvl = static_cast<unsigned char>(styler.SafeGetCharAt(pos + 5));
338 if (lvl >= '1' && lvl <= '4') {
339 return lvl - '0';
341 return 0;
344 // An individual named option for use in an OptionSet
// Options used for LexerPerl
// Each field is bound to a property name by OptionSetPerl; the
// constructor supplies the value used until that property is set.
struct OptionsPerl {
	bool fold;					// "fold"
	bool foldComment;			// "fold.comment"
	bool foldCompact;			// "fold.compact"
	// Custom folding of POD and packages
	bool foldPOD;				// fold.perl.pod
	// Enable folding Pod blocks when using the Perl lexer.
	bool foldPackage;			// fold.perl.package
	// Enable folding packages when using the Perl lexer.

	bool foldCommentExplicit;	// fold.perl.comment.explicit

	bool foldAtElse;			// fold.perl.at.else

	OptionsPerl() {
		fold = false;
		foldComment = false;
		foldCompact = true;
		foldPOD = true;
		foldPackage = true;
		foldCommentExplicit = true;
		foldAtElse = false;
	}
};
// Descriptions of the word lists this lexer accepts, in index order.
// The array is terminated by a null pointer.
static const char *const perlWordListDesc[] = {
	"Keywords",
	0
};
377 struct OptionSetPerl : public OptionSet<OptionsPerl> {
378 OptionSetPerl() {
379 DefineProperty("fold", &OptionsPerl::fold);
381 DefineProperty("fold.comment", &OptionsPerl::foldComment);
383 DefineProperty("fold.compact", &OptionsPerl::foldCompact);
385 DefineProperty("fold.perl.pod", &OptionsPerl::foldPOD,
386 "Set to 0 to disable folding Pod blocks when using the Perl lexer.");
388 DefineProperty("fold.perl.package", &OptionsPerl::foldPackage,
389 "Set to 0 to disable folding packages when using the Perl lexer.");
391 DefineProperty("fold.perl.comment.explicit", &OptionsPerl::foldCommentExplicit,
392 "Set to 0 to disable explicit folding.");
394 DefineProperty("fold.perl.at.else", &OptionsPerl::foldAtElse,
395 "This option enables Perl folding on a \"} else {\" line of an if statement.");
397 DefineWordListSets(perlWordListDesc);
401 class LexerPerl : public ILexer {
402 CharacterSet setWordStart;
403 CharacterSet setWord;
404 CharacterSet setSpecialVar;
405 CharacterSet setControlVar;
406 WordList keywords;
407 OptionsPerl options;
408 OptionSetPerl osPerl;
409 public:
410 LexerPerl() :
411 setWordStart(CharacterSet::setAlpha, "_", 0x80, true),
412 setWord(CharacterSet::setAlphaNum, "_", 0x80, true),
413 setSpecialVar(CharacterSet::setNone, "\"$;<>&`'+,./\\%:=~!?@[]"),
414 setControlVar(CharacterSet::setNone, "ACDEFHILMNOPRSTVWX") {
416 virtual ~LexerPerl() {
418 void SCI_METHOD Release() {
419 delete this;
421 int SCI_METHOD Version() const {
422 return lvOriginal;
424 const char *SCI_METHOD PropertyNames() {
425 return osPerl.PropertyNames();
427 int SCI_METHOD PropertyType(const char *name) {
428 return osPerl.PropertyType(name);
430 const char *SCI_METHOD DescribeProperty(const char *name) {
431 return osPerl.DescribeProperty(name);
433 Sci_Position SCI_METHOD PropertySet(const char *key, const char *val);
434 const char *SCI_METHOD DescribeWordListSets() {
435 return osPerl.DescribeWordListSets();
437 Sci_Position SCI_METHOD WordListSet(int n, const char *wl);
438 void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess);
439 void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess);
441 void *SCI_METHOD PrivateCall(int, void *) {
442 return 0;
445 static ILexer *LexerFactoryPerl() {
446 return new LexerPerl();
448 int InputSymbolScan(StyleContext &sc);
449 void InterpolateSegment(StyleContext &sc, int maxSeg, bool isPattern=false);
452 Sci_Position SCI_METHOD LexerPerl::PropertySet(const char *key, const char *val) {
453 if (osPerl.PropertySet(&options, key, val)) {
454 return 0;
456 return -1;
459 Sci_Position SCI_METHOD LexerPerl::WordListSet(int n, const char *wl) {
460 WordList *wordListN = 0;
461 switch (n) {
462 case 0:
463 wordListN = &keywords;
464 break;
466 Sci_Position firstModification = -1;
467 if (wordListN) {
468 WordList wlNew;
469 wlNew.Set(wl);
470 if (*wordListN != wlNew) {
471 wordListN->Set(wl);
472 firstModification = 0;
475 return firstModification;
478 int LexerPerl::InputSymbolScan(StyleContext &sc) {
479 // forward scan for matching > on same line; file handles
480 int c, sLen = 0;
481 while ((c = sc.GetRelativeCharacter(++sLen)) != 0) {
482 if (c == '\r' || c == '\n') {
483 return 0;
484 } else if (c == '>') {
485 if (sc.Match("<=>")) // '<=>' case
486 return 0;
487 return sLen;
490 return 0;
493 void LexerPerl::InterpolateSegment(StyleContext &sc, int maxSeg, bool isPattern) {
494 // interpolate a segment (with no active backslashes or delimiters within)
495 // switch in or out of an interpolation style or continue current style
496 // commit variable patterns if found, trim segment, repeat until done
497 while (maxSeg > 0) {
498 bool isVar = false;
499 int sLen = 0;
500 if ((maxSeg > 1) && (sc.ch == '$' || sc.ch == '@')) {
501 // $#[$]*word [$@][$]*word (where word or {word} is always present)
502 bool braces = false;
503 sLen = 1;
504 if (sc.ch == '$' && sc.chNext == '#') { // starts with $#
505 sLen++;
507 while ((maxSeg > sLen) && (sc.GetRelativeCharacter(sLen) == '$')) // >0 $ dereference within
508 sLen++;
509 if ((maxSeg > sLen) && (sc.GetRelativeCharacter(sLen) == '{')) { // { start for {word}
510 sLen++;
511 braces = true;
513 if (maxSeg > sLen) {
514 int c = sc.GetRelativeCharacter(sLen);
515 if (setWordStart.Contains(c)) { // word (various)
516 sLen++;
517 isVar = true;
518 while (maxSeg > sLen) {
519 if (!setWord.Contains(sc.GetRelativeCharacter(sLen)))
520 break;
521 sLen++;
523 } else if (braces && IsADigit(c) && (sLen == 2)) { // digit for ${digit}
524 sLen++;
525 isVar = true;
528 if (braces) {
529 if ((maxSeg > sLen) && (sc.GetRelativeCharacter(sLen) == '}')) { // } end for {word}
530 sLen++;
531 } else
532 isVar = false;
535 if (!isVar && (maxSeg > 1)) { // $- or @-specific variable patterns
536 int c = sc.chNext;
537 if (sc.ch == '$') {
538 sLen = 1;
539 if (IsADigit(c)) { // $[0-9] and slurp trailing digits
540 sLen++;
541 isVar = true;
542 while ((maxSeg > sLen) && IsADigit(sc.GetRelativeCharacter(sLen)))
543 sLen++;
544 } else if (setSpecialVar.Contains(c)) { // $ special variables
545 sLen++;
546 isVar = true;
547 } else if (!isPattern && ((c == '(') || (c == ')') || (c == '|'))) { // $ additional
548 sLen++;
549 isVar = true;
550 } else if (c == '^') { // $^A control-char style
551 sLen++;
552 if ((maxSeg > sLen) && setControlVar.Contains(sc.GetRelativeCharacter(sLen))) {
553 sLen++;
554 isVar = true;
557 } else if (sc.ch == '@') {
558 sLen = 1;
559 if (!isPattern && ((c == '+') || (c == '-'))) { // @ specials non-pattern
560 sLen++;
561 isVar = true;
565 if (isVar) { // commit as interpolated variable or normal character
566 if (sc.state < SCE_PL_STRING_VAR)
567 sc.SetState(sc.state + INTERPOLATE_SHIFT);
568 sc.Forward(sLen);
569 maxSeg -= sLen;
570 } else {
571 if (sc.state >= SCE_PL_STRING_VAR)
572 sc.SetState(sc.state - INTERPOLATE_SHIFT);
573 sc.Forward();
574 maxSeg--;
577 if (sc.state >= SCE_PL_STRING_VAR)
578 sc.SetState(sc.state - INTERPOLATE_SHIFT);
581 void SCI_METHOD LexerPerl::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) {
582 LexAccessor styler(pAccess);
584 // keywords that forces /PATTERN/ at all times; should track vim's behaviour
585 WordList reWords;
586 reWords.Set("elsif if split while");
588 // charset classes
589 CharacterSet setSingleCharOp(CharacterSet::setNone, "rwxoRWXOezsfdlpSbctugkTBMAC");
590 // lexing of "%*</" operators is non-trivial; these are missing in the set below
591 CharacterSet setPerlOperator(CharacterSet::setNone, "^&\\()-+=|{}[]:;>,?!.~");
592 CharacterSet setQDelim(CharacterSet::setNone, "qrwx");
593 CharacterSet setModifiers(CharacterSet::setAlpha);
594 CharacterSet setPreferRE(CharacterSet::setNone, "*/<%");
595 // setArray and setHash also accepts chars for special vars like $_,
596 // which are then truncated when the next char does not match setVar
597 CharacterSet setVar(CharacterSet::setAlphaNum, "#$_'", 0x80, true);
598 CharacterSet setArray(CharacterSet::setAlpha, "#$_+-", 0x80, true);
599 CharacterSet setHash(CharacterSet::setAlpha, "#$_!^+-", 0x80, true);
600 CharacterSet &setPOD = setModifiers;
601 CharacterSet setNonHereDoc(CharacterSet::setDigits, "=$@");
602 CharacterSet setHereDocDelim(CharacterSet::setAlphaNum, "_");
603 CharacterSet setSubPrototype(CharacterSet::setNone, "\\[$@%&*+];_ \t");
604 CharacterSet setRepetition(CharacterSet::setDigits, ")\"'");
605 // for format identifiers
606 CharacterSet setFormatStart(CharacterSet::setAlpha, "_=");
607 CharacterSet &setFormat = setHereDocDelim;
609 // Lexer for perl often has to backtrack to start of current style to determine
610 // which characters are being used as quotes, how deeply nested is the
611 // start position and what the termination string is for HERE documents.
613 class HereDocCls { // Class to manage HERE doc sequence
614 public:
615 int State;
616 // 0: '<<' encountered
617 // 1: collect the delimiter
618 // 2: here doc text (lines after the delimiter)
619 int Quote; // the char after '<<'
620 bool Quoted; // true if Quote in ('\'','"','`')
621 int DelimiterLength; // strlen(Delimiter)
622 char Delimiter[HERE_DELIM_MAX]; // the Delimiter
623 HereDocCls() {
624 State = 0;
625 Quote = 0;
626 Quoted = false;
627 DelimiterLength = 0;
628 Delimiter[0] = '\0';
630 void Append(int ch) {
631 Delimiter[DelimiterLength++] = static_cast<char>(ch);
632 Delimiter[DelimiterLength] = '\0';
634 ~HereDocCls() {
637 HereDocCls HereDoc; // TODO: FIFO for stacked here-docs
639 class QuoteCls { // Class to manage quote pairs
640 public:
641 int Rep;
642 int Count;
643 int Up, Down;
644 QuoteCls() {
645 New(1);
647 void New(int r = 1) {
648 Rep = r;
649 Count = 0;
650 Up = '\0';
651 Down = '\0';
653 void Open(int u) {
654 Count++;
655 Up = u;
656 Down = opposite(Up);
659 QuoteCls Quote;
661 // additional state for number lexing
662 int numState = PERLNUM_DECIMAL;
663 int dotCount = 0;
665 Sci_PositionU endPos = startPos + length;
667 // Backtrack to beginning of style if required...
668 // If in a long distance lexical state, backtrack to find quote characters.
669 // Includes strings (may be multi-line), numbers (additional state), format
670 // bodies, as well as POD sections.
671 if (initStyle == SCE_PL_HERE_Q
672 || initStyle == SCE_PL_HERE_QQ
673 || initStyle == SCE_PL_HERE_QX
674 || initStyle == SCE_PL_FORMAT
675 || initStyle == SCE_PL_HERE_QQ_VAR
676 || initStyle == SCE_PL_HERE_QX_VAR
678 // backtrack through multiple styles to reach the delimiter start
679 int delim = (initStyle == SCE_PL_FORMAT) ? SCE_PL_FORMAT_IDENT:SCE_PL_HERE_DELIM;
680 while ((startPos > 1) && (styler.StyleAt(startPos) != delim)) {
681 startPos--;
683 startPos = styler.LineStart(styler.GetLine(startPos));
684 initStyle = styler.StyleAt(startPos - 1);
686 if (initStyle == SCE_PL_STRING
687 || initStyle == SCE_PL_STRING_QQ
688 || initStyle == SCE_PL_BACKTICKS
689 || initStyle == SCE_PL_STRING_QX
690 || initStyle == SCE_PL_REGEX
691 || initStyle == SCE_PL_STRING_QR
692 || initStyle == SCE_PL_REGSUBST
693 || initStyle == SCE_PL_STRING_VAR
694 || initStyle == SCE_PL_STRING_QQ_VAR
695 || initStyle == SCE_PL_BACKTICKS_VAR
696 || initStyle == SCE_PL_STRING_QX_VAR
697 || initStyle == SCE_PL_REGEX_VAR
698 || initStyle == SCE_PL_STRING_QR_VAR
699 || initStyle == SCE_PL_REGSUBST_VAR
701 // for interpolation, must backtrack through a mix of two different styles
702 int otherStyle = (initStyle >= SCE_PL_STRING_VAR) ?
703 initStyle - INTERPOLATE_SHIFT : initStyle + INTERPOLATE_SHIFT;
704 while (startPos > 1) {
705 int st = styler.StyleAt(startPos - 1);
706 if ((st != initStyle) && (st != otherStyle))
707 break;
708 startPos--;
710 initStyle = SCE_PL_DEFAULT;
711 } else if (initStyle == SCE_PL_STRING_Q
712 || initStyle == SCE_PL_STRING_QW
713 || initStyle == SCE_PL_XLAT
714 || initStyle == SCE_PL_CHARACTER
715 || initStyle == SCE_PL_NUMBER
716 || initStyle == SCE_PL_IDENTIFIER
717 || initStyle == SCE_PL_ERROR
718 || initStyle == SCE_PL_SUB_PROTOTYPE
720 while ((startPos > 1) && (styler.StyleAt(startPos - 1) == initStyle)) {
721 startPos--;
723 initStyle = SCE_PL_DEFAULT;
724 } else if (initStyle == SCE_PL_POD
725 || initStyle == SCE_PL_POD_VERB
727 // POD backtracking finds preceding blank lines and goes back past them
728 Sci_Position ln = styler.GetLine(startPos);
729 if (ln > 0) {
730 initStyle = styler.StyleAt(styler.LineStart(--ln));
731 if (initStyle == SCE_PL_POD || initStyle == SCE_PL_POD_VERB) {
732 while (ln > 0 && styler.GetLineState(ln) == SCE_PL_DEFAULT)
733 ln--;
735 startPos = styler.LineStart(++ln);
736 initStyle = styler.StyleAt(startPos - 1);
737 } else {
738 startPos = 0;
739 initStyle = SCE_PL_DEFAULT;
743 // backFlag, backPos are additional state to aid identifier corner cases.
744 // Look backwards past whitespace and comments in order to detect either
745 // operator or keyword. Later updated as we go along.
746 int backFlag = BACK_NONE;
747 Sci_PositionU backPos = startPos;
748 if (backPos > 0) {
749 backPos--;
750 skipWhitespaceComment(styler, backPos);
751 if (styler.StyleAt(backPos) == SCE_PL_OPERATOR)
752 backFlag = BACK_OPERATOR;
753 else if (styler.StyleAt(backPos) == SCE_PL_WORD)
754 backFlag = BACK_KEYWORD;
755 backPos++;
758 StyleContext sc(startPos, endPos - startPos, initStyle, styler, static_cast<char>(STYLE_MAX));
760 for (; sc.More(); sc.Forward()) {
762 // Determine if the current state should terminate.
763 switch (sc.state) {
764 case SCE_PL_OPERATOR:
765 sc.SetState(SCE_PL_DEFAULT);
766 backFlag = BACK_OPERATOR;
767 backPos = sc.currentPos;
768 break;
769 case SCE_PL_IDENTIFIER: // identifier, bareword, inputsymbol
770 if ((!setWord.Contains(sc.ch) && sc.ch != '\'')
771 || sc.Match('.', '.')
772 || sc.chPrev == '>') { // end of inputsymbol
773 sc.SetState(SCE_PL_DEFAULT);
775 break;
776 case SCE_PL_WORD: // keyword, plus special cases
777 if (!setWord.Contains(sc.ch)) {
778 char s[100];
779 sc.GetCurrent(s, sizeof(s));
780 if ((strcmp(s, "__DATA__") == 0) || (strcmp(s, "__END__") == 0)) {
781 sc.ChangeState(SCE_PL_DATASECTION);
782 } else {
783 if ((strcmp(s, "format") == 0)) {
784 sc.SetState(SCE_PL_FORMAT_IDENT);
785 HereDoc.State = 0;
786 } else {
787 sc.SetState(SCE_PL_DEFAULT);
789 backFlag = BACK_KEYWORD;
790 backPos = sc.currentPos;
793 break;
794 case SCE_PL_SCALAR:
795 case SCE_PL_ARRAY:
796 case SCE_PL_HASH:
797 case SCE_PL_SYMBOLTABLE:
798 if (sc.Match(':', ':')) { // skip ::
799 sc.Forward();
800 } else if (!setVar.Contains(sc.ch)) {
801 if (sc.LengthCurrent() == 1) {
802 // Special variable: $(, $_ etc.
803 sc.Forward();
805 sc.SetState(SCE_PL_DEFAULT);
807 break;
808 case SCE_PL_NUMBER:
809 // if no early break, number style is terminated at "(go through)"
810 if (sc.ch == '.') {
811 if (sc.chNext == '.') {
812 // double dot is always an operator (go through)
813 } else if (numState <= PERLNUM_FLOAT_EXP) {
814 // non-decimal number or float exponent, consume next dot
815 sc.SetState(SCE_PL_OPERATOR);
816 break;
817 } else { // decimal or vectors allows dots
818 dotCount++;
819 if (numState == PERLNUM_DECIMAL) {
820 if (dotCount <= 1) // number with one dot in it
821 break;
822 if (IsADigit(sc.chNext)) { // really a vector
823 numState = PERLNUM_VECTOR;
824 break;
826 // number then dot (go through)
827 } else if (numState == PERLNUM_HEX) {
828 if (dotCount <= 1 && IsADigit(sc.chNext, 16)) {
829 break; // hex with one dot is a hex float
830 } else {
831 sc.SetState(SCE_PL_OPERATOR);
832 break;
834 // hex then dot (go through)
835 } else if (IsADigit(sc.chNext)) // vectors
836 break;
837 // vector then dot (go through)
839 } else if (sc.ch == '_') {
840 // permissive underscoring for number and vector literals
841 break;
842 } else if (numState == PERLNUM_DECIMAL) {
843 if (sc.ch == 'E' || sc.ch == 'e') { // exponent, sign
844 numState = PERLNUM_FLOAT_EXP;
845 if (sc.chNext == '+' || sc.chNext == '-') {
846 sc.Forward();
848 break;
849 } else if (IsADigit(sc.ch))
850 break;
851 // number then word (go through)
852 } else if (numState == PERLNUM_HEX) {
853 if (sc.ch == 'P' || sc.ch == 'p') { // hex float exponent, sign
854 numState = PERLNUM_FLOAT_EXP;
855 if (sc.chNext == '+' || sc.chNext == '-') {
856 sc.Forward();
858 break;
859 } else if (IsADigit(sc.ch, 16))
860 break;
861 // hex or hex float then word (go through)
862 } else if (numState == PERLNUM_VECTOR || numState == PERLNUM_V_VECTOR) {
863 if (IsADigit(sc.ch)) // vector
864 break;
865 if (setWord.Contains(sc.ch) && dotCount == 0) { // change to word
866 sc.ChangeState(SCE_PL_IDENTIFIER);
867 break;
869 // vector then word (go through)
870 } else if (IsADigit(sc.ch)) {
871 if (numState == PERLNUM_FLOAT_EXP) {
872 break;
873 } else if (numState == PERLNUM_OCTAL) {
874 if (sc.ch <= '7') break;
875 } else if (numState == PERLNUM_BINARY) {
876 if (sc.ch <= '1') break;
878 // mark invalid octal, binary numbers (go through)
879 numState = PERLNUM_BAD;
880 break;
882 // complete current number or vector
883 sc.ChangeState(actualNumStyle(numState));
884 sc.SetState(SCE_PL_DEFAULT);
885 break;
886 case SCE_PL_COMMENTLINE:
887 if (sc.atLineEnd) {
888 sc.SetState(SCE_PL_DEFAULT);
890 break;
891 case SCE_PL_HERE_DELIM:
892 if (HereDoc.State == 0) { // '<<' encountered
893 int delim_ch = sc.chNext;
894 Sci_Position ws_skip = 0;
895 HereDoc.State = 1; // pre-init HERE doc class
896 HereDoc.Quote = sc.chNext;
897 HereDoc.Quoted = false;
898 HereDoc.DelimiterLength = 0;
899 HereDoc.Delimiter[HereDoc.DelimiterLength] = '\0';
900 if (IsASpaceOrTab(delim_ch)) {
901 // skip whitespace; legal only for quoted delimiters
902 Sci_PositionU i = sc.currentPos + 1;
903 while ((i < endPos) && IsASpaceOrTab(delim_ch)) {
904 i++;
905 delim_ch = static_cast<unsigned char>(styler.SafeGetCharAt(i));
907 ws_skip = i - sc.currentPos - 1;
909 if (delim_ch == '\'' || delim_ch == '"' || delim_ch == '`') {
910 // a quoted here-doc delimiter; skip any whitespace
911 sc.Forward(ws_skip + 1);
912 HereDoc.Quote = delim_ch;
913 HereDoc.Quoted = true;
914 } else if ((ws_skip == 0 && setNonHereDoc.Contains(sc.chNext))
915 || ws_skip > 0) {
916 // left shift << or <<= operator cases
917 // restore position if operator
918 sc.ChangeState(SCE_PL_OPERATOR);
919 sc.ForwardSetState(SCE_PL_DEFAULT);
920 backFlag = BACK_OPERATOR;
921 backPos = sc.currentPos;
922 HereDoc.State = 0;
923 } else {
924 // specially handle initial '\' for identifier
925 if (ws_skip == 0 && HereDoc.Quote == '\\')
926 sc.Forward();
927 // an unquoted here-doc delimiter, no special handling
928 // (cannot be prefixed by spaces/tabs), or
929 // symbols terminates; deprecated zero-length delimiter
931 } else if (HereDoc.State == 1) { // collect the delimiter
932 backFlag = BACK_NONE;
933 if (HereDoc.Quoted) { // a quoted here-doc delimiter
934 if (sc.ch == HereDoc.Quote) { // closing quote => end of delimiter
935 sc.ForwardSetState(SCE_PL_DEFAULT);
936 } else if (!sc.atLineEnd) {
937 if (sc.Match('\\', static_cast<char>(HereDoc.Quote))) { // escaped quote
938 sc.Forward();
940 if (sc.ch != '\r') { // skip CR if CRLF
941 int i = 0; // else append char, possibly an extended char
942 while (i < sc.width) {
943 HereDoc.Append(static_cast<unsigned char>(styler.SafeGetCharAt(sc.currentPos + i)));
944 i++;
948 } else { // an unquoted here-doc delimiter, no extended charsets
949 if (setHereDocDelim.Contains(sc.ch)) {
950 HereDoc.Append(sc.ch);
951 } else {
952 sc.SetState(SCE_PL_DEFAULT);
955 if (HereDoc.DelimiterLength >= HERE_DELIM_MAX - 1) {
956 sc.SetState(SCE_PL_ERROR);
957 HereDoc.State = 0;
960 break;
961 case SCE_PL_HERE_Q:
962 case SCE_PL_HERE_QQ:
963 case SCE_PL_HERE_QX:
964 // also implies HereDoc.State == 2
965 sc.Complete();
966 if (HereDoc.DelimiterLength == 0 || sc.Match(HereDoc.Delimiter)) {
967 int c = sc.GetRelative(HereDoc.DelimiterLength);
968 if (c == '\r' || c == '\n') { // peek first, do not consume match
969 sc.ForwardBytes(HereDoc.DelimiterLength);
970 sc.SetState(SCE_PL_DEFAULT);
971 backFlag = BACK_NONE;
972 HereDoc.State = 0;
973 if (!sc.atLineEnd)
974 sc.Forward();
975 break;
978 if (sc.state == SCE_PL_HERE_Q) { // \EOF and 'EOF' non-interpolated
979 while (!sc.atLineEnd)
980 sc.Forward();
981 break;
983 while (!sc.atLineEnd) { // "EOF" and `EOF` interpolated
984 int c, sLen = 0, endType = 0;
985 while ((c = sc.GetRelativeCharacter(sLen)) != 0) {
986 // scan to break string into segments
987 if (c == '\\') {
988 endType = 1; break;
989 } else if (c == '\r' || c == '\n') {
990 endType = 2; break;
992 sLen++;
994 if (sLen > 0) // process non-empty segments
995 InterpolateSegment(sc, sLen);
996 if (endType == 1) {
997 sc.Forward();
998 // \ at end-of-line does not appear to have any effect, skip
999 if (sc.ch != '\r' && sc.ch != '\n')
1000 sc.Forward();
1001 } else if (endType == 2) {
1002 if (!sc.atLineEnd)
1003 sc.Forward();
1006 break;
1007 case SCE_PL_POD:
1008 case SCE_PL_POD_VERB: {
1009 Sci_PositionU fw = sc.currentPos;
1010 Sci_Position ln = styler.GetLine(fw);
1011 if (sc.atLineStart && sc.Match("=cut")) { // end of POD
1012 sc.SetState(SCE_PL_POD);
1013 sc.Forward(4);
1014 sc.SetState(SCE_PL_DEFAULT);
1015 styler.SetLineState(ln, SCE_PL_POD);
1016 break;
1018 int pod = podLineScan(styler, fw, endPos); // classify POD line
1019 styler.SetLineState(ln, pod);
1020 if (pod == SCE_PL_DEFAULT) {
1021 if (sc.state == SCE_PL_POD_VERB) {
1022 Sci_PositionU fw2 = fw;
1023 while (fw2 < (endPos - 1) && pod == SCE_PL_DEFAULT) {
1024 fw = fw2++; // penultimate line (last blank line)
1025 pod = podLineScan(styler, fw2, endPos);
1026 styler.SetLineState(styler.GetLine(fw2), pod);
1028 if (pod == SCE_PL_POD) { // truncate verbatim POD early
1029 sc.SetState(SCE_PL_POD);
1030 } else
1031 fw = fw2;
1033 } else {
1034 if (pod == SCE_PL_POD_VERB // still part of current paragraph
1035 && (styler.GetLineState(ln - 1) == SCE_PL_POD)) {
1036 pod = SCE_PL_POD;
1037 styler.SetLineState(ln, pod);
1038 } else if (pod == SCE_PL_POD
1039 && (styler.GetLineState(ln - 1) == SCE_PL_POD_VERB)) {
1040 pod = SCE_PL_POD_VERB;
1041 styler.SetLineState(ln, pod);
1043 sc.SetState(pod);
1045 sc.ForwardBytes(fw - sc.currentPos); // commit style
1047 break;
1048 case SCE_PL_REGEX:
1049 case SCE_PL_STRING_QR:
1050 if (Quote.Rep <= 0) {
1051 if (!setModifiers.Contains(sc.ch))
1052 sc.SetState(SCE_PL_DEFAULT);
1053 } else if (!Quote.Up && !IsASpace(sc.ch)) {
1054 Quote.Open(sc.ch);
1055 } else {
1056 int c, sLen = 0, endType = 0;
1057 while ((c = sc.GetRelativeCharacter(sLen)) != 0) {
1058 // scan to break string into segments
1059 if (IsASpace(c)) {
1060 break;
1061 } else if (c == '\\' && Quote.Up != '\\') {
1062 endType = 1; break;
1063 } else if (c == Quote.Down) {
1064 Quote.Count--;
1065 if (Quote.Count == 0) {
1066 Quote.Rep--;
1067 break;
1069 } else if (c == Quote.Up)
1070 Quote.Count++;
1071 sLen++;
1073 if (sLen > 0) { // process non-empty segments
1074 if (Quote.Up != '\'') {
1075 InterpolateSegment(sc, sLen, true);
1076 } else // non-interpolated path
1077 sc.Forward(sLen);
1079 if (endType == 1)
1080 sc.Forward();
1082 break;
1083 case SCE_PL_REGSUBST:
1084 case SCE_PL_XLAT:
1085 if (Quote.Rep <= 0) {
1086 if (!setModifiers.Contains(sc.ch))
1087 sc.SetState(SCE_PL_DEFAULT);
1088 } else if (!Quote.Up && !IsASpace(sc.ch)) {
1089 Quote.Open(sc.ch);
1090 } else {
1091 int c, sLen = 0, endType = 0;
1092 bool isPattern = (Quote.Rep == 2);
1093 while ((c = sc.GetRelativeCharacter(sLen)) != 0) {
1094 // scan to break string into segments
1095 if (c == '\\' && Quote.Up != '\\') {
1096 endType = 2; break;
1097 } else if (Quote.Count == 0 && Quote.Rep == 1) {
1098 // We matched something like s(...) or tr{...}, Perl 5.10
1099 // appears to allow almost any character for use as the
1100 // next delimiters. Whitespace and comments are accepted in
1101 // between, but we'll limit to whitespace here.
1102 // For '#', if no whitespace in between, it's a delimiter.
1103 if (IsASpace(c)) {
1104 // Keep going
1105 } else if (c == '#' && IsASpaceOrTab(sc.GetRelativeCharacter(sLen - 1))) {
1106 endType = 3;
1107 } else
1108 Quote.Open(c);
1109 break;
1110 } else if (c == Quote.Down) {
1111 Quote.Count--;
1112 if (Quote.Count == 0) {
1113 Quote.Rep--;
1114 endType = 1;
1116 if (Quote.Up == Quote.Down)
1117 Quote.Count++;
1118 if (endType == 1)
1119 break;
1120 } else if (c == Quote.Up) {
1121 Quote.Count++;
1122 } else if (IsASpace(c))
1123 break;
1124 sLen++;
1126 if (sLen > 0) { // process non-empty segments
1127 if (sc.state == SCE_PL_REGSUBST && Quote.Up != '\'') {
1128 InterpolateSegment(sc, sLen, isPattern);
1129 } else // non-interpolated path
1130 sc.Forward(sLen);
1132 if (endType == 2) {
1133 sc.Forward();
1134 } else if (endType == 3)
1135 sc.SetState(SCE_PL_DEFAULT);
1137 break;
1138 case SCE_PL_STRING_Q:
1139 case SCE_PL_STRING_QQ:
1140 case SCE_PL_STRING_QX:
1141 case SCE_PL_STRING_QW:
1142 case SCE_PL_STRING:
1143 case SCE_PL_CHARACTER:
1144 case SCE_PL_BACKTICKS:
1145 if (!Quote.Down && !IsASpace(sc.ch)) {
1146 Quote.Open(sc.ch);
1147 } else {
1148 int c, sLen = 0, endType = 0;
1149 while ((c = sc.GetRelativeCharacter(sLen)) != 0) {
1150 // scan to break string into segments
1151 if (IsASpace(c)) {
1152 break;
1153 } else if (c == '\\' && Quote.Up != '\\') {
1154 endType = 2; break;
1155 } else if (c == Quote.Down) {
1156 Quote.Count--;
1157 if (Quote.Count == 0) {
1158 endType = 3; break;
1160 } else if (c == Quote.Up)
1161 Quote.Count++;
1162 sLen++;
1164 if (sLen > 0) { // process non-empty segments
1165 switch (sc.state) {
1166 case SCE_PL_STRING:
1167 case SCE_PL_STRING_QQ:
1168 case SCE_PL_BACKTICKS:
1169 InterpolateSegment(sc, sLen);
1170 break;
1171 case SCE_PL_STRING_QX:
1172 if (Quote.Up != '\'') {
1173 InterpolateSegment(sc, sLen);
1174 break;
1176 // (continued for ' delim)
1177 default: // non-interpolated path
1178 sc.Forward(sLen);
1181 if (endType == 2) {
1182 sc.Forward();
1183 } else if (endType == 3)
1184 sc.ForwardSetState(SCE_PL_DEFAULT);
1186 break;
1187 case SCE_PL_SUB_PROTOTYPE: {
1188 int i = 0;
1189 // forward scan; must all be valid proto characters
1190 while (setSubPrototype.Contains(sc.GetRelative(i)))
1191 i++;
1192 if (sc.GetRelative(i) == ')') { // valid sub prototype
1193 sc.ForwardBytes(i);
1194 sc.ForwardSetState(SCE_PL_DEFAULT);
1195 } else {
1196 // abandon prototype, restart from '('
1197 sc.ChangeState(SCE_PL_OPERATOR);
1198 sc.SetState(SCE_PL_DEFAULT);
1201 break;
1202 case SCE_PL_FORMAT: {
1203 sc.Complete();
1204 if (sc.Match('.')) {
1205 sc.Forward();
1206 if (sc.atLineEnd || ((sc.ch == '\r' && sc.chNext == '\n')))
1207 sc.SetState(SCE_PL_DEFAULT);
1209 while (!sc.atLineEnd)
1210 sc.Forward();
1212 break;
1213 case SCE_PL_ERROR:
1214 break;
1216 // Needed for specific continuation styles (one follows the other)
1217 switch (sc.state) {
1218 // continued from SCE_PL_WORD
1219 case SCE_PL_FORMAT_IDENT:
1220 // occupies HereDoc state 3 to avoid clashing with HERE docs
1221 if (IsASpaceOrTab(sc.ch)) { // skip whitespace
1222 sc.ChangeState(SCE_PL_DEFAULT);
1223 while (IsASpaceOrTab(sc.ch) && !sc.atLineEnd)
1224 sc.Forward();
1225 sc.SetState(SCE_PL_FORMAT_IDENT);
1227 if (setFormatStart.Contains(sc.ch)) { // identifier or '='
1228 if (sc.ch != '=') {
1229 do {
1230 sc.Forward();
1231 } while (setFormat.Contains(sc.ch));
1233 while (IsASpaceOrTab(sc.ch) && !sc.atLineEnd)
1234 sc.Forward();
1235 if (sc.ch == '=') {
1236 sc.ForwardSetState(SCE_PL_DEFAULT);
1237 HereDoc.State = 3;
1238 } else {
1239 // invalid identifier; inexact fallback, but hey
1240 sc.ChangeState(SCE_PL_IDENTIFIER);
1241 sc.SetState(SCE_PL_DEFAULT);
1243 } else {
1244 sc.ChangeState(SCE_PL_DEFAULT); // invalid identifier
1246 backFlag = BACK_NONE;
1247 break;
1250 // Must check end of HereDoc states here before default state is handled
1251 if (HereDoc.State == 1 && sc.atLineEnd) {
1252 // Begin of here-doc (the line after the here-doc delimiter):
1253 // Lexically, the here-doc starts from the next line after the >>, but the
1254 // first line of here-doc seem to follow the style of the last EOL sequence
1255 int st_new = SCE_PL_HERE_QQ;
1256 HereDoc.State = 2;
1257 if (HereDoc.Quoted) {
1258 if (sc.state == SCE_PL_HERE_DELIM) {
1259 // Missing quote at end of string! We are stricter than perl.
1260 // Colour here-doc anyway while marking this bit as an error.
1261 sc.ChangeState(SCE_PL_ERROR);
1263 switch (HereDoc.Quote) {
1264 case '\'':
1265 st_new = SCE_PL_HERE_Q;
1266 break;
1267 case '"' :
1268 st_new = SCE_PL_HERE_QQ;
1269 break;
1270 case '`' :
1271 st_new = SCE_PL_HERE_QX;
1272 break;
1274 } else {
1275 if (HereDoc.Quote == '\\')
1276 st_new = SCE_PL_HERE_Q;
1278 sc.SetState(st_new);
1280 if (HereDoc.State == 3 && sc.atLineEnd) {
1281 // Start of format body.
1282 HereDoc.State = 0;
1283 sc.SetState(SCE_PL_FORMAT);
1286 // Determine if a new state should be entered.
1287 if (sc.state == SCE_PL_DEFAULT) {
1288 if (IsADigit(sc.ch) ||
1289 (IsADigit(sc.chNext) && (sc.ch == '.' || sc.ch == 'v'))) {
1290 sc.SetState(SCE_PL_NUMBER);
1291 backFlag = BACK_NONE;
1292 numState = PERLNUM_DECIMAL;
1293 dotCount = 0;
1294 if (sc.ch == '0') { // hex,bin,octal
1295 if (sc.chNext == 'x' || sc.chNext == 'X') {
1296 numState = PERLNUM_HEX;
1297 } else if (sc.chNext == 'b' || sc.chNext == 'B') {
1298 numState = PERLNUM_BINARY;
1299 } else if (IsADigit(sc.chNext)) {
1300 numState = PERLNUM_OCTAL;
1302 if (numState != PERLNUM_DECIMAL) {
1303 sc.Forward();
1305 } else if (sc.ch == 'v') { // vector
1306 numState = PERLNUM_V_VECTOR;
1308 } else if (setWord.Contains(sc.ch)) {
1309 // if immediately prefixed by '::', always a bareword
1310 sc.SetState(SCE_PL_WORD);
1311 if (sc.chPrev == ':' && sc.GetRelative(-2) == ':') {
1312 sc.ChangeState(SCE_PL_IDENTIFIER);
1314 Sci_PositionU bk = sc.currentPos;
1315 Sci_PositionU fw = sc.currentPos + 1;
1316 // first check for possible quote-like delimiter
1317 if (sc.ch == 's' && !setWord.Contains(sc.chNext)) {
1318 sc.ChangeState(SCE_PL_REGSUBST);
1319 Quote.New(2);
1320 } else if (sc.ch == 'm' && !setWord.Contains(sc.chNext)) {
1321 sc.ChangeState(SCE_PL_REGEX);
1322 Quote.New();
1323 } else if (sc.ch == 'q' && !setWord.Contains(sc.chNext)) {
1324 sc.ChangeState(SCE_PL_STRING_Q);
1325 Quote.New();
1326 } else if (sc.ch == 'y' && !setWord.Contains(sc.chNext)) {
1327 sc.ChangeState(SCE_PL_XLAT);
1328 Quote.New(2);
1329 } else if (sc.Match('t', 'r') && !setWord.Contains(sc.GetRelative(2))) {
1330 sc.ChangeState(SCE_PL_XLAT);
1331 Quote.New(2);
1332 sc.Forward();
1333 fw++;
1334 } else if (sc.ch == 'q' && setQDelim.Contains(sc.chNext)
1335 && !setWord.Contains(sc.GetRelative(2))) {
1336 if (sc.chNext == 'q') sc.ChangeState(SCE_PL_STRING_QQ);
1337 else if (sc.chNext == 'x') sc.ChangeState(SCE_PL_STRING_QX);
1338 else if (sc.chNext == 'r') sc.ChangeState(SCE_PL_STRING_QR);
1339 else sc.ChangeState(SCE_PL_STRING_QW); // sc.chNext == 'w'
1340 Quote.New();
1341 sc.Forward();
1342 fw++;
1343 } else if (sc.ch == 'x' && (sc.chNext == '=' || // repetition
1344 !setWord.Contains(sc.chNext) ||
1345 (setRepetition.Contains(sc.chPrev) && IsADigit(sc.chNext)))) {
1346 sc.ChangeState(SCE_PL_OPERATOR);
1348 // if potentially a keyword, scan forward and grab word, then check
1349 // if it's really one; if yes, disambiguation test is performed
1350 // otherwise it is always a bareword and we skip a lot of scanning
1351 if (sc.state == SCE_PL_WORD) {
1352 while (setWord.Contains(static_cast<unsigned char>(styler.SafeGetCharAt(fw))))
1353 fw++;
1354 if (!isPerlKeyword(styler.GetStartSegment(), fw, keywords, styler)) {
1355 sc.ChangeState(SCE_PL_IDENTIFIER);
1358 // if already SCE_PL_IDENTIFIER, then no ambiguity, skip this
1359 // for quote-like delimiters/keywords, attempt to disambiguate
1360 // to select for bareword, change state -> SCE_PL_IDENTIFIER
1361 if (sc.state != SCE_PL_IDENTIFIER && bk > 0) {
1362 if (disambiguateBareword(styler, bk, fw, backFlag, backPos, endPos))
1363 sc.ChangeState(SCE_PL_IDENTIFIER);
1365 backFlag = BACK_NONE;
1366 } else if (sc.ch == '#') {
1367 sc.SetState(SCE_PL_COMMENTLINE);
1368 } else if (sc.ch == '\"') {
1369 sc.SetState(SCE_PL_STRING);
1370 Quote.New();
1371 Quote.Open(sc.ch);
1372 backFlag = BACK_NONE;
1373 } else if (sc.ch == '\'') {
1374 if (sc.chPrev == '&' && setWordStart.Contains(sc.chNext)) {
1375 // Archaic call
1376 sc.SetState(SCE_PL_IDENTIFIER);
1377 } else {
1378 sc.SetState(SCE_PL_CHARACTER);
1379 Quote.New();
1380 Quote.Open(sc.ch);
1382 backFlag = BACK_NONE;
1383 } else if (sc.ch == '`') {
1384 sc.SetState(SCE_PL_BACKTICKS);
1385 Quote.New();
1386 Quote.Open(sc.ch);
1387 backFlag = BACK_NONE;
1388 } else if (sc.ch == '$') {
1389 sc.SetState(SCE_PL_SCALAR);
1390 if (sc.chNext == '{') {
1391 sc.ForwardSetState(SCE_PL_OPERATOR);
1392 } else if (IsASpace(sc.chNext)) {
1393 sc.ForwardSetState(SCE_PL_DEFAULT);
1394 } else {
1395 sc.Forward();
1396 if (sc.Match('`', '`') || sc.Match(':', ':')) {
1397 sc.Forward();
1400 backFlag = BACK_NONE;
1401 } else if (sc.ch == '@') {
1402 sc.SetState(SCE_PL_ARRAY);
1403 if (setArray.Contains(sc.chNext)) {
1404 // no special treatment
1405 } else if (sc.chNext == ':' && sc.GetRelative(2) == ':') {
1406 sc.ForwardBytes(2);
1407 } else if (sc.chNext == '{' || sc.chNext == '[') {
1408 sc.ForwardSetState(SCE_PL_OPERATOR);
1409 } else {
1410 sc.ChangeState(SCE_PL_OPERATOR);
1412 backFlag = BACK_NONE;
1413 } else if (setPreferRE.Contains(sc.ch)) {
1414 // Explicit backward peeking to set a consistent preferRE for
1415 // any slash found, so no longer need to track preferRE state.
1416 // Find first previous significant lexed element and interpret.
1417 // A few symbols shares this code for disambiguation.
1418 bool preferRE = false;
1419 bool isHereDoc = sc.Match('<', '<');
1420 bool hereDocSpace = false; // for: SCALAR [whitespace] '<<'
1421 Sci_PositionU bk = (sc.currentPos > 0) ? sc.currentPos - 1: 0;
1422 sc.Complete();
1423 styler.Flush();
1424 if (styler.StyleAt(bk) == SCE_PL_DEFAULT)
1425 hereDocSpace = true;
1426 skipWhitespaceComment(styler, bk);
1427 if (bk == 0) {
1428 // avoid backward scanning breakage
1429 preferRE = true;
1430 } else {
1431 int bkstyle = styler.StyleAt(bk);
1432 int bkch = static_cast<unsigned char>(styler.SafeGetCharAt(bk));
1433 switch (bkstyle) {
1434 case SCE_PL_OPERATOR:
1435 preferRE = true;
1436 if (bkch == ')' || bkch == ']') {
1437 preferRE = false;
1438 } else if (bkch == '}') {
1439 // backtrack by counting balanced brace pairs
1440 // needed to test for variables like ${}, @{} etc.
1441 bkstyle = styleBeforeBracePair(styler, bk);
1442 if (bkstyle == SCE_PL_SCALAR
1443 || bkstyle == SCE_PL_ARRAY
1444 || bkstyle == SCE_PL_HASH
1445 || bkstyle == SCE_PL_SYMBOLTABLE
1446 || bkstyle == SCE_PL_OPERATOR) {
1447 preferRE = false;
1449 } else if (bkch == '+' || bkch == '-') {
1450 if (bkch == static_cast<unsigned char>(styler.SafeGetCharAt(bk - 1))
1451 && bkch != static_cast<unsigned char>(styler.SafeGetCharAt(bk - 2)))
1452 // exceptions for operators: unary suffixes ++, --
1453 preferRE = false;
1455 break;
1456 case SCE_PL_IDENTIFIER:
1457 preferRE = true;
1458 bkstyle = styleCheckIdentifier(styler, bk);
1459 if ((bkstyle == 1) || (bkstyle == 2)) {
1460 // inputsymbol or var with "->" or "::" before identifier
1461 preferRE = false;
1462 } else if (bkstyle == 3) {
1463 // bare identifier, test cases follows:
1464 if (sc.ch == '/') {
1465 // if '/', /PATTERN/ unless digit/space immediately after '/'
1466 // if '//', always expect defined-or operator to follow identifier
1467 if (IsASpace(sc.chNext) || IsADigit(sc.chNext) || sc.chNext == '/')
1468 preferRE = false;
1469 } else if (sc.ch == '*' || sc.ch == '%') {
1470 if (IsASpace(sc.chNext) || IsADigit(sc.chNext) || sc.Match('*', '*'))
1471 preferRE = false;
1472 } else if (sc.ch == '<') {
1473 if (IsASpace(sc.chNext) || sc.chNext == '=')
1474 preferRE = false;
1477 break;
1478 case SCE_PL_SCALAR: // for $var<< case:
1479 if (isHereDoc && hereDocSpace) // if SCALAR whitespace '<<', *always* a HERE doc
1480 preferRE = true;
1481 break;
1482 case SCE_PL_WORD:
1483 preferRE = true;
1484 // for HERE docs, always true
1485 if (sc.ch == '/') {
1486 // adopt heuristics similar to vim-style rules:
1487 // keywords always forced as /PATTERN/: split, if, elsif, while
1488 // everything else /PATTERN/ unless digit/space immediately after '/'
1489 // for '//', defined-or favoured unless special keywords
1490 Sci_PositionU bkend = bk + 1;
1491 while (bk > 0 && styler.StyleAt(bk - 1) == SCE_PL_WORD) {
1492 bk--;
1494 if (isPerlKeyword(bk, bkend, reWords, styler))
1495 break;
1496 if (IsASpace(sc.chNext) || IsADigit(sc.chNext) || sc.chNext == '/')
1497 preferRE = false;
1498 } else if (sc.ch == '*' || sc.ch == '%') {
1499 if (IsASpace(sc.chNext) || IsADigit(sc.chNext) || sc.Match('*', '*'))
1500 preferRE = false;
1501 } else if (sc.ch == '<') {
1502 if (IsASpace(sc.chNext) || sc.chNext == '=')
1503 preferRE = false;
1505 break;
1507 // other styles uses the default, preferRE=false
1508 case SCE_PL_POD:
1509 case SCE_PL_HERE_Q:
1510 case SCE_PL_HERE_QQ:
1511 case SCE_PL_HERE_QX:
1512 preferRE = true;
1513 break;
1516 backFlag = BACK_NONE;
1517 if (isHereDoc) { // handle '<<', HERE doc
1518 if (sc.Match("<<>>")) { // double-diamond operator (5.22)
1519 sc.SetState(SCE_PL_OPERATOR);
1520 sc.Forward(3);
1521 } else if (preferRE) {
1522 sc.SetState(SCE_PL_HERE_DELIM);
1523 HereDoc.State = 0;
1524 } else { // << operator
1525 sc.SetState(SCE_PL_OPERATOR);
1526 sc.Forward();
1528 } else if (sc.ch == '*') { // handle '*', typeglob
1529 if (preferRE) {
1530 sc.SetState(SCE_PL_SYMBOLTABLE);
1531 if (sc.chNext == ':' && sc.GetRelative(2) == ':') {
1532 sc.ForwardBytes(2);
1533 } else if (sc.chNext == '{') {
1534 sc.ForwardSetState(SCE_PL_OPERATOR);
1535 } else {
1536 sc.Forward();
1538 } else {
1539 sc.SetState(SCE_PL_OPERATOR);
1540 if (sc.chNext == '*') // exponentiation
1541 sc.Forward();
1543 } else if (sc.ch == '%') { // handle '%', hash
1544 if (preferRE) {
1545 sc.SetState(SCE_PL_HASH);
1546 if (setHash.Contains(sc.chNext)) {
1547 sc.Forward();
1548 } else if (sc.chNext == ':' && sc.GetRelative(2) == ':') {
1549 sc.ForwardBytes(2);
1550 } else if (sc.chNext == '{') {
1551 sc.ForwardSetState(SCE_PL_OPERATOR);
1552 } else {
1553 sc.ChangeState(SCE_PL_OPERATOR);
1555 } else {
1556 sc.SetState(SCE_PL_OPERATOR);
1558 } else if (sc.ch == '<') { // handle '<', inputsymbol
1559 if (preferRE) {
1560 // forward scan
1561 int i = InputSymbolScan(sc);
1562 if (i > 0) {
1563 sc.SetState(SCE_PL_IDENTIFIER);
1564 sc.Forward(i);
1565 } else {
1566 sc.SetState(SCE_PL_OPERATOR);
1568 } else {
1569 sc.SetState(SCE_PL_OPERATOR);
1571 } else { // handle '/', regexp
1572 if (preferRE) {
1573 sc.SetState(SCE_PL_REGEX);
1574 Quote.New();
1575 Quote.Open(sc.ch);
1576 } else { // / and // operators
1577 sc.SetState(SCE_PL_OPERATOR);
1578 if (sc.chNext == '/') {
1579 sc.Forward();
1583 } else if (sc.ch == '=' // POD
1584 && setPOD.Contains(sc.chNext)
1585 && sc.atLineStart) {
1586 sc.SetState(SCE_PL_POD);
1587 backFlag = BACK_NONE;
1588 } else if (sc.ch == '-' && setWordStart.Contains(sc.chNext)) { // extended '-' cases
1589 Sci_PositionU bk = sc.currentPos;
1590 Sci_PositionU fw = 2;
1591 if (setSingleCharOp.Contains(sc.chNext) && // file test operators
1592 !setWord.Contains(sc.GetRelative(2))) {
1593 sc.SetState(SCE_PL_WORD);
1594 } else {
1595 // nominally a minus and bareword; find extent of bareword
1596 while (setWord.Contains(sc.GetRelative(fw)))
1597 fw++;
1598 sc.SetState(SCE_PL_OPERATOR);
1600 // force to bareword for hash key => or {variable literal} cases
1601 if (disambiguateBareword(styler, bk, bk + fw, backFlag, backPos, endPos) & 2) {
1602 sc.ChangeState(SCE_PL_IDENTIFIER);
1604 backFlag = BACK_NONE;
1605 } else if (sc.ch == '(' && sc.currentPos > 0) { // '(' or subroutine prototype
1606 sc.Complete();
1607 if (styleCheckSubPrototype(styler, sc.currentPos - 1)) {
1608 sc.SetState(SCE_PL_SUB_PROTOTYPE);
1609 backFlag = BACK_NONE;
1610 } else {
1611 sc.SetState(SCE_PL_OPERATOR);
1613 } else if (setPerlOperator.Contains(sc.ch)) { // operators
1614 sc.SetState(SCE_PL_OPERATOR);
1615 if (sc.Match('.', '.')) { // .. and ...
1616 sc.Forward();
1617 if (sc.chNext == '.') sc.Forward();
1619 } else if (sc.ch == 4 || sc.ch == 26) { // ^D and ^Z ends valid perl source
1620 sc.SetState(SCE_PL_DATASECTION);
1621 } else {
1622 // keep colouring defaults
1623 sc.Complete();
1627 sc.Complete();
1628 if (sc.state == SCE_PL_HERE_Q
1629 || sc.state == SCE_PL_HERE_QQ
1630 || sc.state == SCE_PL_HERE_QX
1631 || sc.state == SCE_PL_FORMAT) {
1632 styler.ChangeLexerState(sc.currentPos, styler.Length());
1634 sc.Complete();
1637 #define PERL_HEADFOLD_SHIFT 4
1638 #define PERL_HEADFOLD_MASK 0xF0
1640 void SCI_METHOD LexerPerl::Fold(Sci_PositionU startPos, Sci_Position length, int /* initStyle */, IDocument *pAccess) {
1642 if (!options.fold)
1643 return;
1645 LexAccessor styler(pAccess);
1647 Sci_PositionU endPos = startPos + length;
1648 int visibleChars = 0;
1649 Sci_Position lineCurrent = styler.GetLine(startPos);
1651 // Backtrack to previous line in case need to fix its fold status
1652 if (startPos > 0) {
1653 if (lineCurrent > 0) {
1654 lineCurrent--;
1655 startPos = styler.LineStart(lineCurrent);
1659 int levelPrev = SC_FOLDLEVELBASE;
1660 if (lineCurrent > 0)
1661 levelPrev = styler.LevelAt(lineCurrent - 1) >> 16;
1662 int levelCurrent = levelPrev;
1663 char chNext = styler[startPos];
1664 char chPrev = styler.SafeGetCharAt(startPos - 1);
1665 int styleNext = styler.StyleAt(startPos);
1666 // Used at end of line to determine if the line was a package definition
1667 bool isPackageLine = false;
1668 int podHeading = 0;
1669 for (Sci_PositionU i = startPos; i < endPos; i++) {
1670 char ch = chNext;
1671 chNext = styler.SafeGetCharAt(i + 1);
1672 int style = styleNext;
1673 styleNext = styler.StyleAt(i + 1);
1674 int stylePrevCh = (i) ? styler.StyleAt(i - 1):SCE_PL_DEFAULT;
1675 bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n');
1676 bool atLineStart = ((chPrev == '\r') || (chPrev == '\n')) || i == 0;
1677 // Comment folding
1678 if (options.foldComment && atEOL && IsCommentLine(lineCurrent, styler)) {
1679 if (!IsCommentLine(lineCurrent - 1, styler)
1680 && IsCommentLine(lineCurrent + 1, styler))
1681 levelCurrent++;
1682 else if (IsCommentLine(lineCurrent - 1, styler)
1683 && !IsCommentLine(lineCurrent + 1, styler))
1684 levelCurrent--;
1686 // {} [] block folding
1687 if (style == SCE_PL_OPERATOR) {
1688 if (ch == '{') {
1689 if (options.foldAtElse && levelCurrent < levelPrev)
1690 --levelPrev;
1691 levelCurrent++;
1692 } else if (ch == '}') {
1693 levelCurrent--;
1695 if (ch == '[') {
1696 if (options.foldAtElse && levelCurrent < levelPrev)
1697 --levelPrev;
1698 levelCurrent++;
1699 } else if (ch == ']') {
1700 levelCurrent--;
1703 // POD folding
1704 if (options.foldPOD && atLineStart) {
1705 if (style == SCE_PL_POD) {
1706 if (stylePrevCh != SCE_PL_POD && stylePrevCh != SCE_PL_POD_VERB)
1707 levelCurrent++;
1708 else if (styler.Match(i, "=cut"))
1709 levelCurrent = (levelCurrent & ~PERL_HEADFOLD_MASK) - 1;
1710 else if (styler.Match(i, "=head"))
1711 podHeading = PodHeadingLevel(i, styler);
1712 } else if (style == SCE_PL_DATASECTION) {
1713 if (ch == '=' && IsASCII(chNext) && isalpha(chNext) && levelCurrent == SC_FOLDLEVELBASE)
1714 levelCurrent++;
1715 else if (styler.Match(i, "=cut") && levelCurrent > SC_FOLDLEVELBASE)
1716 levelCurrent = (levelCurrent & ~PERL_HEADFOLD_MASK) - 1;
1717 else if (styler.Match(i, "=head"))
1718 podHeading = PodHeadingLevel(i, styler);
1719 // if package used or unclosed brace, level > SC_FOLDLEVELBASE!
1720 // reset needed as level test is vs. SC_FOLDLEVELBASE
1721 else if (stylePrevCh != SCE_PL_DATASECTION)
1722 levelCurrent = SC_FOLDLEVELBASE;
1725 // package folding
1726 if (options.foldPackage && atLineStart) {
1727 if (IsPackageLine(lineCurrent, styler)
1728 && !IsPackageLine(lineCurrent + 1, styler))
1729 isPackageLine = true;
1732 //heredoc folding
1733 switch (style) {
1734 case SCE_PL_HERE_QQ :
1735 case SCE_PL_HERE_Q :
1736 case SCE_PL_HERE_QX :
1737 switch (stylePrevCh) {
1738 case SCE_PL_HERE_QQ :
1739 case SCE_PL_HERE_Q :
1740 case SCE_PL_HERE_QX :
1741 //do nothing;
1742 break;
1743 default :
1744 levelCurrent++;
1745 break;
1747 break;
1748 default:
1749 switch (stylePrevCh) {
1750 case SCE_PL_HERE_QQ :
1751 case SCE_PL_HERE_Q :
1752 case SCE_PL_HERE_QX :
1753 levelCurrent--;
1754 break;
1755 default :
1756 //do nothing;
1757 break;
1759 break;
1762 //explicit folding
1763 if (options.foldCommentExplicit && style == SCE_PL_COMMENTLINE && ch == '#') {
1764 if (chNext == '{') {
1765 levelCurrent++;
1766 } else if (levelCurrent > SC_FOLDLEVELBASE && chNext == '}') {
1767 levelCurrent--;
1771 if (atEOL) {
1772 int lev = levelPrev;
1773 // POD headings occupy bits 7-4, leaving some breathing room for
1774 // non-standard practice -- POD sections stuck in blocks, etc.
1775 if (podHeading > 0) {
1776 levelCurrent = (lev & ~PERL_HEADFOLD_MASK) | (podHeading << PERL_HEADFOLD_SHIFT);
1777 lev = levelCurrent - 1;
1778 lev |= SC_FOLDLEVELHEADERFLAG;
1779 podHeading = 0;
1781 // Check if line was a package declaration
1782 // because packages need "special" treatment
1783 if (isPackageLine) {
1784 lev = SC_FOLDLEVELBASE | SC_FOLDLEVELHEADERFLAG;
1785 levelCurrent = SC_FOLDLEVELBASE + 1;
1786 isPackageLine = false;
1788 lev |= levelCurrent << 16;
1789 if (visibleChars == 0 && options.foldCompact)
1790 lev |= SC_FOLDLEVELWHITEFLAG;
1791 if ((levelCurrent > levelPrev) && (visibleChars > 0))
1792 lev |= SC_FOLDLEVELHEADERFLAG;
1793 if (lev != styler.LevelAt(lineCurrent)) {
1794 styler.SetLevel(lineCurrent, lev);
1796 lineCurrent++;
1797 levelPrev = levelCurrent;
1798 visibleChars = 0;
1800 if (!isspacechar(ch))
1801 visibleChars++;
1802 chPrev = ch;
1804 // Fill in the real level of the next line, keeping the current flags as they will be filled in later
1805 int flagsNext = styler.LevelAt(lineCurrent) & ~SC_FOLDLEVELNUMBERMASK;
1806 styler.SetLevel(lineCurrent, levelPrev | flagsNext);
// Lexer module object for Perl: constructed from the SCLEX_PERL identifier,
// the LexerPerl::LexerFactoryPerl factory, the name "perl" and perlWordListDesc.
1809 LexerModule lmPerl(SCLEX_PERL, LexerPerl::LexerFactoryPerl, "perl", perlWordListDesc);