ext/scintilla/lexers/LexJSON.cxx

   1 // Scintilla source code edit control
   2 /**
   3  * @file LexJSON.cxx
   4  * @date February 19, 2016
   5  * @brief Lexer for JSON and JSON-LD formats
   6  * @author nkmathew
   7  *
   8  * The License.txt file describes the conditions under which this software may
   9  * be distributed.
  10  *
  11  */
  12
  13 #include <cstdlib>
  14 #include <cassert>
  15 #include <cctype>
  16 #include <cstdio>
  17 #include <string>
  18 #include <vector>
  19 #include <map>
  20
  21 #include "ILexer.h"
  22 #include "Scintilla.h"
  23 #include "SciLexer.h"
  24 #include "WordList.h"
  25 #include "LexAccessor.h"
  26 #include "StyleContext.h"
  27 #include "CharacterSet.h"
  28 #include "LexerModule.h"
  29 #include "OptionSet.h"
  30 #include "DefaultLexer.h"
  31
  32 using namespace Scintilla;
  33
  34 static const char *const JSONWordListDesc[] = {
  35         "JSON Keywords",
  36         "JSON-LD Keywords",
  37         0
  38 };
  39
  40 /**
  41  * Used to detect compact IRI/URLs in JSON-LD without first looking ahead for the
  42  * colon separating the prefix and suffix
  43  *
  44  * https://www.w3.org/TR/json-ld/#dfn-compact-iri
  45  */
  46 struct CompactIRI {
  47         int colonCount;
  48         bool foundInvalidChar;
  49         CharacterSet setCompactIRI;
  50         CompactIRI() {
  51                 colonCount = 0;
  52                 foundInvalidChar = false;
  53                 setCompactIRI = CharacterSet(CharacterSet::setAlpha, "$_-");
  54         }
  55         void resetState() {
  56                 colonCount = 0;
  57                 foundInvalidChar = false;
  58         }
  59         void checkChar(int ch) {
  60                 if (ch == ':') {
  61                         colonCount++;
  62                 } else {
  63                         foundInvalidChar |= !setCompactIRI.Contains(ch);
  64                 }
  65         }
  66         bool shouldHighlight() const {
  67                 return !foundInvalidChar && colonCount == 1;
  68         }
  69 };
  70
  71 /**
  72  * Keeps track of escaped characters in strings as per:
  73  *
  74  * https://tools.ietf.org/html/rfc7159#section-7
  75  */
  76 struct EscapeSequence {
  77         int digitsLeft;
  78         CharacterSet setHexDigits;
  79         CharacterSet setEscapeChars;
  80         EscapeSequence() {
  81                 digitsLeft = 0;
  82                 setHexDigits = CharacterSet(CharacterSet::setDigits, "ABCDEFabcdef");
  83                 setEscapeChars = CharacterSet(CharacterSet::setNone, "\\\"tnbfru/");
  84         }
  85         // Returns true if the following character is a valid escaped character
  86         bool newSequence(int nextChar) {
  87                 digitsLeft = 0;
  88                 if (nextChar == 'u') {
  89                         digitsLeft = 5;
  90                 } else if (!setEscapeChars.Contains(nextChar)) {
  91                         return false;
  92                 }
  93                 return true;
  94         }
  95         bool atEscapeEnd() const {
  96                 return digitsLeft <= 0;
  97         }
  98         bool isInvalidChar(int currChar) const {
  99                 return !setHexDigits.Contains(currChar);
 100         }
 101 };
 102
 103 struct OptionsJSON {
 104         bool foldCompact;
 105         bool fold;
 106         bool allowComments;
 107         bool escapeSequence;
 108         OptionsJSON() {
 109                 foldCompact = false;
 110                 fold = false;
 111                 allowComments = false;
 112                 escapeSequence = false;
 113         }
 114 };
 115
 116 struct OptionSetJSON : public OptionSet<OptionsJSON> {
 117         OptionSetJSON() {
 118                 DefineProperty("lexer.json.escape.sequence", &OptionsJSON::escapeSequence,
 119                                            "Set to 1 to enable highlighting of escape sequences in strings");
 120
 121                 DefineProperty("lexer.json.allow.comments", &OptionsJSON::allowComments,
 122                                            "Set to 1 to enable highlighting of line/block comments in JSON");
 123
 124                 DefineProperty("fold.compact", &OptionsJSON::foldCompact);
 125                 DefineProperty("fold", &OptionsJSON::fold);
 126                 DefineWordListSets(JSONWordListDesc);
 127         }
 128 };
 129
 130 class LexerJSON : public DefaultLexer {
 131         OptionsJSON options;
 132         OptionSetJSON optSetJSON;
 133         EscapeSequence escapeSeq;
 134         WordList keywordsJSON;
 135         WordList keywordsJSONLD;
 136         CharacterSet setOperators;
 137         CharacterSet setURL;
 138         CharacterSet setKeywordJSONLD;
 139         CharacterSet setKeywordJSON;
 140         CompactIRI compactIRI;
 141
 142         static bool IsNextNonWhitespace(LexAccessor &styler, Sci_Position start, char ch) {
 143                 Sci_Position i = 0;
 144                 while (i < 50) {
 145                         i++;
 146                         char curr = styler.SafeGetCharAt(start+i, '\0');
 147                         char next = styler.SafeGetCharAt(start+i+1, '\0');
 148                         bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
 149                         if (curr == ch) {
 150                                 return true;
 151                         } else if (!isspacechar(curr) || atEOL) {
 152                                 return false;
 153                         }
 154                 }
 155                 return false;
 156         }
 157
 158         /**
 159          * Looks for the colon following the end quote
 160          *
 161          * Assumes property names of lengths no longer than a 100 characters.
 162          * The colon is also expected to be less than 50 spaces after the end
 163          * quote for the string to be considered a property name
 164          */
 165         static bool AtPropertyName(LexAccessor &styler, Sci_Position start) {
 166                 Sci_Position i = 0;
 167                 bool escaped = false;
 168                 while (i < 100) {
 169                         i++;
 170                         char curr = styler.SafeGetCharAt(start+i, '\0');
 171                         if (escaped) {
 172                                 escaped = false;
 173                                 continue;
 174                         }
 175                         escaped = curr == '\\';
 176                         if (curr == '"') {
 177                                 return IsNextNonWhitespace(styler, start+i, ':');
 178                         } else if (!curr) {
 179                                 return false;
 180                         }
 181                 }
 182                 return false;
 183         }
 184
 185         static bool IsNextWordInList(WordList &keywordList, CharacterSet wordSet,
 186                                                                  StyleContext &context, LexAccessor &styler) {
 187                 char word[51];
 188                 Sci_Position currPos = (Sci_Position) context.currentPos;
 189                 int i = 0;
 190                 while (i < 50) {
 191                         char ch = styler.SafeGetCharAt(currPos + i);
 192                         if (!wordSet.Contains(ch)) {
 193                                 break;
 194                         }
 195                         word[i] = ch;
 196                         i++;
 197                 }
 198                 word[i] = '\0';
 199                 return keywordList.InList(word);
 200         }
 201
 202         public:
 203         LexerJSON() :
 204                 setOperators(CharacterSet::setNone, "[{}]:,"),
 205                 setURL(CharacterSet::setAlphaNum, "-._~:/?#[]@!$&'()*+,),="),
 206                 setKeywordJSONLD(CharacterSet::setAlpha, ":@"),
 207                 setKeywordJSON(CharacterSet::setAlpha, "$_") {
 208         }
 209         virtual ~LexerJSON() {}
 210         int SCI_METHOD Version() const override {
 211                 return lvRelease4;
 212         }
 213         void SCI_METHOD Release() override {
 214                 delete this;
 215         }
 216         const char *SCI_METHOD PropertyNames() override {
 217                 return optSetJSON.PropertyNames();
 218         }
 219         int SCI_METHOD PropertyType(const char *name) override {
 220                 return optSetJSON.PropertyType(name);
 221         }
 222         const char *SCI_METHOD DescribeProperty(const char *name) override {
 223                 return optSetJSON.DescribeProperty(name);
 224         }
 225         Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override {
 226                 if (optSetJSON.PropertySet(&options, key, val)) {
 227                         return 0;
 228                 }
 229                 return -1;
 230         }
 231         Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override {
 232                 WordList *wordListN = 0;
 233                 switch (n) {
 234                         case 0:
 235                                 wordListN = &keywordsJSON;
 236                                 break;
 237                         case 1:
 238                                 wordListN = &keywordsJSONLD;
 239                                 break;
 240                 }
 241                 Sci_Position firstModification = -1;
 242                 if (wordListN) {
 243                         WordList wlNew;
 244                         wlNew.Set(wl);
 245                         if (*wordListN != wlNew) {
 246                                 wordListN->Set(wl);
 247                                 firstModification = 0;
 248                         }
 249                 }
 250                 return firstModification;
 251         }
 252         void *SCI_METHOD PrivateCall(int, void *) override {
 253                 return 0;
 254         }
 255         static ILexer4 *LexerFactoryJSON() {
 256                 return new LexerJSON;
 257         }
 258         const char *SCI_METHOD DescribeWordListSets() override {
 259                 return optSetJSON.DescribeWordListSets();
 260         }
 261         void SCI_METHOD Lex(Sci_PositionU startPos,
 262                                                                 Sci_Position length,
 263                                                                 int initStyle,
 264                                                                 IDocument *pAccess) override;
 265         void SCI_METHOD Fold(Sci_PositionU startPos,
 266                                                                  Sci_Position length,
 267                                                                  int initStyle,
 268                                                                  IDocument *pAccess) override;
 269 };
 270
 271 void SCI_METHOD LexerJSON::Lex(Sci_PositionU startPos,
 272                                                            Sci_Position length,
 273                                                            int initStyle,
 274                                                            IDocument *pAccess) {
 275         LexAccessor styler(pAccess);
 276         StyleContext context(startPos, length, initStyle, styler);
 277         int stringStyleBefore = SCE_JSON_STRING;
 278         while (context.More()) {
 279                 switch (context.state) {
 280                         case SCE_JSON_BLOCKCOMMENT:
 281                                 if (context.Match("*/")) {
 282                                         context.Forward();
 283                                         context.ForwardSetState(SCE_JSON_DEFAULT);
 284                                 }
 285                                 break;
 286                         case SCE_JSON_LINECOMMENT:
 287                                 if (context.atLineEnd) {
 288                                         context.SetState(SCE_JSON_DEFAULT);
 289                                 }
 290                                 break;
 291                         case SCE_JSON_STRINGEOL:
 292                                 if (context.atLineStart) {
 293                                         context.SetState(SCE_JSON_DEFAULT);
 294                                 }
 295                                 break;
 296                         case SCE_JSON_ESCAPESEQUENCE:
 297                                 escapeSeq.digitsLeft--;
 298                                 if (!escapeSeq.atEscapeEnd()) {
 299                                         if (escapeSeq.isInvalidChar(context.ch)) {
 300                                                 context.SetState(SCE_JSON_ERROR);
 301                                         }
 302                                         break;
 303                                 }
 304                                 if (context.ch == '"') {
 305                                         context.SetState(stringStyleBefore);
 306                                         context.ForwardSetState(SCE_C_DEFAULT);
 307                                 } else if (context.ch == '\\') {
 308                                         if (!escapeSeq.newSequence(context.chNext)) {
 309                                                 context.SetState(SCE_JSON_ERROR);
 310                                         }
 311                                         context.Forward();
 312                                 } else {
 313                                         context.SetState(stringStyleBefore);
 314                                         if (context.atLineEnd) {
 315                                                 context.ChangeState(SCE_JSON_STRINGEOL);
 316                                         }
 317                                 }
 318                                 break;
 319                         case SCE_JSON_PROPERTYNAME:
 320                         case SCE_JSON_STRING:
 321                                 if (context.ch == '"') {
 322                                         if (compactIRI.shouldHighlight()) {
 323                                                 context.ChangeState(SCE_JSON_COMPACTIRI);
 324                                                 context.ForwardSetState(SCE_JSON_DEFAULT);
 325                                                 compactIRI.resetState();
 326                                         } else {
 327                                                 context.ForwardSetState(SCE_JSON_DEFAULT);
 328                                         }
 329                                 } else if (context.atLineEnd) {
 330                                         context.ChangeState(SCE_JSON_STRINGEOL);
 331                                 } else if (context.ch == '\\') {
 332                                         stringStyleBefore = context.state;
 333                                         if (options.escapeSequence) {
 334                                                 context.SetState(SCE_JSON_ESCAPESEQUENCE);
 335                                                 if (!escapeSeq.newSequence(context.chNext)) {
 336                                                         context.SetState(SCE_JSON_ERROR);
 337                                                 }
 338                                         }
 339                                         context.Forward();
 340                                 } else if (context.Match("https://") ||
 341                                                    context.Match("http://") ||
 342                                                    context.Match("ssh://") ||
 343                                                    context.Match("git://") ||
 344                                                    context.Match("svn://") ||
 345                                                    context.Match("ftp://") ||
 346                                                    context.Match("mailto:")) {
 347                                         // Handle most common URI schemes only
 348                                         stringStyleBefore = context.state;
 349                                         context.SetState(SCE_JSON_URI);
 350                                 } else if (context.ch == '@') {
 351                                         // https://www.w3.org/TR/json-ld/#dfn-keyword
 352                                         if (IsNextWordInList(keywordsJSONLD, setKeywordJSONLD, context, styler)) {
 353                                                 stringStyleBefore = context.state;
 354                                                 context.SetState(SCE_JSON_LDKEYWORD);
 355                                         }
 356                                 } else {
 357                                         compactIRI.checkChar(context.ch);
 358                                 }
 359                                 break;
 360                         case SCE_JSON_LDKEYWORD:
 361                         case SCE_JSON_URI:
 362                                 if ((!setKeywordJSONLD.Contains(context.ch) &&
 363                                          (context.state == SCE_JSON_LDKEYWORD)) ||
 364                                         (!setURL.Contains(context.ch))) {
 365                                         context.SetState(stringStyleBefore);
 366                                 }
 367                                 if (context.ch == '"') {
 368                                         context.ForwardSetState(SCE_JSON_DEFAULT);
 369                                 } else if (context.atLineEnd) {
 370                                         context.ChangeState(SCE_JSON_STRINGEOL);
 371                                 }
 372                                 break;
 373                         case SCE_JSON_OPERATOR:
 374                         case SCE_JSON_NUMBER:
 375                                 context.SetState(SCE_JSON_DEFAULT);
 376                                 break;
 377                         case SCE_JSON_ERROR:
 378                                 if (context.atLineEnd) {
 379                                         context.SetState(SCE_JSON_DEFAULT);
 380                                 }
 381                                 break;
 382                         case SCE_JSON_KEYWORD:
 383                                 if (!setKeywordJSON.Contains(context.ch)) {
 384                                         context.SetState(SCE_JSON_DEFAULT);
 385                                 }
 386                                 break;
 387                 }
 388                 if (context.state == SCE_JSON_DEFAULT) {
 389                         if (context.ch == '"') {
 390                                 compactIRI.resetState();
 391                                 context.SetState(SCE_JSON_STRING);
 392                                 Sci_Position currPos = static_cast<Sci_Position>(context.currentPos);
 393                                 if (AtPropertyName(styler, currPos)) {
 394                                         context.SetState(SCE_JSON_PROPERTYNAME);
 395                                 }
 396                         } else if (setOperators.Contains(context.ch)) {
 397                                 context.SetState(SCE_JSON_OPERATOR);
 398                         } else if (options.allowComments && context.Match("/*")) {
 399                                 context.SetState(SCE_JSON_BLOCKCOMMENT);
 400                                 context.Forward();
 401                         } else if (options.allowComments && context.Match("//")) {
 402                                 context.SetState(SCE_JSON_LINECOMMENT);
 403                         } else if (setKeywordJSON.Contains(context.ch)) {
 404                                 if (IsNextWordInList(keywordsJSON, setKeywordJSON, context, styler)) {
 405                                         context.SetState(SCE_JSON_KEYWORD);
 406                                 }
 407                         }
 408                         bool numberStart =
 409                                 IsADigit(context.ch) && (context.chPrev == '+'||
 410                                                                                  context.chPrev == '-' ||
 411                                                                                  context.atLineStart ||
 412                                                                                  IsASpace(context.chPrev) ||
 413                                                                                  setOperators.Contains(context.chPrev));
 414                         bool exponentPart =
 415                                 tolower(context.ch) == 'e' &&
 416                                 IsADigit(context.chPrev) &&
 417                                 (IsADigit(context.chNext) ||
 418                                  context.chNext == '+' ||
 419                                  context.chNext == '-');
 420                         bool signPart =
 421                                 (context.ch == '-' || context.ch == '+') &&
 422                                 ((tolower(context.chPrev) == 'e' && IsADigit(context.chNext)) ||
 423                                  ((IsASpace(context.chPrev) || setOperators.Contains(context.chPrev))
 424                                   && IsADigit(context.chNext)));
 425                         bool adjacentDigit =
 426                                 IsADigit(context.ch) && IsADigit(context.chPrev);
 427                         bool afterExponent = IsADigit(context.ch) && tolower(context.chPrev) == 'e';
 428                         bool dotPart = context.ch == '.' &&
 429                                 IsADigit(context.chPrev) &&
 430                                 IsADigit(context.chNext);
 431                         bool afterDot = IsADigit(context.ch) && context.chPrev == '.';
 432                         if (numberStart ||
 433                                 exponentPart ||
 434                                 signPart ||
 435                                 adjacentDigit ||
 436                                 dotPart ||
 437                                 afterExponent ||
 438                                 afterDot) {
 439                                 context.SetState(SCE_JSON_NUMBER);
 440                         } else if (context.state == SCE_JSON_DEFAULT && !IsASpace(context.ch)) {
 441                                 context.SetState(SCE_JSON_ERROR);
 442                         }
 443                 }
 444                 context.Forward();
 445         }
 446         context.Complete();
 447 }
 448
 449 void SCI_METHOD LexerJSON::Fold(Sci_PositionU startPos,
 450                                                                 Sci_Position length,
 451                                                                 int,
 452                                                                 IDocument *pAccess) {
 453         if (!options.fold) {
 454                 return;
 455         }
 456         LexAccessor styler(pAccess);
 457         Sci_PositionU currLine = styler.GetLine(startPos);
 458         Sci_PositionU endPos = startPos + length;
 459         int currLevel = SC_FOLDLEVELBASE;
 460         if (currLine > 0)
 461                 currLevel = styler.LevelAt(currLine - 1) >> 16;
 462         int nextLevel = currLevel;
 463         int visibleChars = 0;
 464         for (Sci_PositionU i = startPos; i < endPos; i++) {
 465                 char curr = styler.SafeGetCharAt(i);
 466                 char next = styler.SafeGetCharAt(i+1);
 467                 bool atEOL = (curr == '\r' && next != '\n') || (curr == '\n');
 468                 if (styler.StyleAt(i) == SCE_JSON_OPERATOR) {
 469                         if (curr == '{' || curr == '[') {
 470                                 nextLevel++;
 471                         } else if (curr == '}' || curr == ']') {
 472                                 nextLevel--;
 473                         }
 474                 }
 475                 if (atEOL || i == (endPos-1)) {
 476                         int level = currLevel | nextLevel << 16;
 477                         if (!visibleChars && options.foldCompact) {
 478                                 level |= SC_FOLDLEVELWHITEFLAG;
 479                         } else if (nextLevel > currLevel) {
 480                                 level |= SC_FOLDLEVELHEADERFLAG;
 481                         }
 482                         if (level != styler.LevelAt(currLine)) {
 483                                 styler.SetLevel(currLine, level);
 484                         }
 485                         currLine++;
 486                         currLevel = nextLevel;
 487                         visibleChars = 0;
 488                 }
 489                 if (!isspacechar(curr)) {
 490                         visibleChars++;
 491                 }
 492         }
 493 }
 494
 495 LexerModule lmJSON(SCLEX_JSON,
 496                                    LexerJSON::LexerFactoryJSON,
 497                                    "json",
 498                                    JSONWordListDesc);