scintilla: Update scintilla with changeset 3662:1d1c06df8a2f using gtk+3
[anjuta-extras.git] / plugins / scintilla / scintilla / LexCaml.cxx
blobf576e3e2349d13c5193f0f44cba8b5dcc0fc0f19
1 // Scintilla source code edit control
2 /** @file LexCaml.cxx
3 ** Lexer for Objective Caml.
4 **/
5 // Copyright 2005-2009 by Robert Roessler <robertr@rftp.com>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 /* Release History
8 20050204 Initial release.
9 20050205 Quick compiler standards/"cleanliness" adjustment.
10 20050206 Added cast for IsLeadByte().
11 20050209 Changes to "external" build support.
12 20050306 Fix for 1st-char-in-doc "corner" case.
13 20050502 Fix for [harmless] one-past-the-end coloring.
14 20050515 Refined numeric token recognition logic.
15 20051125 Added 2nd "optional" keywords class.
16 20051129 Support "magic" (read-only) comments for RCaml.
17 20051204 Swtich to using StyleContext infrastructure.
18 20090629 Add full Standard ML '97 support.
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdio.h>
24 #include <stdarg.h>
25 #include <assert.h>
26 #include <ctype.h>
28 #include "ILexer.h"
29 #include "Scintilla.h"
30 #include "SciLexer.h"
32 #include "PropSetSimple.h"
33 #include "WordList.h"
34 #include "LexAccessor.h"
35 #include "Accessor.h"
36 #include "StyleContext.h"
37 #include "CharacterSet.h"
38 #include "LexerModule.h"
40 // Since the Microsoft __iscsym[f] funcs are not ANSI...
41 inline int iscaml(int c) {return isalnum(c) || c == '_';}
42 inline int iscamlf(int c) {return isalpha(c) || c == '_';}
44 static const int baseT[24] = {
45 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A - L */
46 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,16 /* M - X */
49 #ifdef SCI_NAMESPACE
50 using namespace Scintilla;
51 #endif
53 #ifdef BUILD_AS_EXTERNAL_LEXER
55 (actually seems to work!)
57 #include <string>
58 #include "WindowAccessor.h"
59 #include "ExternalLexer.h"
61 #undef EXT_LEXER_DECL
62 #define EXT_LEXER_DECL __declspec( dllexport ) __stdcall
64 #if PLAT_WIN
65 #include <windows.h>
66 #endif
68 static void ColouriseCamlDoc(
69 unsigned int startPos, int length,
70 int initStyle,
71 WordList *keywordlists[],
72 Accessor &styler);
74 static void FoldCamlDoc(
75 unsigned int startPos, int length,
76 int initStyle,
77 WordList *keywordlists[],
78 Accessor &styler);
80 static void InternalLexOrFold(int lexOrFold, unsigned int startPos, int length,
81 int initStyle, char *words[], WindowID window, char *props);
83 static const char* LexerName = "caml";
85 #ifdef TRACE
86 void Platform::DebugPrintf(const char *format, ...) {
87 char buffer[2000];
88 va_list pArguments;
89 va_start(pArguments, format);
90 vsprintf(buffer,format,pArguments);
91 va_end(pArguments);
92 Platform::DebugDisplay(buffer);
94 #else
95 void Platform::DebugPrintf(const char *, ...) {
97 #endif
99 bool Platform::IsDBCSLeadByte(int codePage, char ch) {
100 return ::IsDBCSLeadByteEx(codePage, ch) != 0;
103 long Platform::SendScintilla(WindowID w, unsigned int msg, unsigned long wParam, long lParam) {
104 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam, lParam);
107 long Platform::SendScintillaPointer(WindowID w, unsigned int msg, unsigned long wParam, void *lParam) {
108 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam,
109 reinterpret_cast<LPARAM>(lParam));
112 void EXT_LEXER_DECL Fold(unsigned int lexer, unsigned int startPos, int length,
113 int initStyle, char *words[], WindowID window, char *props)
115 // below useless evaluation(s) to supress "not used" warnings
116 lexer;
117 // build expected data structures and do the Fold
118 InternalLexOrFold(1, startPos, length, initStyle, words, window, props);
122 int EXT_LEXER_DECL GetLexerCount()
124 return 1; // just us [Objective] Caml lexers here!
127 void EXT_LEXER_DECL GetLexerName(unsigned int Index, char *name, int buflength)
129 // below useless evaluation(s) to supress "not used" warnings
130 Index;
131 // return as much of our lexer name as will fit (what's up with Index?)
132 if (buflength > 0) {
133 buflength--;
134 int n = strlen(LexerName);
135 if (n > buflength)
136 n = buflength;
137 memcpy(name, LexerName, n), name[n] = '\0';
141 void EXT_LEXER_DECL Lex(unsigned int lexer, unsigned int startPos, int length,
142 int initStyle, char *words[], WindowID window, char *props)
144 // below useless evaluation(s) to supress "not used" warnings
145 lexer;
146 // build expected data structures and do the Lex
147 InternalLexOrFold(0, startPos, length, initStyle, words, window, props);
150 static void InternalLexOrFold(int foldOrLex, unsigned int startPos, int length,
151 int initStyle, char *words[], WindowID window, char *props)
153 // create and initialize a WindowAccessor (including contained PropSet)
154 PropSetSimple ps;
155 ps.SetMultiple(props);
156 WindowAccessor wa(window, ps);
157 // create and initialize WordList(s)
158 int nWL = 0;
159 for (; words[nWL]; nWL++) ; // count # of WordList PTRs needed
160 WordList** wl = new WordList* [nWL + 1];// alloc WordList PTRs
161 int i = 0;
162 for (; i < nWL; i++) {
163 wl[i] = new WordList(); // (works or THROWS bad_alloc EXCEPTION)
164 wl[i]->Set(words[i]);
166 wl[i] = 0;
167 // call our "internal" folder/lexer (... then do Flush!)
168 if (foldOrLex)
169 FoldCamlDoc(startPos, length, initStyle, wl, wa);
170 else
171 ColouriseCamlDoc(startPos, length, initStyle, wl, wa);
172 wa.Flush();
173 // clean up before leaving
174 for (i = nWL - 1; i >= 0; i--)
175 delete wl[i];
176 delete [] wl;
179 static
180 #endif /* BUILD_AS_EXTERNAL_LEXER */
182 void ColouriseCamlDoc(
183 unsigned int startPos, int length,
184 int initStyle,
185 WordList *keywordlists[],
186 Accessor &styler)
188 // initialize styler
189 StyleContext sc(startPos, length, initStyle, styler);
191 int chBase = 0, chToken = 0, chLit = 0;
192 WordList& keywords = *keywordlists[0];
193 WordList& keywords2 = *keywordlists[1];
194 WordList& keywords3 = *keywordlists[2];
195 const bool isSML = keywords.InList("andalso");
196 const int useMagic = styler.GetPropertyInt("lexer.caml.magic", 0);
198 // set up [initial] state info (terminating states that shouldn't "bleed")
199 const int state_ = sc.state & 0x0f;
200 if (state_ <= SCE_CAML_CHAR
201 || (isSML && state_ == SCE_CAML_STRING))
202 sc.state = SCE_CAML_DEFAULT;
203 int nesting = (state_ >= SCE_CAML_COMMENT)? (state_ - SCE_CAML_COMMENT): 0;
205 // foreach char in range...
206 while (sc.More()) {
207 // set up [per-char] state info
208 int state2 = -1; // (ASSUME no state change)
209 int chColor = sc.currentPos - 1;// (ASSUME standard coloring range)
210 bool advance = true; // (ASSUME scanner "eats" 1 char)
212 // step state machine
213 switch (sc.state & 0x0f) {
214 case SCE_CAML_DEFAULT:
215 chToken = sc.currentPos; // save [possible] token start (JIC)
216 // it's wide open; what do we have?
217 if (iscamlf(sc.ch))
218 state2 = SCE_CAML_IDENTIFIER;
219 else if (!isSML && sc.Match('`') && iscamlf(sc.chNext))
220 state2 = SCE_CAML_TAGNAME;
221 else if (!isSML && sc.Match('#') && isdigit(sc.chNext))
222 state2 = SCE_CAML_LINENUM;
223 else if (isdigit(sc.ch)) {
224 // it's a number, assume base 10
225 state2 = SCE_CAML_NUMBER, chBase = 10;
226 if (sc.Match('0')) {
227 // there MAY be a base specified...
228 const char* baseC = "bBoOxX";
229 if (isSML) {
230 if (sc.chNext == 'w')
231 sc.Forward(); // (consume SML "word" indicator)
232 baseC = "x";
234 // ... change to specified base AS REQUIRED
235 if (strchr(baseC, sc.chNext))
236 chBase = baseT[tolower(sc.chNext) - 'a'], sc.Forward();
238 } else if (!isSML && sc.Match('\'')) // (Caml char literal?)
239 state2 = SCE_CAML_CHAR, chLit = 0;
240 else if (isSML && sc.Match('#', '"')) // (SML char literal?)
241 state2 = SCE_CAML_CHAR, sc.Forward();
242 else if (sc.Match('"'))
243 state2 = SCE_CAML_STRING;
244 else if (sc.Match('(', '*'))
245 state2 = SCE_CAML_COMMENT, sc.Forward(), sc.ch = ' '; // (*)...
246 else if (strchr("!?~" /* Caml "prefix-symbol" */
247 "=<>@^|&+-*/$%" /* Caml "infix-symbol" */
248 "()[]{};,:.#", sc.ch) // Caml "bracket" or ;,:.#
249 // SML "extra" ident chars
250 || (isSML && (sc.Match('\\') || sc.Match('`'))))
251 state2 = SCE_CAML_OPERATOR;
252 break;
254 case SCE_CAML_IDENTIFIER:
255 // [try to] interpret as [additional] identifier char
256 if (!(iscaml(sc.ch) || sc.Match('\''))) {
257 const int n = sc.currentPos - chToken;
258 if (n < 24) {
259 // length is believable as keyword, [re-]construct token
260 char t[24];
261 for (int i = -n; i < 0; i++)
262 t[n + i] = static_cast<char>(sc.GetRelative(i));
263 t[n] = '\0';
264 // special-case "_" token as KEYWORD
265 if ((n == 1 && sc.chPrev == '_') || keywords.InList(t))
266 sc.ChangeState(SCE_CAML_KEYWORD);
267 else if (keywords2.InList(t))
268 sc.ChangeState(SCE_CAML_KEYWORD2);
269 else if (keywords3.InList(t))
270 sc.ChangeState(SCE_CAML_KEYWORD3);
272 state2 = SCE_CAML_DEFAULT, advance = false;
274 break;
276 case SCE_CAML_TAGNAME:
277 // [try to] interpret as [additional] tagname char
278 if (!(iscaml(sc.ch) || sc.Match('\'')))
279 state2 = SCE_CAML_DEFAULT, advance = false;
280 break;
282 /*case SCE_CAML_KEYWORD:
283 case SCE_CAML_KEYWORD2:
284 case SCE_CAML_KEYWORD3:
285 // [try to] interpret as [additional] keyword char
286 if (!iscaml(ch))
287 state2 = SCE_CAML_DEFAULT, advance = false;
288 break;*/
290 case SCE_CAML_LINENUM:
291 // [try to] interpret as [additional] linenum directive char
292 if (!isdigit(sc.ch))
293 state2 = SCE_CAML_DEFAULT, advance = false;
294 break;
296 case SCE_CAML_OPERATOR: {
297 // [try to] interpret as [additional] operator char
298 const char* o = 0;
299 if (iscaml(sc.ch) || isspace(sc.ch) // ident or whitespace
300 || (o = strchr(")]};,\'\"#", sc.ch),o) // "termination" chars
301 || (!isSML && sc.Match('`')) // Caml extra term char
302 || (!strchr("!$%&*+-./:<=>?@^|~", sc.ch)// "operator" chars
303 // SML extra ident chars
304 && !(isSML && (sc.Match('\\') || sc.Match('`'))))) {
305 // check for INCLUSIVE termination
306 if (o && strchr(")]};,", sc.ch)) {
307 if ((sc.Match(')') && sc.chPrev == '(')
308 || (sc.Match(']') && sc.chPrev == '['))
309 // special-case "()" and "[]" tokens as KEYWORDS
310 sc.ChangeState(SCE_CAML_KEYWORD);
311 chColor++;
312 } else
313 advance = false;
314 state2 = SCE_CAML_DEFAULT;
316 break;
319 case SCE_CAML_NUMBER:
320 // [try to] interpret as [additional] numeric literal char
321 if ((!isSML && sc.Match('_')) || IsADigit(sc.ch, chBase))
322 break;
323 // how about an integer suffix?
324 if (!isSML && (sc.Match('l') || sc.Match('L') || sc.Match('n'))
325 && (sc.chPrev == '_' || IsADigit(sc.chPrev, chBase)))
326 break;
327 // or a floating-point literal?
328 if (chBase == 10) {
329 // with a decimal point?
330 if (sc.Match('.')
331 && ((!isSML && sc.chPrev == '_')
332 || IsADigit(sc.chPrev, chBase)))
333 break;
334 // with an exponent? (I)
335 if ((sc.Match('e') || sc.Match('E'))
336 && ((!isSML && (sc.chPrev == '.' || sc.chPrev == '_'))
337 || IsADigit(sc.chPrev, chBase)))
338 break;
339 // with an exponent? (II)
340 if (((!isSML && (sc.Match('+') || sc.Match('-')))
341 || (isSML && sc.Match('~')))
342 && (sc.chPrev == 'e' || sc.chPrev == 'E'))
343 break;
345 // it looks like we have run out of number
346 state2 = SCE_CAML_DEFAULT, advance = false;
347 break;
349 case SCE_CAML_CHAR:
350 if (!isSML) {
351 // [try to] interpret as [additional] char literal char
352 if (sc.Match('\\')) {
353 chLit = 1; // (definitely IS a char literal)
354 if (sc.chPrev == '\\')
355 sc.ch = ' '; // (...\\')
356 // should we be terminating - one way or another?
357 } else if ((sc.Match('\'') && sc.chPrev != '\\')
358 || sc.atLineEnd) {
359 state2 = SCE_CAML_DEFAULT;
360 if (sc.Match('\''))
361 chColor++;
362 else
363 sc.ChangeState(SCE_CAML_IDENTIFIER);
364 // ... maybe a char literal, maybe not
365 } else if (chLit < 1 && sc.currentPos - chToken >= 2)
366 sc.ChangeState(SCE_CAML_IDENTIFIER), advance = false;
367 break;
368 }/* else
369 // fall through for SML char literal (handle like string) */
371 case SCE_CAML_STRING:
372 // [try to] interpret as [additional] [SML char/] string literal char
373 if (isSML && sc.Match('\\') && sc.chPrev != '\\' && isspace(sc.chNext))
374 state2 = SCE_CAML_WHITE;
375 else if (sc.Match('\\') && sc.chPrev == '\\')
376 sc.ch = ' '; // (...\\")
377 // should we be terminating - one way or another?
378 else if ((sc.Match('"') && sc.chPrev != '\\')
379 || (isSML && sc.atLineEnd)) {
380 state2 = SCE_CAML_DEFAULT;
381 if (sc.Match('"'))
382 chColor++;
384 break;
386 case SCE_CAML_WHITE:
387 // [try to] interpret as [additional] SML embedded whitespace char
388 if (sc.Match('\\')) {
389 // style this puppy NOW...
390 state2 = SCE_CAML_STRING, sc.ch = ' ' /* (...\") */, chColor++,
391 styler.ColourTo(chColor, SCE_CAML_WHITE), styler.Flush();
392 // ... then backtrack to determine original SML literal type
393 int p = chColor - 2;
394 for (; p >= 0 && styler.StyleAt(p) == SCE_CAML_WHITE; p--) ;
395 if (p >= 0)
396 state2 = static_cast<int>(styler.StyleAt(p));
397 // take care of state change NOW
398 sc.ChangeState(state2), state2 = -1;
400 break;
402 case SCE_CAML_COMMENT:
403 case SCE_CAML_COMMENT1:
404 case SCE_CAML_COMMENT2:
405 case SCE_CAML_COMMENT3:
406 // we're IN a comment - does this start a NESTED comment?
407 if (sc.Match('(', '*'))
408 state2 = sc.state + 1, chToken = sc.currentPos,
409 sc.Forward(), sc.ch = ' ' /* (*)... */, nesting++;
410 // [try to] interpret as [additional] comment char
411 else if (sc.Match(')') && sc.chPrev == '*') {
412 if (nesting)
413 state2 = (sc.state & 0x0f) - 1, chToken = 0, nesting--;
414 else
415 state2 = SCE_CAML_DEFAULT;
416 chColor++;
417 // enable "magic" (read-only) comment AS REQUIRED
418 } else if (useMagic && sc.currentPos - chToken == 4
419 && sc.Match('c') && sc.chPrev == 'r' && sc.GetRelative(-2) == '@')
420 sc.state |= 0x10; // (switch to read-only comment style)
421 break;
424 // handle state change and char coloring AS REQUIRED
425 if (state2 >= 0)
426 styler.ColourTo(chColor, sc.state), sc.ChangeState(state2);
427 // move to next char UNLESS re-scanning current char
428 if (advance)
429 sc.Forward();
432 // do any required terminal char coloring (JIC)
433 sc.Complete();
436 #ifdef BUILD_AS_EXTERNAL_LEXER
437 static
438 #endif /* BUILD_AS_EXTERNAL_LEXER */
439 void FoldCamlDoc(
440 unsigned int, int,
441 int,
442 WordList *[],
443 Accessor &)
447 static const char * const camlWordListDesc[] = {
448 "Keywords", // primary Objective Caml keywords
449 "Keywords2", // "optional" keywords (typically from Pervasives)
450 "Keywords3", // "optional" keywords (typically typenames)
454 #ifndef BUILD_AS_EXTERNAL_LEXER
455 LexerModule lmCaml(SCLEX_CAML, ColouriseCamlDoc, "caml", FoldCamlDoc, camlWordListDesc);
456 #endif /* BUILD_AS_EXTERNAL_LEXER */