Merge pull request #629 from kugel-/pluxy
[geany-mirror.git] / scintilla / lexers / LexCaml.cxx
blobdf87815419d6186ea23636fcdc9912b80f2dc6dd
1 // Scintilla source code edit control
2 /** @file LexCaml.cxx
3 ** Lexer for Objective Caml.
4 **/
5 // Copyright 2005-2009 by Robert Roessler <robertr@rftp.com>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 /* Release History
8 20050204 Initial release.
9 20050205 Quick compiler standards/"cleanliness" adjustment.
10 20050206 Added cast for IsLeadByte().
11 20050209 Changes to "external" build support.
12 20050306 Fix for 1st-char-in-doc "corner" case.
13 20050502 Fix for [harmless] one-past-the-end coloring.
14 20050515 Refined numeric token recognition logic.
15 20051125 Added 2nd "optional" keywords class.
16 20051129 Support "magic" (read-only) comments for RCaml.
17 20051204 Swtich to using StyleContext infrastructure.
18 20090629 Add full Standard ML '97 support.
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdio.h>
24 #include <stdarg.h>
25 #include <assert.h>
26 #include <ctype.h>
28 #include "ILexer.h"
29 #include "Scintilla.h"
30 #include "SciLexer.h"
32 #include "PropSetSimple.h"
33 #include "WordList.h"
34 #include "LexAccessor.h"
35 #include "Accessor.h"
36 #include "StyleContext.h"
37 #include "CharacterSet.h"
38 #include "LexerModule.h"
40 // Since the Microsoft __iscsym[f] funcs are not ANSI...
41 inline int iscaml(int c) {return isalnum(c) || c == '_';}
42 inline int iscamlf(int c) {return isalpha(c) || c == '_';}
44 static const int baseT[24] = {
45 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A - L */
46 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,16 /* M - X */
49 #ifdef SCI_NAMESPACE
50 using namespace Scintilla;
51 #endif
53 #ifdef BUILD_AS_EXTERNAL_LEXER
55 (actually seems to work!)
57 #include <string>
58 #include "WindowAccessor.h"
59 #include "ExternalLexer.h"
61 #undef EXT_LEXER_DECL
62 #define EXT_LEXER_DECL __declspec( dllexport ) __stdcall
64 #if PLAT_WIN
65 #include <windows.h>
66 #endif
68 static void ColouriseCamlDoc(
69 Sci_PositionU startPos, Sci_Position length,
70 int initStyle,
71 WordList *keywordlists[],
72 Accessor &styler);
74 static void FoldCamlDoc(
75 Sci_PositionU startPos, Sci_Position length,
76 int initStyle,
77 WordList *keywordlists[],
78 Accessor &styler);
80 static void InternalLexOrFold(int lexOrFold, Sci_PositionU startPos, Sci_Position length,
81 int initStyle, char *words[], WindowID window, char *props);
83 static const char* LexerName = "caml";
85 #ifdef TRACE
86 void Platform::DebugPrintf(const char *format, ...) {
87 char buffer[2000];
88 va_list pArguments;
89 va_start(pArguments, format);
90 vsprintf(buffer,format,pArguments);
91 va_end(pArguments);
92 Platform::DebugDisplay(buffer);
94 #else
95 void Platform::DebugPrintf(const char *, ...) {
97 #endif
99 bool Platform::IsDBCSLeadByte(int codePage, char ch) {
100 return ::IsDBCSLeadByteEx(codePage, ch) != 0;
103 long Platform::SendScintilla(WindowID w, unsigned int msg, unsigned long wParam, long lParam) {
104 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam, lParam);
107 long Platform::SendScintillaPointer(WindowID w, unsigned int msg, unsigned long wParam, void *lParam) {
108 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam,
109 reinterpret_cast<LPARAM>(lParam));
112 void EXT_LEXER_DECL Fold(unsigned int lexer, Sci_PositionU startPos, Sci_Position length,
113 int initStyle, char *words[], WindowID window, char *props)
115 // below useless evaluation(s) to supress "not used" warnings
116 lexer;
117 // build expected data structures and do the Fold
118 InternalLexOrFold(1, startPos, length, initStyle, words, window, props);
122 int EXT_LEXER_DECL GetLexerCount()
124 return 1; // just us [Objective] Caml lexers here!
127 void EXT_LEXER_DECL GetLexerName(unsigned int Index, char *name, int buflength)
129 // below useless evaluation(s) to supress "not used" warnings
130 Index;
131 // return as much of our lexer name as will fit (what's up with Index?)
132 if (buflength > 0) {
133 buflength--;
134 int n = strlen(LexerName);
135 if (n > buflength)
136 n = buflength;
137 memcpy(name, LexerName, n), name[n] = '\0';
141 void EXT_LEXER_DECL Lex(unsigned int lexer, Sci_PositionU startPos, Sci_Position length,
142 int initStyle, char *words[], WindowID window, char *props)
144 // below useless evaluation(s) to supress "not used" warnings
145 lexer;
146 // build expected data structures and do the Lex
147 InternalLexOrFold(0, startPos, length, initStyle, words, window, props);
150 static void InternalLexOrFold(int foldOrLex, Sci_PositionU startPos, Sci_Position length,
151 int initStyle, char *words[], WindowID window, char *props)
153 // create and initialize a WindowAccessor (including contained PropSet)
154 PropSetSimple ps;
155 ps.SetMultiple(props);
156 WindowAccessor wa(window, ps);
157 // create and initialize WordList(s)
158 int nWL = 0;
159 for (; words[nWL]; nWL++) ; // count # of WordList PTRs needed
160 WordList** wl = new WordList* [nWL + 1];// alloc WordList PTRs
161 int i = 0;
162 for (; i < nWL; i++) {
163 wl[i] = new WordList(); // (works or THROWS bad_alloc EXCEPTION)
164 wl[i]->Set(words[i]);
166 wl[i] = 0;
167 // call our "internal" folder/lexer (... then do Flush!)
168 if (foldOrLex)
169 FoldCamlDoc(startPos, length, initStyle, wl, wa);
170 else
171 ColouriseCamlDoc(startPos, length, initStyle, wl, wa);
172 wa.Flush();
173 // clean up before leaving
174 for (i = nWL - 1; i >= 0; i--)
175 delete wl[i];
176 delete [] wl;
179 static
180 #endif /* BUILD_AS_EXTERNAL_LEXER */
182 void ColouriseCamlDoc(
183 Sci_PositionU startPos, Sci_Position length,
184 int initStyle,
185 WordList *keywordlists[],
186 Accessor &styler)
188 // initialize styler
189 StyleContext sc(startPos, length, initStyle, styler);
191 Sci_PositionU chToken = 0;
192 int chBase = 0, chLit = 0;
193 WordList& keywords = *keywordlists[0];
194 WordList& keywords2 = *keywordlists[1];
195 WordList& keywords3 = *keywordlists[2];
196 const bool isSML = keywords.InList("andalso");
197 const int useMagic = styler.GetPropertyInt("lexer.caml.magic", 0);
199 // set up [initial] state info (terminating states that shouldn't "bleed")
200 const int state_ = sc.state & 0x0f;
201 if (state_ <= SCE_CAML_CHAR
202 || (isSML && state_ == SCE_CAML_STRING))
203 sc.state = SCE_CAML_DEFAULT;
204 int nesting = (state_ >= SCE_CAML_COMMENT)? (state_ - SCE_CAML_COMMENT): 0;
206 // foreach char in range...
207 while (sc.More()) {
208 // set up [per-char] state info
209 int state2 = -1; // (ASSUME no state change)
210 Sci_Position chColor = sc.currentPos - 1;// (ASSUME standard coloring range)
211 bool advance = true; // (ASSUME scanner "eats" 1 char)
213 // step state machine
214 switch (sc.state & 0x0f) {
215 case SCE_CAML_DEFAULT:
216 chToken = sc.currentPos; // save [possible] token start (JIC)
217 // it's wide open; what do we have?
218 if (iscamlf(sc.ch))
219 state2 = SCE_CAML_IDENTIFIER;
220 else if (!isSML && sc.Match('`') && iscamlf(sc.chNext))
221 state2 = SCE_CAML_TAGNAME;
222 else if (!isSML && sc.Match('#') && isdigit(sc.chNext))
223 state2 = SCE_CAML_LINENUM;
224 else if (isdigit(sc.ch)) {
225 // it's a number, assume base 10
226 state2 = SCE_CAML_NUMBER, chBase = 10;
227 if (sc.Match('0')) {
228 // there MAY be a base specified...
229 const char* baseC = "bBoOxX";
230 if (isSML) {
231 if (sc.chNext == 'w')
232 sc.Forward(); // (consume SML "word" indicator)
233 baseC = "x";
235 // ... change to specified base AS REQUIRED
236 if (strchr(baseC, sc.chNext))
237 chBase = baseT[tolower(sc.chNext) - 'a'], sc.Forward();
239 } else if (!isSML && sc.Match('\'')) // (Caml char literal?)
240 state2 = SCE_CAML_CHAR, chLit = 0;
241 else if (isSML && sc.Match('#', '"')) // (SML char literal?)
242 state2 = SCE_CAML_CHAR, sc.Forward();
243 else if (sc.Match('"'))
244 state2 = SCE_CAML_STRING;
245 else if (sc.Match('(', '*'))
246 state2 = SCE_CAML_COMMENT, sc.Forward(), sc.ch = ' '; // (*)...
247 else if (strchr("!?~" /* Caml "prefix-symbol" */
248 "=<>@^|&+-*/$%" /* Caml "infix-symbol" */
249 "()[]{};,:.#", sc.ch) // Caml "bracket" or ;,:.#
250 // SML "extra" ident chars
251 || (isSML && (sc.Match('\\') || sc.Match('`'))))
252 state2 = SCE_CAML_OPERATOR;
253 break;
255 case SCE_CAML_IDENTIFIER:
256 // [try to] interpret as [additional] identifier char
257 if (!(iscaml(sc.ch) || sc.Match('\''))) {
258 const Sci_Position n = sc.currentPos - chToken;
259 if (n < 24) {
260 // length is believable as keyword, [re-]construct token
261 char t[24];
262 for (Sci_Position i = -n; i < 0; i++)
263 t[n + i] = static_cast<char>(sc.GetRelative(i));
264 t[n] = '\0';
265 // special-case "_" token as KEYWORD
266 if ((n == 1 && sc.chPrev == '_') || keywords.InList(t))
267 sc.ChangeState(SCE_CAML_KEYWORD);
268 else if (keywords2.InList(t))
269 sc.ChangeState(SCE_CAML_KEYWORD2);
270 else if (keywords3.InList(t))
271 sc.ChangeState(SCE_CAML_KEYWORD3);
273 state2 = SCE_CAML_DEFAULT, advance = false;
275 break;
277 case SCE_CAML_TAGNAME:
278 // [try to] interpret as [additional] tagname char
279 if (!(iscaml(sc.ch) || sc.Match('\'')))
280 state2 = SCE_CAML_DEFAULT, advance = false;
281 break;
283 /*case SCE_CAML_KEYWORD:
284 case SCE_CAML_KEYWORD2:
285 case SCE_CAML_KEYWORD3:
286 // [try to] interpret as [additional] keyword char
287 if (!iscaml(ch))
288 state2 = SCE_CAML_DEFAULT, advance = false;
289 break;*/
291 case SCE_CAML_LINENUM:
292 // [try to] interpret as [additional] linenum directive char
293 if (!isdigit(sc.ch))
294 state2 = SCE_CAML_DEFAULT, advance = false;
295 break;
297 case SCE_CAML_OPERATOR: {
298 // [try to] interpret as [additional] operator char
299 const char* o = 0;
300 if (iscaml(sc.ch) || isspace(sc.ch) // ident or whitespace
301 || (o = strchr(")]};,\'\"#", sc.ch),o) // "termination" chars
302 || (!isSML && sc.Match('`')) // Caml extra term char
303 || (!strchr("!$%&*+-./:<=>?@^|~", sc.ch)// "operator" chars
304 // SML extra ident chars
305 && !(isSML && (sc.Match('\\') || sc.Match('`'))))) {
306 // check for INCLUSIVE termination
307 if (o && strchr(")]};,", sc.ch)) {
308 if ((sc.Match(')') && sc.chPrev == '(')
309 || (sc.Match(']') && sc.chPrev == '['))
310 // special-case "()" and "[]" tokens as KEYWORDS
311 sc.ChangeState(SCE_CAML_KEYWORD);
312 chColor++;
313 } else
314 advance = false;
315 state2 = SCE_CAML_DEFAULT;
317 break;
320 case SCE_CAML_NUMBER:
321 // [try to] interpret as [additional] numeric literal char
322 if ((!isSML && sc.Match('_')) || IsADigit(sc.ch, chBase))
323 break;
324 // how about an integer suffix?
325 if (!isSML && (sc.Match('l') || sc.Match('L') || sc.Match('n'))
326 && (sc.chPrev == '_' || IsADigit(sc.chPrev, chBase)))
327 break;
328 // or a floating-point literal?
329 if (chBase == 10) {
330 // with a decimal point?
331 if (sc.Match('.')
332 && ((!isSML && sc.chPrev == '_')
333 || IsADigit(sc.chPrev, chBase)))
334 break;
335 // with an exponent? (I)
336 if ((sc.Match('e') || sc.Match('E'))
337 && ((!isSML && (sc.chPrev == '.' || sc.chPrev == '_'))
338 || IsADigit(sc.chPrev, chBase)))
339 break;
340 // with an exponent? (II)
341 if (((!isSML && (sc.Match('+') || sc.Match('-')))
342 || (isSML && sc.Match('~')))
343 && (sc.chPrev == 'e' || sc.chPrev == 'E'))
344 break;
346 // it looks like we have run out of number
347 state2 = SCE_CAML_DEFAULT, advance = false;
348 break;
350 case SCE_CAML_CHAR:
351 if (!isSML) {
352 // [try to] interpret as [additional] char literal char
353 if (sc.Match('\\')) {
354 chLit = 1; // (definitely IS a char literal)
355 if (sc.chPrev == '\\')
356 sc.ch = ' '; // (...\\')
357 // should we be terminating - one way or another?
358 } else if ((sc.Match('\'') && sc.chPrev != '\\')
359 || sc.atLineEnd) {
360 state2 = SCE_CAML_DEFAULT;
361 if (sc.Match('\''))
362 chColor++;
363 else
364 sc.ChangeState(SCE_CAML_IDENTIFIER);
365 // ... maybe a char literal, maybe not
366 } else if (chLit < 1 && sc.currentPos - chToken >= 2)
367 sc.ChangeState(SCE_CAML_IDENTIFIER), advance = false;
368 break;
369 }/* else
370 // fall through for SML char literal (handle like string) */
372 case SCE_CAML_STRING:
373 // [try to] interpret as [additional] [SML char/] string literal char
374 if (isSML && sc.Match('\\') && sc.chPrev != '\\' && isspace(sc.chNext))
375 state2 = SCE_CAML_WHITE;
376 else if (sc.Match('\\') && sc.chPrev == '\\')
377 sc.ch = ' '; // (...\\")
378 // should we be terminating - one way or another?
379 else if ((sc.Match('"') && sc.chPrev != '\\')
380 || (isSML && sc.atLineEnd)) {
381 state2 = SCE_CAML_DEFAULT;
382 if (sc.Match('"'))
383 chColor++;
385 break;
387 case SCE_CAML_WHITE:
388 // [try to] interpret as [additional] SML embedded whitespace char
389 if (sc.Match('\\')) {
390 // style this puppy NOW...
391 state2 = SCE_CAML_STRING, sc.ch = ' ' /* (...\") */, chColor++,
392 styler.ColourTo(chColor, SCE_CAML_WHITE), styler.Flush();
393 // ... then backtrack to determine original SML literal type
394 Sci_Position p = chColor - 2;
395 for (; p >= 0 && styler.StyleAt(p) == SCE_CAML_WHITE; p--) ;
396 if (p >= 0)
397 state2 = static_cast<int>(styler.StyleAt(p));
398 // take care of state change NOW
399 sc.ChangeState(state2), state2 = -1;
401 break;
403 case SCE_CAML_COMMENT:
404 case SCE_CAML_COMMENT1:
405 case SCE_CAML_COMMENT2:
406 case SCE_CAML_COMMENT3:
407 // we're IN a comment - does this start a NESTED comment?
408 if (sc.Match('(', '*'))
409 state2 = sc.state + 1, chToken = sc.currentPos,
410 sc.Forward(), sc.ch = ' ' /* (*)... */, nesting++;
411 // [try to] interpret as [additional] comment char
412 else if (sc.Match(')') && sc.chPrev == '*') {
413 if (nesting)
414 state2 = (sc.state & 0x0f) - 1, chToken = 0, nesting--;
415 else
416 state2 = SCE_CAML_DEFAULT;
417 chColor++;
418 // enable "magic" (read-only) comment AS REQUIRED
419 } else if (useMagic && sc.currentPos - chToken == 4
420 && sc.Match('c') && sc.chPrev == 'r' && sc.GetRelative(-2) == '@')
421 sc.state |= 0x10; // (switch to read-only comment style)
422 break;
425 // handle state change and char coloring AS REQUIRED
426 if (state2 >= 0)
427 styler.ColourTo(chColor, sc.state), sc.ChangeState(state2);
428 // move to next char UNLESS re-scanning current char
429 if (advance)
430 sc.Forward();
433 // do any required terminal char coloring (JIC)
434 sc.Complete();
437 #ifdef BUILD_AS_EXTERNAL_LEXER
438 static
439 #endif /* BUILD_AS_EXTERNAL_LEXER */
440 void FoldCamlDoc(
441 Sci_PositionU, Sci_Position,
442 int,
443 WordList *[],
444 Accessor &)
448 static const char * const camlWordListDesc[] = {
449 "Keywords", // primary Objective Caml keywords
450 "Keywords2", // "optional" keywords (typically from Pervasives)
451 "Keywords3", // "optional" keywords (typically typenames)
455 #ifndef BUILD_AS_EXTERNAL_LEXER
456 LexerModule lmCaml(SCLEX_CAML, ColouriseCamlDoc, "caml", FoldCamlDoc, camlWordListDesc);
457 #endif /* BUILD_AS_EXTERNAL_LEXER */