Fix indenting a snippet when there is whitespace after the
[geany-mirror.git] / scintilla / LexCaml.cxx
blob0d11622593880d9ef23480fbcb4a5af3af0e0a32
1 // Scintilla source code edit control
2 /** @file LexCaml.cxx
3 ** Lexer for Objective Caml.
4 **/
5 // Copyright 2005-2009 by Robert Roessler <robertr@rftp.com>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 /* Release History
8 20050204 Initial release.
9 20050205 Quick compiler standards/"cleanliness" adjustment.
10 20050206 Added cast for IsLeadByte().
11 20050209 Changes to "external" build support.
12 20050306 Fix for 1st-char-in-doc "corner" case.
13 20050502 Fix for [harmless] one-past-the-end coloring.
14 20050515 Refined numeric token recognition logic.
15 20051125 Added 2nd "optional" keywords class.
16 20051129 Support "magic" (read-only) comments for RCaml.
17 20051204 Swtich to using StyleContext infrastructure.
18 20090629 Add full Standard ML '97 support.
21 #include <stdlib.h>
22 #include <string.h>
23 #include <ctype.h>
24 #include <stdio.h>
25 #include <stdarg.h>
27 #include "Platform.h"
29 #include "PropSet.h"
30 #include "PropSetSimple.h"
31 #include "Accessor.h"
32 #include "StyleContext.h"
33 #include "KeyWords.h"
34 #include "Scintilla.h"
35 #include "SciLexer.h"
37 // Since the Microsoft __iscsym[f] funcs are not ANSI...
38 inline int iscaml(int c) {return isalnum(c) || c == '_';}
39 inline int iscamlf(int c) {return isalpha(c) || c == '_';}
41 static const int baseT[24] = {
42 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A - L */
43 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0,16 /* M - X */
46 #ifdef SCI_NAMESPACE
47 using namespace Scintilla;
48 #endif
50 #ifdef BUILD_AS_EXTERNAL_LEXER
52 (actually seems to work!)
54 #include "WindowAccessor.h"
55 #include "ExternalLexer.h"
57 #if PLAT_WIN
58 #include <windows.h>
59 #endif
61 static void ColouriseCamlDoc(
62 unsigned int startPos, int length,
63 int initStyle,
64 WordList *keywordlists[],
65 Accessor &styler);
67 static void FoldCamlDoc(
68 unsigned int startPos, int length,
69 int initStyle,
70 WordList *keywordlists[],
71 Accessor &styler);
73 static void InternalLexOrFold(int lexOrFold, unsigned int startPos, int length,
74 int initStyle, char *words[], WindowID window, char *props);
76 static const char* LexerName = "caml";
78 #ifdef TRACE
79 void Platform::DebugPrintf(const char *format, ...) {
80 char buffer[2000];
81 va_list pArguments;
82 va_start(pArguments, format);
83 vsprintf(buffer,format,pArguments);
84 va_end(pArguments);
85 Platform::DebugDisplay(buffer);
87 #else
88 void Platform::DebugPrintf(const char *, ...) {
90 #endif
92 bool Platform::IsDBCSLeadByte(int codePage, char ch) {
93 return ::IsDBCSLeadByteEx(codePage, ch) != 0;
96 long Platform::SendScintilla(WindowID w, unsigned int msg, unsigned long wParam, long lParam) {
97 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam, lParam);
100 long Platform::SendScintillaPointer(WindowID w, unsigned int msg, unsigned long wParam, void *lParam) {
101 return ::SendMessage(reinterpret_cast<HWND>(w), msg, wParam,
102 reinterpret_cast<LPARAM>(lParam));
105 void EXT_LEXER_DECL Fold(unsigned int lexer, unsigned int startPos, int length,
106 int initStyle, char *words[], WindowID window, char *props)
108 // below useless evaluation(s) to supress "not used" warnings
109 lexer;
110 // build expected data structures and do the Fold
111 InternalLexOrFold(1, startPos, length, initStyle, words, window, props);
115 int EXT_LEXER_DECL GetLexerCount()
117 return 1; // just us [Objective] Caml lexers here!
120 void EXT_LEXER_DECL GetLexerName(unsigned int Index, char *name, int buflength)
122 // below useless evaluation(s) to supress "not used" warnings
123 Index;
124 // return as much of our lexer name as will fit (what's up with Index?)
125 if (buflength > 0) {
126 buflength--;
127 int n = strlen(LexerName);
128 if (n > buflength)
129 n = buflength;
130 memcpy(name, LexerName, n), name[n] = '\0';
134 void EXT_LEXER_DECL Lex(unsigned int lexer, unsigned int startPos, int length,
135 int initStyle, char *words[], WindowID window, char *props)
137 // below useless evaluation(s) to supress "not used" warnings
138 lexer;
139 // build expected data structures and do the Lex
140 InternalLexOrFold(0, startPos, length, initStyle, words, window, props);
143 static void InternalLexOrFold(int foldOrLex, unsigned int startPos, int length,
144 int initStyle, char *words[], WindowID window, char *props)
146 // create and initialize a WindowAccessor (including contained PropSet)
147 PropSetSimple ps;
148 ps.SetMultiple(props);
149 WindowAccessor wa(window, ps);
150 // create and initialize WordList(s)
151 int nWL = 0;
152 for (; words[nWL]; nWL++) ; // count # of WordList PTRs needed
153 WordList** wl = new WordList* [nWL + 1];// alloc WordList PTRs
154 int i = 0;
155 for (; i < nWL; i++) {
156 wl[i] = new WordList(); // (works or THROWS bad_alloc EXCEPTION)
157 wl[i]->Set(words[i]);
159 wl[i] = 0;
160 // call our "internal" folder/lexer (... then do Flush!)
161 if (foldOrLex)
162 FoldCamlDoc(startPos, length, initStyle, wl, wa);
163 else
164 ColouriseCamlDoc(startPos, length, initStyle, wl, wa);
165 wa.Flush();
166 // clean up before leaving
167 for (i = nWL - 1; i >= 0; i--)
168 delete wl[i];
169 delete [] wl;
172 static
173 #endif /* BUILD_AS_EXTERNAL_LEXER */
175 void ColouriseCamlDoc(
176 unsigned int startPos, int length,
177 int initStyle,
178 WordList *keywordlists[],
179 Accessor &styler)
181 // initialize styler
182 StyleContext sc(startPos, length, initStyle, styler);
184 int chBase = 0, chToken = 0, chLit = 0;
185 WordList& keywords = *keywordlists[0];
186 WordList& keywords2 = *keywordlists[1];
187 WordList& keywords3 = *keywordlists[2];
188 const bool isSML = keywords.InList("andalso");
189 const int useMagic = styler.GetPropertyInt("lexer.caml.magic", 0);
191 // set up [initial] state info (terminating states that shouldn't "bleed")
192 const int state_ = sc.state & 0x0f;
193 if (state_ <= SCE_CAML_CHAR
194 || (isSML && state_ == SCE_CAML_STRING))
195 sc.state = SCE_CAML_DEFAULT;
196 int nesting = (state_ >= SCE_CAML_COMMENT)? (state_ - SCE_CAML_COMMENT): 0;
198 // foreach char in range...
199 while (sc.More()) {
200 // set up [per-char] state info
201 int state2 = -1; // (ASSUME no state change)
202 int chColor = sc.currentPos - 1;// (ASSUME standard coloring range)
203 bool advance = true; // (ASSUME scanner "eats" 1 char)
205 // step state machine
206 switch (sc.state & 0x0f) {
207 case SCE_CAML_DEFAULT:
208 chToken = sc.currentPos; // save [possible] token start (JIC)
209 // it's wide open; what do we have?
210 if (iscamlf(sc.ch))
211 state2 = SCE_CAML_IDENTIFIER;
212 else if (!isSML && sc.Match('`') && iscamlf(sc.chNext))
213 state2 = SCE_CAML_TAGNAME;
214 else if (!isSML && sc.Match('#') && isdigit(sc.chNext))
215 state2 = SCE_CAML_LINENUM;
216 else if (isdigit(sc.ch)) {
217 // it's a number, assume base 10
218 state2 = SCE_CAML_NUMBER, chBase = 10;
219 if (sc.Match('0')) {
220 // there MAY be a base specified...
221 const char* baseC = "bBoOxX";
222 if (isSML) {
223 if (sc.chNext == 'w')
224 sc.Forward(); // (consume SML "word" indicator)
225 baseC = "x";
227 // ... change to specified base AS REQUIRED
228 if (strchr(baseC, sc.chNext))
229 chBase = baseT[tolower(sc.chNext) - 'a'], sc.Forward();
231 } else if (!isSML && sc.Match('\'')) // (Caml char literal?)
232 state2 = SCE_CAML_CHAR, chLit = 0;
233 else if (isSML && sc.Match('#', '"')) // (SML char literal?)
234 state2 = SCE_CAML_CHAR, sc.Forward();
235 else if (sc.Match('"'))
236 state2 = SCE_CAML_STRING;
237 else if (sc.Match('(', '*'))
238 state2 = SCE_CAML_COMMENT, sc.Forward(), sc.ch = ' '; // (*)...
239 else if (strchr("!?~" /* Caml "prefix-symbol" */
240 "=<>@^|&+-*/$%" /* Caml "infix-symbol" */
241 "()[]{};,:.#", sc.ch) // Caml "bracket" or ;,:.#
242 // SML "extra" ident chars
243 || (isSML && (sc.Match('\\') || sc.Match('`'))))
244 state2 = SCE_CAML_OPERATOR;
245 break;
247 case SCE_CAML_IDENTIFIER:
248 // [try to] interpret as [additional] identifier char
249 if (!(iscaml(sc.ch) || sc.Match('\''))) {
250 const int n = sc.currentPos - chToken;
251 if (n < 24) {
252 // length is believable as keyword, [re-]construct token
253 char t[24];
254 for (int i = -n; i < 0; i++)
255 t[n + i] = static_cast<char>(sc.GetRelative(i));
256 t[n] = '\0';
257 // special-case "_" token as KEYWORD
258 if ((n == 1 && sc.chPrev == '_') || keywords.InList(t))
259 sc.ChangeState(SCE_CAML_KEYWORD);
260 else if (keywords2.InList(t))
261 sc.ChangeState(SCE_CAML_KEYWORD2);
262 else if (keywords3.InList(t))
263 sc.ChangeState(SCE_CAML_KEYWORD3);
265 state2 = SCE_CAML_DEFAULT, advance = false;
267 break;
269 case SCE_CAML_TAGNAME:
270 // [try to] interpret as [additional] tagname char
271 if (!(iscaml(sc.ch) || sc.Match('\'')))
272 state2 = SCE_CAML_DEFAULT, advance = false;
273 break;
275 /*case SCE_CAML_KEYWORD:
276 case SCE_CAML_KEYWORD2:
277 case SCE_CAML_KEYWORD3:
278 // [try to] interpret as [additional] keyword char
279 if (!iscaml(ch))
280 state2 = SCE_CAML_DEFAULT, advance = false;
281 break;*/
283 case SCE_CAML_LINENUM:
284 // [try to] interpret as [additional] linenum directive char
285 if (!isdigit(sc.ch))
286 state2 = SCE_CAML_DEFAULT, advance = false;
287 break;
289 case SCE_CAML_OPERATOR: {
290 // [try to] interpret as [additional] operator char
291 const char* o = 0;
292 if (iscaml(sc.ch) || isspace(sc.ch) // ident or whitespace
293 || (o = strchr(")]};,\'\"#", sc.ch),o) // "termination" chars
294 || (!isSML && sc.Match('`')) // Caml extra term char
295 || (!strchr("!$%&*+-./:<=>?@^|~", sc.ch)// "operator" chars
296 // SML extra ident chars
297 && !(isSML && (sc.Match('\\') || sc.Match('`'))))) {
298 // check for INCLUSIVE termination
299 if (o && strchr(")]};,", sc.ch)) {
300 if ((sc.Match(')') && sc.chPrev == '(')
301 || (sc.Match(']') && sc.chPrev == '['))
302 // special-case "()" and "[]" tokens as KEYWORDS
303 sc.ChangeState(SCE_CAML_KEYWORD);
304 chColor++;
305 } else
306 advance = false;
307 state2 = SCE_CAML_DEFAULT;
309 break;
312 case SCE_CAML_NUMBER:
313 // [try to] interpret as [additional] numeric literal char
314 if ((!isSML && sc.Match('_')) || IsADigit(sc.ch, chBase))
315 break;
316 // how about an integer suffix?
317 if (!isSML && (sc.Match('l') || sc.Match('L') || sc.Match('n'))
318 && (sc.chPrev == '_' || IsADigit(sc.chPrev, chBase)))
319 break;
320 // or a floating-point literal?
321 if (chBase == 10) {
322 // with a decimal point?
323 if (sc.Match('.')
324 && ((!isSML && sc.chPrev == '_')
325 || IsADigit(sc.chPrev, chBase)))
326 break;
327 // with an exponent? (I)
328 if ((sc.Match('e') || sc.Match('E'))
329 && ((!isSML && (sc.chPrev == '.' || sc.chPrev == '_'))
330 || IsADigit(sc.chPrev, chBase)))
331 break;
332 // with an exponent? (II)
333 if (((!isSML && (sc.Match('+') || sc.Match('-')))
334 || (isSML && sc.Match('~')))
335 && (sc.chPrev == 'e' || sc.chPrev == 'E'))
336 break;
338 // it looks like we have run out of number
339 state2 = SCE_CAML_DEFAULT, advance = false;
340 break;
342 case SCE_CAML_CHAR:
343 if (!isSML) {
344 // [try to] interpret as [additional] char literal char
345 if (sc.Match('\\')) {
346 chLit = 1; // (definitely IS a char literal)
347 if (sc.chPrev == '\\')
348 sc.ch = ' '; // (...\\')
349 // should we be terminating - one way or another?
350 } else if ((sc.Match('\'') && sc.chPrev != '\\')
351 || sc.atLineEnd) {
352 state2 = SCE_CAML_DEFAULT;
353 if (sc.Match('\''))
354 chColor++;
355 else
356 sc.ChangeState(SCE_CAML_IDENTIFIER);
357 // ... maybe a char literal, maybe not
358 } else if (chLit < 1 && sc.currentPos - chToken >= 2)
359 sc.ChangeState(SCE_CAML_IDENTIFIER), advance = false;
360 break;
361 }/* else
362 // fall through for SML char literal (handle like string) */
364 case SCE_CAML_STRING:
365 // [try to] interpret as [additional] [SML char/] string literal char
366 if (isSML && sc.Match('\\') && sc.chPrev != '\\' && isspace(sc.chNext))
367 state2 = SCE_CAML_WHITE;
368 else if (sc.Match('\\') && sc.chPrev == '\\')
369 sc.ch = ' '; // (...\\")
370 // should we be terminating - one way or another?
371 else if ((sc.Match('"') && sc.chPrev != '\\')
372 || (isSML && sc.atLineEnd)) {
373 state2 = SCE_CAML_DEFAULT;
374 if (sc.Match('"'))
375 chColor++;
377 break;
379 case SCE_CAML_WHITE:
380 // [try to] interpret as [additional] SML embedded whitespace char
381 if (sc.Match('\\')) {
382 // style this puppy NOW...
383 state2 = SCE_CAML_STRING, sc.ch = ' ' /* (...\") */, chColor++,
384 styler.ColourTo(chColor, SCE_CAML_WHITE), styler.Flush();
385 // ... then backtrack to determine original SML literal type
386 int p = chColor - 2;
387 for (; p >= 0 && styler.StyleAt(p) == SCE_CAML_WHITE; p--) ;
388 if (p >= 0)
389 state2 = static_cast<int>(styler.StyleAt(p));
390 // take care of state change NOW
391 sc.ChangeState(state2), state2 = -1;
393 break;
395 case SCE_CAML_COMMENT:
396 case SCE_CAML_COMMENT1:
397 case SCE_CAML_COMMENT2:
398 case SCE_CAML_COMMENT3:
399 // we're IN a comment - does this start a NESTED comment?
400 if (sc.Match('(', '*'))
401 state2 = sc.state + 1, chToken = sc.currentPos,
402 sc.Forward(), sc.ch = ' ' /* (*)... */, nesting++;
403 // [try to] interpret as [additional] comment char
404 else if (sc.Match(')') && sc.chPrev == '*') {
405 if (nesting)
406 state2 = (sc.state & 0x0f) - 1, chToken = 0, nesting--;
407 else
408 state2 = SCE_CAML_DEFAULT;
409 chColor++;
410 // enable "magic" (read-only) comment AS REQUIRED
411 } else if (useMagic && sc.currentPos - chToken == 4
412 && sc.Match('c') && sc.chPrev == 'r' && sc.GetRelative(-2) == '@')
413 sc.state |= 0x10; // (switch to read-only comment style)
414 break;
417 // handle state change and char coloring AS REQUIRED
418 if (state2 >= 0)
419 styler.ColourTo(chColor, sc.state), sc.ChangeState(state2);
420 // move to next char UNLESS re-scanning current char
421 if (advance)
422 sc.Forward();
425 // do any required terminal char coloring (JIC)
426 sc.Complete();
429 #ifdef BUILD_AS_EXTERNAL_LEXER
430 static
431 #endif /* BUILD_AS_EXTERNAL_LEXER */
432 void FoldCamlDoc(
433 unsigned int startPos, int length,
434 int initStyle,
435 WordList *keywordlists[],
436 Accessor &styler)
438 // below useless evaluation(s) to supress "not used" warnings
439 startPos || length || initStyle || keywordlists[0] || styler.Length();
442 static const char * const camlWordListDesc[] = {
443 "Keywords", // primary Objective Caml keywords
444 "Keywords2", // "optional" keywords (typically from Pervasives)
445 "Keywords3", // "optional" keywords (typically typenames)
449 #ifndef BUILD_AS_EXTERNAL_LEXER
450 LexerModule lmCaml(SCLEX_CAML, ColouriseCamlDoc, "caml", FoldCamlDoc, camlWordListDesc);
451 #endif /* BUILD_AS_EXTERNAL_LEXER */