Update HACKING
[geany-mirror.git] / ctags / parsers / python.c
blob0571b5cf3f6497b0f5388c29f83cdc991272d6ca
/*
*   Copyright (c) 2000-2003, Darren Hiebert
*
*   This source code is released for free distribution under the terms of the
*   GNU General Public License version 2 or (at your option) any later version.
*
*   This module contains functions for generating tags for Python language
*   files.
*/
/*
*   INCLUDE FILES
*/
#include "general.h"  /* must always come first */

#include <stdlib.h>
#include <string.h>

#include "entry.h"
#include "nestlevel.h"
#include "options.h"
#include "read.h"
#include "parse.h"
#include "vstring.h"
#include "routines.h"
#include "debug.h"
#include "xtag.h"
/*
*   DATA DEFINITIONS
*/
31 struct corkInfo {
32 int index;
35 struct nestingLevelUserData {
36 int indentation;
38 #define PY_NL_INDENTATION(nl) ((struct nestingLevelUserData *)nestingLevelGetUserData(nl))->indentation
41 typedef enum {
42 K_CLASS, K_FUNCTION, K_METHOD, K_VARIABLE, K_IMPORT
43 } pythonKind;
45 static kindDefinition PythonKinds[] = {
46 {true, 'c', "class", "classes"},
47 {true, 'f', "function", "functions"},
48 {true, 'm', "member", "class members"},
49 {true, 'v', "variable", "variables"},
50 {true, 'x', "unknown", "name referring a classe/variable/function/module defined in other module"}
53 typedef enum {
54 A_PUBLIC, A_PRIVATE, A_PROTECTED
55 } pythonAccess;
57 static const char *const PythonAccesses[] = {
58 "public", "private", "protected"
61 static char const * const singletriple = "'''";
62 static char const * const doubletriple = "\"\"\"";
/*
*   FUNCTION DEFINITIONS
*/
/* Tells whether c may start a Python identifier: an ASCII letter (per the
 * current C locale's isalpha()) or an underscore.
 *
 * Callers pass plain `char` values widened to int, which are negative for
 * bytes >= 0x80 where char is signed.  Handing such a value to isalpha() is
 * undefined behavior, so the argument is narrowed to unsigned char first. */
static bool isIdentifierFirstCharacter (int c)
{
	return (bool) (isalpha ((unsigned char) c) || c == '_');
}
/* Tells whether c may appear inside a Python identifier: an ASCII letter or
 * digit (per the current C locale's isalnum()) or an underscore.
 *
 * Callers pass plain `char` values widened to int, which are negative for
 * bytes >= 0x80 where char is signed.  Handing such a value to isalnum() is
 * undefined behavior, so the argument is narrowed to unsigned char first. */
static bool isIdentifierCharacter (int c)
{
	return (bool) (isalnum ((unsigned char) c) || c == '_');
}
78 /* follows PEP-8, and always reports single-underscores as protected
79 * See:
80 * - http://www.python.org/dev/peps/pep-0008/#method-names-and-instance-variables
81 * - http://www.python.org/dev/peps/pep-0008/#designing-for-inheritance
83 static pythonAccess accessFromIdentifier (const vString *const ident,
84 pythonKind kind, bool has_parent, bool parent_is_class)
86 const char *const p = vStringValue (ident);
87 const size_t len = vStringLength (ident);
89 /* inside a function/method, private */
90 if (has_parent && !parent_is_class)
91 return A_PRIVATE;
92 /* not starting with "_", public */
93 else if (len < 1 || p[0] != '_')
94 return A_PUBLIC;
95 /* "__...__": magic methods */
96 else if (kind == K_METHOD && parent_is_class &&
97 len > 3 && p[1] == '_' && p[len - 2] == '_' && p[len - 1] == '_')
98 return A_PUBLIC;
99 /* "__...": name mangling */
100 else if (parent_is_class && len > 1 && p[1] == '_')
101 return A_PRIVATE;
102 /* "_...": suggested as non-public, but easily accessible */
103 else
104 return A_PROTECTED;
107 static void addAccessFields (tagEntryInfo *const entry,
108 const vString *const ident, pythonKind kind,
109 bool has_parent, bool parent_is_class)
111 pythonAccess access;
113 access = accessFromIdentifier (ident, kind, has_parent, parent_is_class);
114 entry->extensionFields.access = PythonAccesses [access];
115 /* FIXME: should we really set isFileScope in addition to access? */
116 if (access == A_PRIVATE)
117 entry->isFileScope = true;
120 /* Given a string with the contents of a line directly after the "def" keyword,
121 * extract all relevant information and create a tag.
123 static struct corkInfo makeFunctionTag (vString *const function,
124 vString *const parent, int is_class_parent, const char *arglist)
126 tagEntryInfo tag;
127 int corkIndex;
128 struct corkInfo info;
130 if (vStringLength (parent) > 0)
132 if (is_class_parent)
134 initTagEntry (&tag, vStringValue (function), K_METHOD);
135 tag.extensionFields.scopeKindIndex = K_CLASS;
137 else
139 initTagEntry (&tag, vStringValue (function), K_FUNCTION);
140 tag.extensionFields.scopeKindIndex = K_FUNCTION;
142 tag.extensionFields.scopeName = vStringValue (parent);
144 else
145 initTagEntry (&tag, vStringValue (function), K_FUNCTION);
147 tag.extensionFields.signature = arglist;
149 addAccessFields (&tag, function, is_class_parent ? K_METHOD : K_FUNCTION,
150 vStringLength (parent) > 0, is_class_parent);
152 corkIndex = makeTagEntry (&tag);
154 info.index = corkIndex;
155 return info;
158 /* Given a string with the contents of the line directly after the "class"
159 * keyword, extract all necessary information and create a tag.
161 static int makeClassTag (vString *const class, vString *const inheritance,
162 vString *const parent, int is_class_parent)
164 tagEntryInfo tag;
165 initTagEntry (&tag, vStringValue (class), K_CLASS);
166 if (vStringLength (parent) > 0)
168 if (is_class_parent)
170 tag.extensionFields.scopeKindIndex = K_CLASS;
171 tag.extensionFields.scopeName = vStringValue (parent);
173 else
175 tag.extensionFields.scopeKindIndex = K_FUNCTION;
176 tag.extensionFields.scopeName = vStringValue (parent);
179 tag.extensionFields.inheritance = vStringValue (inheritance);
180 addAccessFields (&tag, class, K_CLASS, vStringLength (parent) > 0,
181 is_class_parent);
182 return makeTagEntry (&tag);
185 static void makeVariableTag (vString *const var, vString *const parent,
186 bool is_class_parent)
188 tagEntryInfo tag;
189 initTagEntry (&tag, vStringValue (var), K_VARIABLE);
190 if (vStringLength (parent) > 0)
192 tag.extensionFields.scopeKindIndex = K_CLASS;
193 tag.extensionFields.scopeName = vStringValue (parent);
195 addAccessFields (&tag, var, K_VARIABLE, vStringLength (parent) > 0,
196 is_class_parent);
197 makeTagEntry (&tag);
/* Skip a single or double quoted string.
 * cp must point at the opening quote.  Returns the position just past the
 * matching closing quote, or the terminating NUL if the string does not end
 * on this line.  A backslash hides the character following it. */
static const char *skipString (const char *cp)
{
	const char quote = *cp;
	bool after_backslash = false;

	for (++cp; *cp != '\0'; ++cp)
	{
		if (after_backslash)
			after_backslash = false;
		else if (*cp == '\\')
			after_backslash = true;
		else if (*cp == quote)
			return cp + 1;
	}
	return cp;
}
/* If a string literal starts at cp, returns the offset of its opening quote
 * (0 for a bare quote, 1 or 2 when a prefix precedes it); returns -1 if no
 * string literal starts here.  Recognized prefixes are the unicode, binary
 * (Python 3) and raw ones:
 *   "r" | "R" | "u" | "U" | "b" | "B"
 *   "rb" | "rB" | "Rb" | "RB"
 *   "ur" | "UR" | "Ur" | "uR" | "br" | "Br" | "bR" | "BR" */
static int stringOpenerOffset (const char *cp)
{
	bool r_first;
	int i = 1;

	if (*cp == '"' || *cp == '\'')
		return 0;

	r_first = (*cp == 'r' || *cp == 'R');
	if (!r_first && *cp != 'u' && *cp != 'U' && *cp != 'b' && *cp != 'B')
		return -1;

	/* optional second prefix letter */
	if (r_first ? (cp[1] == 'b' || cp[1] == 'B')
	            : (cp[1] == 'r' || cp[1] == 'R'))
		i = 2;

	return (cp[i] == '\'' || cp[i] == '"') ? i : -1;
}

/* Skip everything up to an identifier start.
 * String literals are jumped over, and a '#' ends the scan at the end of the
 * line.  Returns the identifier start, or the terminating NUL if none. */
static const char *skipEverything (const char *cp)
{
	for (; *cp; cp++)
	{
		int quote_offset;

		if (*cp == '#')
			return cp + strlen (cp);

		quote_offset = stringOpenerOffset (cp);
		if (quote_offset >= 0)
		{
			cp = skipString (cp + quote_offset);
			if (!*cp)
				break;
		}
		if (isIdentifierFirstCharacter ((int) *cp))
			return cp;
		if (quote_offset >= 0)
			cp--; /* avoid jumping over the character after a skipped string */
	}
	return cp;
}
/* Skip an identifier: returns the first position at or after cp that does
 * not hold an identifier character. */
static const char *skipIdentifier (const char *cp)
{
	for (; isIdentifierCharacter ((int) *cp); ++cp)
		;
	return cp;
}
/* Walks over the identifiers of a line and returns the first one that begins
 * with one of the definition keywords, or NULL if there is none.  Only the
 * prefix is compared here; this function does not check what follows it. */
static const char *findDefinitionOrClass (const char *cp)
{
	static const char *const keywords[] = { "def", "class", "cdef", "cpdef" };

	while (*cp)
	{
		size_t k;

		cp = skipEverything (cp);
		for (k = 0; k < sizeof keywords / sizeof keywords[0]; k++)
		{
			if (strncmp (cp, keywords[k], strlen (keywords[k])) == 0)
				return cp;
		}
		cp = skipIdentifier (cp);
	}
	return NULL;
}
/* Returns the first position at or after cp that is not whitespace (as
 * defined by isspace()); this may be the terminating NUL.
 *
 * The byte is narrowed to unsigned char before the isspace() call: where
 * char is signed, bytes >= 0x80 (e.g. UTF-8 text in the parsed source) would
 * otherwise be passed as negative values, which is undefined behavior. */
static const char *skipSpace (const char *cp)
{
	while (isspace ((unsigned char) *cp))
		++cp;
	return cp;
}
296 /* Starting at ''cp'', parse an identifier into ''identifier''. */
297 static const char *parseIdentifier (const char *cp, vString *const identifier)
299 vStringClear (identifier);
300 while (isIdentifierCharacter ((int) *cp))
302 vStringPut (identifier, (int) *cp);
303 ++cp;
305 return cp;
308 static int parseClass (const char *cp, vString *const class,
309 vString *const parent, int is_class_parent)
311 int corkIndex;
312 vString *const inheritance = vStringNew ();
313 vStringClear (inheritance);
314 cp = parseIdentifier (cp, class);
315 cp = skipSpace (cp);
316 if (*cp == '(')
318 ++cp;
319 while (*cp != ')')
321 if (*cp == '\0')
323 /* Closing parenthesis can be in follow up line. */
324 cp = (const char *) readLineFromInputFile ();
325 if (!cp) break;
326 vStringPut (inheritance, ' ');
327 continue;
329 vStringPut (inheritance, *cp);
330 ++cp;
333 corkIndex = makeClassTag (class, inheritance, parent, is_class_parent);
334 vStringDelete (inheritance);
335 return corkIndex;
338 static void parseImports (const char *cp)
340 const char *pos;
341 vString *name, *name_next;
343 cp = skipEverything (cp);
345 if ((pos = strstr (cp, "import")) == NULL)
346 return;
348 cp = pos + 6;
350 /* continue only if there is some space between the keyword and the identifier */
351 if (! isspace (*cp))
352 return;
354 cp++;
355 cp = skipSpace (cp);
357 name = vStringNew ();
358 name_next = vStringNew ();
360 cp = skipEverything (cp);
361 while (*cp)
363 cp = parseIdentifier (cp, name);
365 cp = skipEverything (cp);
366 /* we parse the next possible import statement as well to be able to ignore 'foo' in
367 * 'import foo as bar' */
368 parseIdentifier (cp, name_next);
370 /* take the current tag only if the next one is not "as" */
371 if (strcmp (vStringValue (name_next), "as") != 0 &&
372 strcmp (vStringValue (name), "as") != 0)
374 makeSimpleTag (name, K_IMPORT);
377 vStringDelete (name);
378 vStringDelete (name_next);
/* modified from lcpp.c getArglistFromStr().
 *
 * Returns a newly allocated copy of the first parenthesized group found in
 * buf, parentheses included, e.g. "(self, x=(1, 2))".  If the group is not
 * closed before the end of buf, everything from the opening parenthesis to
 * the end is returned.  Returns NULL if buf is NULL, contains no '(', or if
 * the allocation fails.  The caller owns the result and must free it.
 *
 * buf is left untouched: the span is copied rather than being terminated in
 * place, so passing read-only storage is safe.
 * note: does not ignore brackets inside strings! */
static char *parseArglist(const char *buf)
{
	const char *start, *end;
	char *arglist;
	size_t len;
	int level = 1;

	if (NULL == buf)
		return NULL;
	start = strchr(buf, '(');
	if (NULL == start)
		return NULL;

	/* stop one past the matching ')' or at the end of the string */
	for (end = start + 1; level > 0 && '\0' != *end; ++end)
	{
		if ('(' == *end)
			++ level;
		else if (')' == *end)
			-- level;
	}

	len = (size_t) (end - start);
	arglist = malloc(len + 1);
	if (NULL == arglist)
		return NULL;
	memcpy(arglist, start, len);
	arglist[len] = '\0';
	return arglist;
}
405 static struct corkInfo parseFunction (const char *cp, vString *const def,
406 vString *const parent, int is_class_parent)
408 char *arglist;
409 struct corkInfo info;
411 cp = parseIdentifier (cp, def);
412 arglist = parseArglist (cp);
413 info = makeFunctionTag (def, parent, is_class_parent, arglist);
414 if (arglist != NULL)
415 eFree (arglist);
416 return info;
419 /* Get the combined name of a nested symbol. Classes are separated with ".",
420 * functions with "/". For example this code:
421 * class MyClass:
422 * def myFunction:
423 * def SubFunction:
424 * class SubClass:
425 * def Method:
426 * pass
427 * Would produce this string:
428 * MyClass.MyFunction/SubFunction/SubClass.Method
430 static bool constructParentString(NestingLevels *nls, int indent,
431 vString *result)
433 int i;
434 NestingLevel *prev = NULL;
435 int is_class = false;
436 vStringClear (result);
437 for (i = 0; i < nls->n; i++)
439 NestingLevel *nl = nestingLevelsGetNthFromRoot (nls, i);
440 tagEntryInfo *e;
442 if (indent <= PY_NL_INDENTATION(nl))
443 break;
444 if (prev)
446 vStringCatS(result, "."); /* make Geany symbol list grouping work properly */
448 if (prev->kindIndex == K_CLASS)
449 vStringCatS(result, ".");
450 else
451 vStringCatS(result, "/");
455 e = getEntryOfNestingLevel (nl);
456 if (e)
458 vStringCatS(result, e->name);
459 is_class = (e->kindIndex == K_CLASS);
461 else
462 is_class = false;
464 prev = nl;
466 return is_class;
469 /* Check indentation level and truncate nesting levels accordingly */
470 static void checkIndent(NestingLevels *nls, int indent)
472 int i;
473 NestingLevel *n;
475 for (i = 0; i < nls->n; i++)
477 n = nestingLevelsGetNthFromRoot (nls, i);
478 if (n && indent <= PY_NL_INDENTATION(n))
480 /* truncate levels */
481 nls->n = i;
482 break;
487 static void addNestingLevel(NestingLevels *nls, int indentation, struct corkInfo *info)
489 int i;
490 NestingLevel *nl = NULL;
492 for (i = 0; i < nls->n; i++)
494 nl = nestingLevelsGetNthFromRoot(nls, i);
495 if (indentation <= PY_NL_INDENTATION(nl)) break;
497 if (i == nls->n)
498 nl = nestingLevelsPush(nls, info->index);
499 else
500 /* reuse existing slot */
501 nl = nestingLevelsTruncate (nls, i + 1, info->index);
503 PY_NL_INDENTATION(nl) = indentation;
506 /* Return a pointer to the start of the next triple string, or NULL. Store
507 * the kind of triple string in "which" if the return is not NULL.
509 static char const *find_triple_start(char const *string, char const **which)
511 char const *cp = string;
513 for (; *cp; cp++)
515 if (*cp == '#')
516 break;
517 if (*cp == '"' || *cp == '\'')
519 if (strncmp(cp, doubletriple, 3) == 0)
521 *which = doubletriple;
522 return cp;
524 if (strncmp(cp, singletriple, 3) == 0)
526 *which = singletriple;
527 return cp;
529 cp = skipString(cp);
530 if (!*cp) break;
531 cp--; /* avoid jumping over the character after a skipped string */
534 return NULL;
/* Find the end of a triple string as pointed to by "which", and update "which"
 * with any other triple strings following in the given string.
 * On return "*which" is NULL if no triple string is left open at the end of
 * the given string, otherwise it is the delimiter of the one still open.
 */
static void find_triple_end(char const *string, char const **which)
{
	char const *s = string;

	for (;;)
	{
		/* Check if the string ends in the same line. */
		s = strstr (s, *which);
		if (s == NULL)
			return;
		*which = NULL;

		/* If yes, check if another one starts in the same line. */
		s = find_triple_start(s + 3, which);
		if (s == NULL)
			return;
		s += 3;
	}
}
/* Parse global and class variable names (C.x) from assignment statements.
 * Object attributes (obj.x) are ignored.
 * Assignment to a tuple 'x, y = 2, 3' not supported.
 * TODO: ignore duplicate tags from reassignment statements.
 *
 * Returns a pointer to the start of the variable name inside line, or NULL
 * if the line is not a simple 'name = value' assignment.
 *
 * The backward scans compare against line before stepping back, so no
 * pointer before the start of the buffer is ever formed. */
static const char *findVariable(const char *line)
{
	const char *eq, *rest, *name_end, *name_start, *lead;

	eq = strchr(line, '=');
	if (!eq)
		return NULL;

	for (rest = eq + 1; *rest; rest++)
	{
		if (*rest == '=')
			return NULL;	/* ignore '==' operator and 'x=5,y=6)' function lines */
		if (*rest == '(' || *rest == '#')
			break;	/* allow 'x = func(b=2,y=2,' lines and comments at the end of line */
	}

	/* go backwards from the '=', over whitespace and then the identifier */
	name_end = eq;
	while (name_end > line && isspace ((unsigned char) name_end[-1]))
		--name_end;
	name_start = name_end;
	while (name_start > line && isIdentifierCharacter ((unsigned char) name_start[-1]))
		--name_start;
	if (!isIdentifierFirstCharacter ((unsigned char) *name_start))
		return NULL;

	/* only whitespace may precede the name */
	lead = name_start;
	while (lead > line && isspace ((unsigned char) lead[-1]))
		--lead;
	if (lead != line)	/* the line isn't a simple variable assignment */
		return NULL;

	/* the line is valid, hand back the variable name */
	return name_start;
}
/* Skip type declaration that optionally follows a cdef/cpdef.
 *
 * Returns the position of the declared name, or NULL if no definition to tag
 * was recognized.  When the declaration starts with "class" (optionally
 * after "extern"), *is_class is set to true and the position after the
 * keyword is returned; *is_class is not written otherwise.
 *
 * isspace() is given an unsigned char so that bytes >= 0x80 are never passed
 * as negative values, which would be undefined behavior. */
static const char *skipTypeDecl (const char *cp, bool *is_class)
{
	const char *lastStart = cp, *ptr;
	int loopCount = 0;

	ptr = skipSpace(cp);
	if (!strncmp("extern", ptr, 6)) {
		ptr += 6;
		ptr = skipSpace(ptr);
		if (!strncmp("from", ptr, 4)) { return NULL; }
	}
	if (!strncmp("class", ptr, 5)) {
		ptr += 5 ;
		*is_class = true;
		ptr = skipSpace(ptr);
		return ptr;
	}
	/* limit so that we don't pick off "int item=obj()" */
	while (*ptr && loopCount++ < 2) {
		while (*ptr && *ptr != '=' && *ptr != '(' && !isspace((unsigned char) *ptr)) {
			/* skip over e.g. 'cpdef numpy.ndarray[dtype=double, ndim=1]' */
			if(*ptr == '[') {
				while (*ptr && *ptr != ']') ptr++;
				if (*ptr) ptr++;
			} else {
				ptr++;
			}
		}
		if (!*ptr || *ptr == '=') return NULL;
		if (*ptr == '(') {
			return lastStart; /* if we stopped on a '(' we are done */
		}
		ptr = skipSpace(ptr);
		lastStart = ptr;
		while (*lastStart == '*') lastStart++;  /* cdef int *identifier */
	}
	return NULL;
}
635 /* checks if there is a lambda at position of cp, and return its argument list
636 * if so.
637 * We don't return the lambda name since it is useless for now since we already
638 * know it when we call this function, and it would be a little slower. */
639 static bool varIsLambda (const char *cp, char **arglist)
641 bool is_lambda = false;
643 cp = skipSpace (cp);
644 cp = skipIdentifier (cp); /* skip the lambda's name */
645 cp = skipSpace (cp);
646 if (*cp == '=')
648 cp++;
649 cp = skipSpace (cp);
650 if (strncmp (cp, "lambda", 6) == 0)
652 const char *tmp;
654 cp += 6; /* skip the lambda */
655 tmp = skipSpace (cp);
656 /* check if there is a space after lambda to detect assignations
657 * starting with 'lambdaXXX' */
658 if (tmp != cp)
660 vString *args = vStringNew ();
662 cp = tmp;
663 vStringPut (args, '(');
664 for (; *cp != 0 && *cp != ':'; cp++)
665 vStringPut (args, *cp);
666 vStringPut (args, ')');
667 if (arglist)
668 *arglist = strdup (vStringValue (args));
669 vStringDelete (args);
670 is_lambda = true;
674 return is_lambda;
/* checks if @p cp has keyword @p keyword at the start, and fills @p cp_n with
 * the position of the next non-whitespace after the keyword.
 * The keyword must be followed by whitespace to match.  @p cp_n is left
 * untouched when false is returned.
 *
 * isspace() is given an unsigned char so that bytes >= 0x80 are never passed
 * as negative values, which would be undefined behavior. */
static bool matchKeyword (const char *keyword, const char *cp, const char **cp_n)
{
	const size_t kw_len = strlen (keyword);

	if (strncmp (cp, keyword, kw_len) != 0 ||
		!isspace ((unsigned char) cp[kw_len]))
		return false;

	*cp_n = skipSpace (&cp[kw_len + 1]);
	return true;
}
690 static void findPythonTags (void)
692 vString *const continuation = vStringNew ();
693 vString *const name = vStringNew ();
694 vString *const parent = vStringNew();
696 NestingLevels *const nesting_levels = nestingLevelsNew(sizeof (struct nestingLevelUserData));
698 const char *line;
699 int line_skip = 0;
700 char const *longStringLiteral = NULL;
702 while ((line = (const char *) readLineFromInputFile ()) != NULL)
704 const char *cp = line, *candidate;
705 char const *longstring;
706 char const *keyword, *variable;
707 int indent;
709 cp = skipSpace (cp);
711 if (*cp == '\0') /* skip blank line */
712 continue;
714 /* Skip comment if we are not inside a multi-line string. */
715 if (*cp == '#' && !longStringLiteral)
716 continue;
718 /* Deal with line continuation. */
719 if (!line_skip) vStringClear(continuation);
720 vStringCatS(continuation, line);
721 vStringStripTrailing(continuation);
722 if (vStringLast(continuation) == '\\')
724 vStringChop(continuation);
725 vStringCatS(continuation, " ");
726 line_skip = 1;
727 continue;
729 cp = line = vStringValue(continuation);
730 cp = skipSpace (cp);
731 indent = cp - line;
732 line_skip = 0;
734 /* Deal with multiline string ending. */
735 if (longStringLiteral)
737 find_triple_end(cp, &longStringLiteral);
738 continue;
741 checkIndent(nesting_levels, indent);
743 /* Find global and class variables */
744 variable = findVariable(line);
745 if (variable)
747 const char *start = variable;
748 char *arglist;
749 bool parent_is_class;
751 vStringClear (name);
752 while (isIdentifierCharacter ((int) *start))
754 vStringPut (name, (int) *start);
755 ++start;
758 parent_is_class = constructParentString(nesting_levels, indent, parent);
759 if (varIsLambda (variable, &arglist))
761 /* show class members or top-level script lambdas only */
762 if (parent_is_class || vStringLength(parent) == 0)
763 makeFunctionTag (name, parent, parent_is_class, arglist);
764 eFree (arglist);
766 else
768 /* skip variables in methods */
769 if (parent_is_class || vStringLength(parent) == 0)
770 makeVariableTag (name, parent, parent_is_class);
774 /* Deal with multiline string start. */
775 longstring = find_triple_start(cp, &longStringLiteral);
776 if (longstring)
778 longstring += 3;
779 find_triple_end(longstring, &longStringLiteral);
780 /* We don't parse for any tags in the rest of the line. */
781 continue;
784 /* Deal with def and class keywords. */
785 keyword = findDefinitionOrClass (cp);
786 if (keyword)
788 bool found = false;
789 bool is_class = false;
790 if (matchKeyword ("def", keyword, &cp))
792 found = true;
794 else if (matchKeyword ("class", keyword, &cp))
796 found = true;
797 is_class = true;
799 else if (matchKeyword ("cdef", keyword, &cp))
801 candidate = skipTypeDecl (cp, &is_class);
802 if (candidate)
804 found = true;
805 cp = candidate;
809 else if (matchKeyword ("cpdef", keyword, &cp))
811 candidate = skipTypeDecl (cp, &is_class);
812 if (candidate)
814 found = true;
815 cp = candidate;
819 if (found)
821 bool is_parent_class;
822 struct corkInfo info;
824 is_parent_class =
825 constructParentString(nesting_levels, indent, parent);
827 if (is_class)
829 info.index = parseClass (cp, name, parent, is_parent_class);
831 else
832 info = parseFunction(cp, name, parent, is_parent_class);
834 addNestingLevel(nesting_levels, indent, &info);
836 continue;
838 /* Find and parse imports */
839 parseImports(line);
842 /* Force popping all nesting levels. */
843 checkIndent(nesting_levels, 0);
845 /* Clean up all memory we allocated. */
846 vStringDelete (parent);
847 vStringDelete (name);
848 vStringDelete (continuation);
849 nestingLevelsFree (nesting_levels);
852 extern parserDefinition *PythonParser (void)
854 static const char *const extensions[] = { "py", "pyx", "pxd", "pxi" ,"scons", NULL };
855 parserDefinition *def = parserNew ("Python");
856 def->kindTable = PythonKinds;
857 def->kindCount = ARRAY_SIZE (PythonKinds);
858 def->extensions = extensions;
859 def->parser = findPythonTags;
860 def->useCork = CORK_QUEUE;
861 return def;