Replace utils_make_human_readable_str() with g_format_size()
[geany-mirror.git] / ctags / parsers / html.c
blob56b3b3f33283dfd6d4ad2051e3d2ea8264c0c4a5
1 /*
2 * Copyright (c) 2016, Jiri Techet
4 * This source code is released for free distribution under the terms of the
5 * GNU General Public License version 2 or (at your option) any later version.
7 * This module contains functions for generating tags for HTML language
8 * files.
9 */
11 #include "general.h"
13 #include <string.h>
14 #include <ctype.h>
16 #include "entry.h"
17 #include "parse.h"
18 #include "read.h"
19 #include "routines.h"
20 #include "keyword.h"
21 #include "promise.h"
/* Upper bound on the number of nested elements.  Once the limit is reached no
 * further recursion takes place, which avoids a stack overflow on invalid
 * input containing too many open tags. */
#define MAX_DEPTH 1000
/* Kinds of tags produced by this parser, listed in the same order as the
 * entries of the HtmlKinds table. */
typedef enum {
	K_ANCHOR,
	K_CLASS,
	K_HEADING1,
	K_HEADING2,
	K_HEADING3,
	K_STYELSHEET,	/* sic - the spelling is kept because readTag() uses this name */
	K_ID,
	K_SCRIPT,
} htmlKind;
/* Role of the "class" kind: a class name assigned through an attribute. */
typedef enum {
	CLASS_KIND_ATTRIBUTE_ROLE,
} ClassRole;

/* Role of the "script" kind: a script referenced as an external file. */
typedef enum {
	SCRIPT_KIND_EXTERNAL_FILE_ROLE,
} ScriptRole;

/* Role of the "stylesheet" kind: a stylesheet referenced as an external file. */
typedef enum {
	STYLESHEET_KIND_EXTERNAL_FILE_ROLE,
} StylesheetRole;
53 static roleDefinition ClassRoles [] = {
54 { true, "attribute", "assigned as attributes" },
57 static roleDefinition ScriptRoles [] = {
58 { true, "extFile", "referenced as external files" },
61 static roleDefinition StylesheetRoles [] = {
62 { true, "extFile", "referenced as external files" },
65 static kindDefinition HtmlKinds [] = {
66 { true, 'a', "anchor", "named anchors" },
67 { true, 'c', "class", "classes",
68 .referenceOnly = true, ATTACH_ROLES (ClassRoles)},
69 { true, 'h', "heading1", "H1 headings" },
70 { true, 'i', "heading2", "H2 headings" },
71 { true, 'j', "heading3", "H3 headings" },
72 { true, 'C', "stylesheet", "stylesheets",
73 .referenceOnly = true, ATTACH_ROLES (StylesheetRoles)},
74 { true, 'I', "id", "identifiers" },
75 { true, 'J', "script", "scripts",
76 .referenceOnly = true, ATTACH_ROLES (ScriptRoles)},
/* Identifiers of the tag and attribute names the parser recognizes.
 *
 * The enumerator order is significant: readTag() treats every value in the
 * range KEYWORD_area .. KEYWORD_wbr as a void element. */
typedef enum {
	KEYWORD_h1,
	KEYWORD_h2,
	KEYWORD_h3,
	KEYWORD_a,
	KEYWORD_script,
	KEYWORD_style,
	KEYWORD_name,

	/* void elements; the attribute keywords class, href, id, rel and src are
	 * interleaved alphabetically and therefore fall inside the same range */
	KEYWORD_area,
	KEYWORD_base,
	KEYWORD_br,
	KEYWORD_class,
	KEYWORD_col,
	KEYWORD_command,
	KEYWORD_embed,
	KEYWORD_hr,
	KEYWORD_href,
	KEYWORD_id,
	KEYWORD_img,
	KEYWORD_input,
	KEYWORD_keygen,
	KEYWORD_link,
	KEYWORD_meta,
	KEYWORD_param,
	KEYWORD_rel,
	KEYWORD_source,
	KEYWORD_src,
	KEYWORD_track,
	KEYWORD_wbr
} keywordId;
112 static const keywordTable HtmlKeywordTable[] = {
113 {"h1", KEYWORD_h1},
114 {"h2", KEYWORD_h2},
115 {"h3", KEYWORD_h3},
116 {"a", KEYWORD_a},
117 {"script", KEYWORD_script},
118 {"style", KEYWORD_style},
119 {"name", KEYWORD_name},
121 /* void elements */
122 {"area", KEYWORD_area},
123 {"base", KEYWORD_base},
124 {"br", KEYWORD_br},
125 {"class", KEYWORD_class},
126 {"col", KEYWORD_col},
127 {"command", KEYWORD_command},
128 {"embed", KEYWORD_embed},
129 {"hr", KEYWORD_hr},
130 {"href", KEYWORD_href},
131 {"id", KEYWORD_id},
132 {"img", KEYWORD_img},
133 {"input", KEYWORD_input},
134 {"keygen", KEYWORD_keygen},
135 {"link", KEYWORD_link},
136 {"meta", KEYWORD_meta},
137 {"param", KEYWORD_param},
138 {"rel", KEYWORD_rel},
139 {"source", KEYWORD_source},
140 {"src", KEYWORD_src},
141 {"track", KEYWORD_track},
142 {"wbr", KEYWORD_wbr},
/* Lexical token classes produced by readToken() and readTokenText(). */
typedef enum {
	TOKEN_EOF,
	TOKEN_NAME,			/* tag and attribute names */
	TOKEN_STRING,		/* single- or double-quoted attribute value */
	TOKEN_TEXT,
	TOKEN_TAG_START,	/* <  */
	TOKEN_TAG_START2,	/* </ */
	TOKEN_TAG_END,		/* >  */
	TOKEN_TAG_END2,		/* /> */
	TOKEN_EQUAL,
	TOKEN_COMMENT,
	TOKEN_OTHER
} tokenType;
#ifdef DEBUG
/* Printable name of every tokenType value, indexed by the token type itself.
 * Only compiled when DEBUG is defined. */
const char *tokenTypes[] = {
#define E(X) [TOKEN_##X] = #X
	E(EOF),
	E(NAME),
	E(STRING),
	E(TEXT),
	E(TAG_START),
	E(TAG_START2),
	E(TAG_END),
	E(TAG_END2),
	E(EQUAL),
	E(COMMENT),
	E(OTHER),
#undef E
};
#endif
177 typedef struct {
178 tokenType type;
179 vString *string;
180 } tokenInfo;
183 static int Lang_html;
186 static void readTag (tokenInfo *token, vString *text, int depth);
#ifdef DEBUG
#if 0
/* Debugging aid (currently compiled out): prints the token's type name and
 * text to stderr together with two caller-supplied context labels.  A NULL
 * extra_context is printed as "_". */
static void dumpToken (tokenInfo *token, const char *context, const char* extra_context)
{
	fprintf (stderr, "[%7s] %-20s@%s.%s\n",
			 tokenTypes[token->type], vStringValue(token->string),
			 context, extra_context? extra_context: "_");
}
#endif
#endif
199 static void readTokenText (tokenInfo *const token, bool collectText)
201 int c;
202 int lastC = 'X'; /* whatever non-space character */
204 vStringClear (token->string);
206 getNextChar:
208 c = getcFromInputFile ();
210 switch (c)
212 case EOF:
213 token->type = TOKEN_EOF;
214 break;
216 case '<':
217 ungetcToInputFile (c);
218 token->type = TOKEN_TEXT;
219 break;
221 default:
222 if (collectText)
224 if (isspace (c))
225 c = ' ';
226 if (c != ' ' || lastC != ' ')
228 vStringPut (token->string, c);
229 lastC = c;
233 goto getNextChar;
237 static void readToken (tokenInfo *const token, bool skipComments)
239 int c;
241 vStringClear (token->string);
243 getNextChar:
245 c = getcFromInputFile ();
246 while (isspace (c))
247 c = getcFromInputFile ();
249 switch (c)
251 case EOF:
252 token->type = TOKEN_EOF;
253 break;
255 case '<':
257 int d = getcFromInputFile ();
259 if (d == '!')
261 d = getcFromInputFile ();
262 if (d == '-')
264 d = getcFromInputFile ();
265 if (d == '-')
267 int e = ' ';
268 int f = ' ';
271 d = e;
272 e = f;
273 f = getcFromInputFile ();
275 while (f != EOF && ! (d == '-' && e == '-' && f == '>'));
277 if (skipComments)
278 goto getNextChar;
279 else
281 token->type = TOKEN_COMMENT;
282 break;
286 ungetcToInputFile (d);
287 token->type = TOKEN_OTHER;
289 else if (d == '?')
290 token->type = TOKEN_OTHER;
291 else if (d == '/')
292 token->type = TOKEN_TAG_START2;
293 else
295 ungetcToInputFile (d);
296 token->type = TOKEN_TAG_START;
298 break;
300 case '/':
302 int d = getcFromInputFile ();
303 if (d == '>')
304 token->type = TOKEN_TAG_END2;
305 else
307 ungetcToInputFile (d);
308 token->type = TOKEN_OTHER;
310 break;
312 case '>':
313 token->type = TOKEN_TAG_END;
314 break;
316 case '=':
317 token->type = TOKEN_EQUAL;
318 break;
320 case '"':
321 case '\'':
323 const int delimiter = c;
324 c = getcFromInputFile ();
325 while (c != EOF && c != delimiter)
327 vStringPut (token->string, c);
328 c = getcFromInputFile ();
330 token->type = TOKEN_STRING;
331 break;
334 default:
338 vStringPut (token->string, tolower (c));
339 c = getcFromInputFile ();
341 while (!isspace (c) && c != '<' && c != '>' && c != '/' &&
342 c != '=' && c != '\'' && c != '"' && c != EOF);
343 if (c != EOF)
344 ungetcToInputFile (c);
345 token->type = TOKEN_NAME;
346 break;
351 static void appendText (vString *text, vString *appendedText)
353 if (text != NULL && vStringLength (appendedText) > 0)
355 if (vStringLength (text) > 0 && vStringLast (text) == ' ' &&
356 vStringLength (appendedText) > 0 && vStringChar (appendedText, 0) == ' ')
358 vStringStripTrailing (text);
360 vStringCat (text, appendedText);
364 static bool readTagContent (tokenInfo *token, vString *text, long *line, long *lineOffset, int depth)
366 tokenType type;
368 readTokenText (token, text != NULL);
369 appendText (text, token->string);
373 *line = getInputLineNumber ();
374 *lineOffset = getInputLineOffset ();
375 readToken (token, false);
376 type = token->type;
377 if (type == TOKEN_TAG_START)
378 readTag (token, text, depth + 1);
379 if (type == TOKEN_COMMENT || type == TOKEN_TAG_START)
381 readTokenText (token, text != NULL);
382 appendText (text, token->string);
385 while (type == TOKEN_COMMENT || type == TOKEN_TAG_START);
387 return type == TOKEN_TAG_START2;
390 static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
392 bool found_start = false;
393 bool found_script = false;
395 long line_tmp[2] = {0};
396 long lineOffset_tmp[2] = {0};
398 tokenType type;
402 line_tmp[0] = getInputLineNumber ();
403 lineOffset_tmp[0] = getInputLineOffset ();
405 readToken (token, false);
406 type = token->type;
408 if (type == TOKEN_TAG_START2)
410 found_start = true;
411 line_tmp[1] = line_tmp[0];
412 lineOffset_tmp[1] = lineOffset_tmp[0];
414 else if (found_start
415 && type == TOKEN_NAME
416 && lookupKeyword (vStringValue (token->string), Lang_html) == KEYWORD_script)
418 found_script = true;
419 *line = line_tmp[1];
420 *lineOffset = lineOffset_tmp[1];
422 else
423 found_start = false;
425 while ((type != TOKEN_EOF) && (!found_script));
427 return found_script;
430 static void makeClassRefTags (const char *classes)
432 vString *klass = vStringNew ();
436 if (*classes && !isspace (*classes))
437 vStringPut (klass, *classes);
438 else if (!vStringIsEmpty (klass))
440 makeSimpleRefTag (klass, K_CLASS,
441 CLASS_KIND_ATTRIBUTE_ROLE);
442 vStringClear (klass);
445 if (!*classes)
446 break;
448 classes++;
449 } while (1);
451 vStringDelete (klass);
454 static void readTag (tokenInfo *token, vString *text, int depth)
456 bool textCreated = false;
458 readToken (token, true);
459 if (token->type == TOKEN_NAME)
461 keywordId startTag;
462 bool isHeading;
463 bool isVoid;
464 vString *stylesheet = NULL;
465 bool stylesheet_expectation = false;
467 startTag = lookupKeyword (vStringValue (token->string), Lang_html);
468 isHeading = (startTag == KEYWORD_h1 || startTag == KEYWORD_h2 || startTag == KEYWORD_h3);
469 isVoid = (startTag >= KEYWORD_area && startTag <= KEYWORD_wbr);
470 if (text == NULL && isHeading)
472 text = vStringNew ();
473 textCreated = true;
478 keywordId attribute = KEYWORD_NONE;
480 readToken (token, true);
481 if (token->type == TOKEN_NAME)
482 attribute = lookupKeyword (vStringValue (token->string), Lang_html);
484 if (attribute == KEYWORD_class)
486 readToken (token, true);
487 if (token->type == TOKEN_EQUAL)
489 readToken (token, true);
490 if (token->type == TOKEN_STRING)
491 makeClassRefTags (vStringValue (token->string));
494 else if (attribute == KEYWORD_id)
496 readToken (token, true);
497 if (token->type == TOKEN_EQUAL)
499 readToken (token, true);
500 if (token->type == TOKEN_STRING)
501 makeSimpleTag (token->string, K_ID);
504 else if (startTag == KEYWORD_a && attribute == KEYWORD_name)
506 readToken (token, true);
507 if (token->type == TOKEN_EQUAL)
509 readToken (token, true);
510 if (token->type == TOKEN_STRING || token->type == TOKEN_NAME)
511 makeSimpleTag (token->string, K_ANCHOR);
514 else if (startTag == KEYWORD_script && attribute == KEYWORD_src)
516 readToken (token, true);
517 if (token->type == TOKEN_EQUAL)
519 readToken (token, true);
520 if (token->type == TOKEN_STRING)
521 makeSimpleRefTag (token->string, K_SCRIPT,
522 SCRIPT_KIND_EXTERNAL_FILE_ROLE);
525 else if (startTag == KEYWORD_link)
527 if (attribute == KEYWORD_rel)
529 readToken (token, true);
530 if (token->type == TOKEN_EQUAL)
532 readToken (token, true);
533 if (token->type == TOKEN_STRING &&
534 /* strcmp is not enough:
535 * e.g. <link href="fancy.css"
536 * rel="alternate stylesheet" title="Fancy"> */
537 vStringLength(token->string) >= 10 &&
538 strstr (vStringValue (token->string), "stylesheet"))
539 stylesheet_expectation = true;
542 else if (attribute == KEYWORD_href)
544 readToken (token, true);
545 if (token->type == TOKEN_EQUAL)
547 readToken (token, true);
548 if (token->type == TOKEN_STRING)
550 if (stylesheet == NULL)
551 stylesheet = vStringNewCopy (token->string);
552 else
553 vStringCopy (stylesheet, token->string);
557 if (stylesheet_expectation && stylesheet && !vStringIsEmpty (stylesheet))
559 makeSimpleRefTag (stylesheet, K_STYELSHEET,
560 STYLESHEET_KIND_EXTERNAL_FILE_ROLE);
561 stylesheet_expectation = false;
562 if (stylesheet)
563 vStringClear (stylesheet);
567 while (token->type != TOKEN_TAG_END && token->type != TOKEN_TAG_END2 &&
568 token->type != TOKEN_EOF);
570 vStringDelete (stylesheet);
571 stylesheet = NULL;
573 if (!isVoid && token->type == TOKEN_TAG_END && depth < MAX_DEPTH)
575 long startSourceLineNumber = getSourceLineNumber ();
576 long startLineNumber = getInputLineNumber ();
577 long startLineOffset = getInputLineOffset ();
578 long endLineNumber;
579 long endLineOffset;
580 bool tag_start2;
582 if (startTag == KEYWORD_script)
584 bool script = skipScriptContent (token, &endLineNumber, &endLineOffset);
585 if (script)
586 makePromise ("JavaScript", startLineNumber, startLineOffset,
587 endLineNumber, endLineOffset, startSourceLineNumber);
588 readToken (token, true);
589 goto out;
592 tag_start2 = readTagContent (token, text, &endLineNumber, &endLineOffset, depth);
593 if (tag_start2)
595 readToken (token, true);
596 if (isHeading && textCreated && vStringLength (text) > 0)
598 keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
599 if (startTag == endTag)
601 htmlKind headingKind;
603 if (startTag == KEYWORD_h1)
604 headingKind = K_HEADING1;
605 else if (startTag == KEYWORD_h2)
606 headingKind = K_HEADING2;
607 else
608 headingKind = K_HEADING3;
610 vStringStripLeading (text);
611 vStringStripTrailing (text);
612 makeSimpleTag (text, headingKind);
615 else if (startTag == KEYWORD_style)
617 keywordId endTag = lookupKeyword (vStringValue (token->string), Lang_html);
618 if (startTag == endTag)
619 makePromise ("CSS", startLineNumber, startLineOffset,
620 endLineNumber, endLineOffset, startSourceLineNumber);
623 readToken (token, true);
628 out:
629 if (textCreated)
630 vStringDelete (text);
633 static void findHtmlTags (void)
635 tokenInfo token;
637 token.string = vStringNew ();
641 readToken (&token, true);
642 if (token.type == TOKEN_TAG_START)
643 readTag (&token, NULL, 0);
645 while (token.type != TOKEN_EOF);
647 vStringDelete (token.string);
650 static void initialize (const langType language)
652 Lang_html = language;
655 /* parser definition */
656 extern parserDefinition* HtmlParser (void)
658 static const char *const extensions [] = { "htm", "html", NULL };
659 parserDefinition* def = parserNew ("HTML");
660 def->kindTable = HtmlKinds;
661 def->kindCount = ARRAY_SIZE (HtmlKinds);
662 def->extensions = extensions;
663 def->parser = findHtmlTags;
664 def->initialize = initialize;
665 def->keywordTable = HtmlKeywordTable;
666 def->keywordCount = ARRAY_SIZE (HtmlKeywordTable);
667 return def;