2 * Copyright (c) 2016, Jiri Techet
4 * This source code is released for free distribution under the terms of the
5 * GNU General Public License version 2 or (at your option) any later version.
7 * This module contains functions for generating tags for HTML language
23 /* The max. number of nested elements - prevents further recursion if the limit
24 * is exceeded and avoids stack overflow for invalid input containing too many
26 #define MAX_DEPTH 1000
42 CLASS_KIND_ATTRIBUTE_ROLE
,
46 SCRIPT_KIND_EXTERNAL_FILE_ROLE
,
50 STYLESHEET_KIND_EXTERNAL_FILE_ROLE
,
53 static roleDefinition ClassRoles
[] = {
54 { true, "attribute", "assigned as attributes" },
57 static roleDefinition ScriptRoles
[] = {
58 { true, "extFile", "referenced as external files" },
61 static roleDefinition StylesheetRoles
[] = {
62 { true, "extFile", "referenced as external files" },
65 static kindDefinition HtmlKinds
[] = {
66 { true, 'a', "anchor", "named anchors" },
67 { true, 'c', "class", "classes",
68 .referenceOnly
= true, ATTACH_ROLES (ClassRoles
)},
69 { true, 'h', "heading1", "H1 headings" },
70 { true, 'i', "heading2", "H2 headings" },
71 { true, 'j', "heading3", "H3 headings" },
72 { true, 'C', "stylesheet", "stylesheets",
73 .referenceOnly
= true, ATTACH_ROLES (StylesheetRoles
)},
74 { true, 'I', "id", "identifiers" },
75 { true, 'J', "script", "scripts",
76 .referenceOnly
= true, ATTACH_ROLES (ScriptRoles
)},
112 static const keywordTable HtmlKeywordTable
[] = {
117 {"script", KEYWORD_script
},
118 {"style", KEYWORD_style
},
119 {"name", KEYWORD_name
},
122 {"area", KEYWORD_area
},
123 {"base", KEYWORD_base
},
125 {"class", KEYWORD_class
},
126 {"col", KEYWORD_col
},
127 {"command", KEYWORD_command
},
128 {"embed", KEYWORD_embed
},
130 {"href", KEYWORD_href
},
132 {"img", KEYWORD_img
},
133 {"input", KEYWORD_input
},
134 {"keygen", KEYWORD_keygen
},
135 {"link", KEYWORD_link
},
136 {"meta", KEYWORD_meta
},
137 {"param", KEYWORD_param
},
138 {"rel", KEYWORD_rel
},
139 {"source", KEYWORD_source
},
140 {"src", KEYWORD_src
},
141 {"track", KEYWORD_track
},
142 {"wbr", KEYWORD_wbr
},
147 TOKEN_NAME
, /* tag and attribute names */
148 TOKEN_STRING
, /* single- or double-quoted attribute value */
150 TOKEN_TAG_START
, /* < */
151 TOKEN_TAG_START2
, /* </ */
152 TOKEN_TAG_END
, /* > */
153 TOKEN_TAG_END2
, /* /> */
160 const char *tokenTypes
[] = {
161 #define E(X) [TOKEN_##X] = #X
183 static int Lang_html
;
186 static void readTag (tokenInfo
*token
, vString
*text
, int depth
);
190 static void dumpToken (tokenInfo
*token
, const char *context
, const char* extra_context
)
192 fprintf (stderr
, "[%7s] %-20s@%s.%s\n",
193 tokenTypes
[token
->type
], vStringValue(token
->string
),
194 context
, extra_context
? extra_context
: "_");
199 static void readTokenText (tokenInfo
*const token
, bool collectText
)
202 int lastC
= 'X'; /* whatever non-space character */
204 vStringClear (token
->string
);
208 c
= getcFromInputFile ();
213 token
->type
= TOKEN_EOF
;
217 ungetcToInputFile (c
);
218 token
->type
= TOKEN_TEXT
;
226 if (c
!= ' ' || lastC
!= ' ')
228 vStringPut (token
->string
, c
);
237 static void readToken (tokenInfo
*const token
, bool skipComments
)
241 vStringClear (token
->string
);
245 c
= getcFromInputFile ();
247 c
= getcFromInputFile ();
252 token
->type
= TOKEN_EOF
;
257 int d
= getcFromInputFile ();
261 d
= getcFromInputFile ();
264 d
= getcFromInputFile ();
273 f
= getcFromInputFile ();
275 while (f
!= EOF
&& ! (d
== '-' && e
== '-' && f
== '>'));
281 token
->type
= TOKEN_COMMENT
;
286 ungetcToInputFile (d
);
287 token
->type
= TOKEN_OTHER
;
290 token
->type
= TOKEN_OTHER
;
292 token
->type
= TOKEN_TAG_START2
;
295 ungetcToInputFile (d
);
296 token
->type
= TOKEN_TAG_START
;
302 int d
= getcFromInputFile ();
304 token
->type
= TOKEN_TAG_END2
;
307 ungetcToInputFile (d
);
308 token
->type
= TOKEN_OTHER
;
313 token
->type
= TOKEN_TAG_END
;
317 token
->type
= TOKEN_EQUAL
;
323 const int delimiter
= c
;
324 c
= getcFromInputFile ();
325 while (c
!= EOF
&& c
!= delimiter
)
327 vStringPut (token
->string
, c
);
328 c
= getcFromInputFile ();
330 token
->type
= TOKEN_STRING
;
338 vStringPut (token
->string
, tolower (c
));
339 c
= getcFromInputFile ();
341 while (!isspace (c
) && c
!= '<' && c
!= '>' && c
!= '/' &&
342 c
!= '=' && c
!= '\'' && c
!= '"' && c
!= EOF
);
344 ungetcToInputFile (c
);
345 token
->type
= TOKEN_NAME
;
351 static void appendText (vString
*text
, vString
*appendedText
)
353 if (text
!= NULL
&& vStringLength (appendedText
) > 0)
355 if (vStringLength (text
) > 0 && vStringLast (text
) == ' ' &&
356 vStringLength (appendedText
) > 0 && vStringChar (appendedText
, 0) == ' ')
358 vStringStripTrailing (text
);
360 vStringCat (text
, appendedText
);
364 static bool readTagContent (tokenInfo
*token
, vString
*text
, long *line
, long *lineOffset
, int depth
)
368 readTokenText (token
, text
!= NULL
);
369 appendText (text
, token
->string
);
373 *line
= getInputLineNumber ();
374 *lineOffset
= getInputLineOffset ();
375 readToken (token
, false);
377 if (type
== TOKEN_TAG_START
)
378 readTag (token
, text
, depth
+ 1);
379 if (type
== TOKEN_COMMENT
|| type
== TOKEN_TAG_START
)
381 readTokenText (token
, text
!= NULL
);
382 appendText (text
, token
->string
);
385 while (type
== TOKEN_COMMENT
|| type
== TOKEN_TAG_START
);
387 return type
== TOKEN_TAG_START2
;
390 static bool skipScriptContent (tokenInfo
*token
, long *line
, long *lineOffset
)
392 bool found_start
= false;
393 bool found_script
= false;
395 long line_tmp
[2] = {0};
396 long lineOffset_tmp
[2] = {0};
402 line_tmp
[0] = getInputLineNumber ();
403 lineOffset_tmp
[0] = getInputLineOffset ();
405 readToken (token
, false);
408 if (type
== TOKEN_TAG_START2
)
411 line_tmp
[1] = line_tmp
[0];
412 lineOffset_tmp
[1] = lineOffset_tmp
[0];
415 && type
== TOKEN_NAME
416 && lookupKeyword (vStringValue (token
->string
), Lang_html
) == KEYWORD_script
)
420 *lineOffset
= lineOffset_tmp
[1];
425 while ((type
!= TOKEN_EOF
) && (!found_script
));
430 static void makeClassRefTags (const char *classes
)
432 vString
*klass
= vStringNew ();
436 if (*classes
&& !isspace (*classes
))
437 vStringPut (klass
, *classes
);
438 else if (!vStringIsEmpty (klass
))
440 makeSimpleRefTag (klass
, K_CLASS
,
441 CLASS_KIND_ATTRIBUTE_ROLE
);
442 vStringClear (klass
);
451 vStringDelete (klass
);
454 static void readTag (tokenInfo
*token
, vString
*text
, int depth
)
456 bool textCreated
= false;
458 readToken (token
, true);
459 if (token
->type
== TOKEN_NAME
)
464 vString
*stylesheet
= NULL
;
465 bool stylesheet_expectation
= false;
467 startTag
= lookupKeyword (vStringValue (token
->string
), Lang_html
);
468 isHeading
= (startTag
== KEYWORD_h1
|| startTag
== KEYWORD_h2
|| startTag
== KEYWORD_h3
);
469 isVoid
= (startTag
>= KEYWORD_area
&& startTag
<= KEYWORD_wbr
);
470 if (text
== NULL
&& isHeading
)
472 text
= vStringNew ();
478 keywordId attribute
= KEYWORD_NONE
;
480 readToken (token
, true);
481 if (token
->type
== TOKEN_NAME
)
482 attribute
= lookupKeyword (vStringValue (token
->string
), Lang_html
);
484 if (attribute
== KEYWORD_class
)
486 readToken (token
, true);
487 if (token
->type
== TOKEN_EQUAL
)
489 readToken (token
, true);
490 if (token
->type
== TOKEN_STRING
)
491 makeClassRefTags (vStringValue (token
->string
));
494 else if (attribute
== KEYWORD_id
)
496 readToken (token
, true);
497 if (token
->type
== TOKEN_EQUAL
)
499 readToken (token
, true);
500 if (token
->type
== TOKEN_STRING
)
501 makeSimpleTag (token
->string
, K_ID
);
504 else if (startTag
== KEYWORD_a
&& attribute
== KEYWORD_name
)
506 readToken (token
, true);
507 if (token
->type
== TOKEN_EQUAL
)
509 readToken (token
, true);
510 if (token
->type
== TOKEN_STRING
|| token
->type
== TOKEN_NAME
)
511 makeSimpleTag (token
->string
, K_ANCHOR
);
514 else if (startTag
== KEYWORD_script
&& attribute
== KEYWORD_src
)
516 readToken (token
, true);
517 if (token
->type
== TOKEN_EQUAL
)
519 readToken (token
, true);
520 if (token
->type
== TOKEN_STRING
)
521 makeSimpleRefTag (token
->string
, K_SCRIPT
,
522 SCRIPT_KIND_EXTERNAL_FILE_ROLE
);
525 else if (startTag
== KEYWORD_link
)
527 if (attribute
== KEYWORD_rel
)
529 readToken (token
, true);
530 if (token
->type
== TOKEN_EQUAL
)
532 readToken (token
, true);
533 if (token
->type
== TOKEN_STRING
&&
534 /* strcmp is not enough:
535 * e.g. <link href="fancy.css"
536 * rel="alternate stylesheet" title="Fancy"> */
537 vStringLength(token
->string
) >= 10 &&
538 strstr (vStringValue (token
->string
), "stylesheet"))
539 stylesheet_expectation
= true;
542 else if (attribute
== KEYWORD_href
)
544 readToken (token
, true);
545 if (token
->type
== TOKEN_EQUAL
)
547 readToken (token
, true);
548 if (token
->type
== TOKEN_STRING
)
550 if (stylesheet
== NULL
)
551 stylesheet
= vStringNewCopy (token
->string
);
553 vStringCopy (stylesheet
, token
->string
);
557 if (stylesheet_expectation
&& stylesheet
&& !vStringIsEmpty (stylesheet
))
559 makeSimpleRefTag (stylesheet
, K_STYELSHEET
,
560 STYLESHEET_KIND_EXTERNAL_FILE_ROLE
);
561 stylesheet_expectation
= false;
563 vStringClear (stylesheet
);
567 while (token
->type
!= TOKEN_TAG_END
&& token
->type
!= TOKEN_TAG_END2
&&
568 token
->type
!= TOKEN_EOF
);
570 vStringDelete (stylesheet
);
573 if (!isVoid
&& token
->type
== TOKEN_TAG_END
&& depth
< MAX_DEPTH
)
575 long startSourceLineNumber
= getSourceLineNumber ();
576 long startLineNumber
= getInputLineNumber ();
577 long startLineOffset
= getInputLineOffset ();
582 if (startTag
== KEYWORD_script
)
584 bool script
= skipScriptContent (token
, &endLineNumber
, &endLineOffset
);
586 makePromise ("JavaScript", startLineNumber
, startLineOffset
,
587 endLineNumber
, endLineOffset
, startSourceLineNumber
);
588 readToken (token
, true);
592 tag_start2
= readTagContent (token
, text
, &endLineNumber
, &endLineOffset
, depth
);
595 readToken (token
, true);
596 if (isHeading
&& textCreated
&& vStringLength (text
) > 0)
598 keywordId endTag
= lookupKeyword (vStringValue (token
->string
), Lang_html
);
599 if (startTag
== endTag
)
601 htmlKind headingKind
;
603 if (startTag
== KEYWORD_h1
)
604 headingKind
= K_HEADING1
;
605 else if (startTag
== KEYWORD_h2
)
606 headingKind
= K_HEADING2
;
608 headingKind
= K_HEADING3
;
610 vStringStripLeading (text
);
611 vStringStripTrailing (text
);
612 makeSimpleTag (text
, headingKind
);
615 else if (startTag
== KEYWORD_style
)
617 keywordId endTag
= lookupKeyword (vStringValue (token
->string
), Lang_html
);
618 if (startTag
== endTag
)
619 makePromise ("CSS", startLineNumber
, startLineOffset
,
620 endLineNumber
, endLineOffset
, startSourceLineNumber
);
623 readToken (token
, true);
630 vStringDelete (text
);
633 static void findHtmlTags (void)
637 token
.string
= vStringNew ();
641 readToken (&token
, true);
642 if (token
.type
== TOKEN_TAG_START
)
643 readTag (&token
, NULL
, 0);
645 while (token
.type
!= TOKEN_EOF
);
647 vStringDelete (token
.string
);
650 static void initialize (const langType language
)
652 Lang_html
= language
;
655 /* parser definition */
656 extern parserDefinition
* HtmlParser (void)
658 static const char *const extensions
[] = { "htm", "html", NULL
};
659 parserDefinition
* def
= parserNew ("HTML");
660 def
->kindTable
= HtmlKinds
;
661 def
->kindCount
= ARRAY_SIZE (HtmlKinds
);
662 def
->extensions
= extensions
;
663 def
->parser
= findHtmlTags
;
664 def
->initialize
= initialize
;
665 def
->keywordTable
= HtmlKeywordTable
;
666 def
->keywordCount
= ARRAY_SIZE (HtmlKeywordTable
);