src/document/html/parser/parse.c

   1 /* HTML core parser routines */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <errno.h>
   8 #include <stdarg.h>
   9 #include <stdio.h>
  10 #include <stdlib.h>
  11 #include <string.h>
  12
  13 #include "elinks.h"
  14
  15 #include "document/css/apply.h"
  16 #include "document/css/parser.h"
  17 #include "document/html/parser/forms.h"
  18 #include "document/html/parser/general.h"
  19 #include "document/html/parser/link.h"
  20 #include "document/html/parser/parse.h"
  21 #include "document/html/parser/stack.h"
  22 #include "document/html/parser.h"
  23 #include "document/options.h"
  24 #include "intl/charsets.h"
  25 #include "util/conv.h"
  26 #include "util/error.h"
  27 #include "util/fastfind.h"
  28 #include "util/memdebug.h"
  29 #include "util/memory.h"
  30 #include "util/string.h"
  31
  32 /* Unsafe macros */
  33 #include "document/html/internal.h"
  34
  35
  36 #define end_of_tag(c) ((c) == '>' || (c) == '<')
  37
  38 static inline int
  39 atchr(register unsigned char c)
  40 {
  41         return (c < 127 && (c > '>' || (c > ' ' && c != '=' && !end_of_tag(c))));
  42 }
  43
/* This function eats one html element. */
/* - e is pointer to the beginning of the element (*e must be '<')
 * - eof is pointer to the end of scanned area
 * - parsed element name is stored in name, its length is namelen
 * - first attribute is stored in attr
 * - end points to first character behind the html element */
/* It returns -1 when it failed (returned values in pointers are invalid) and
 * 0 for success. */
/* Each of name, attr and end may be NULL if the caller is not interested;
 * namelen is only stored when name is given as well. For a closing tag the
 * stored name starts at the '/' and namelen counts it. Reaching eof before
 * the element is complete is reported as failure. Nothing is copied: all
 * returned pointers point into the scanned buffer. */
int
parse_element(register unsigned char *e, unsigned char *eof,
              unsigned char **name, int *namelen,
              unsigned char **attr, unsigned char **end)
{
/* Step to the next character; give up with -1 as soon as eof is reached, so
 * that *e is always readable after a successful next_char(). */
#define next_char() if (++e == eof) return -1;

        assert(e && eof);
        if (e >= eof || *e != '<') return -1;

        next_char();
        /* The name is recorded before an optional '/' is skipped. */
        if (name) *name = e;

        if (*e == '/') next_char();
        if (!isident(*e)) return -1;

        while (isident(*e)) next_char();

        /* The name must be followed by something that can legitimately come
         * after an element name. */
        if (!isspace(*e) && !end_of_tag(*e) && *e != '/' && *e != ':' && *e != '=')
                return -1;

        if (name && namelen) *namelen = e - *name;

        while (isspace(*e) || *e == '/' || *e == ':') next_char();

        /* Skip bad attribute */
        while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();

        /* Start of the attribute area (or the end of the tag if there are no
         * attributes). */
        if (attr) *attr = e;

next_attr:
        while (isspace(*e)) next_char();

        /* Skip bad attribute */
        while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();

        if (end_of_tag(*e)) goto end;

        /* Attribute name, then optional whitespace before a possible '='. */
        while (atchr(*e)) next_char();
        while (isspace(*e)) next_char();

        if (*e != '=') {
                /* Attribute without a value. */
                if (end_of_tag(*e)) goto end;
                goto next_attr;
        }
        next_char();

        while (isspace(*e)) next_char();

        if (isquote(*e)) {
                unsigned char quote = *e;

/* quoted_value: */
                /* Inside quotes everything up to the matching quote belongs
                 * to the value, including '<' and '>'. */
                next_char();
                while (*e != quote) next_char();
                next_char();
                /* The following apparently handles the case of <foo
                 * id="a""b">, however that is very rare and probably not
                 * conforming. More frequent (and mishandling it more fatal) is
                 * probably the typo of <foo id="a""> - we can handle it as
                 * long as this is commented out. --pasky */
                /* if (*e == quote) goto quoted_value; */
        } else {
                /* Unquoted value: runs up to whitespace or the end of tag. */
                while (!isspace(*e) && !end_of_tag(*e)) next_char();
        }

        while (isspace(*e)) next_char();

        if (!end_of_tag(*e)) goto next_attr;

end:
        /* A terminating '>' is consumed; a '<' is left in place as it already
         * starts the next element. */
        if (end) *end = e + (*e == '>');

        return 0;
}
 127
 128
 129 #define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, 0xFF)
 130
 131 #define add_chr(s, l, c)                                                \
 132         do {                                                            \
 133                 if (!realloc_chrs(&(s), l)) return NULL;                \
 134                 (s)[(l)++] = (c);                                       \
 135         } while (0)
 136
 137 unsigned char *
 138 get_attr_value(register unsigned char *e, unsigned char *name,
 139                int cp, enum html_attr_flags flags)
 140 {
 141         unsigned char *n;
 142         unsigned char *name_start;
 143         unsigned char *attr = NULL;
 144         int attrlen = 0;
 145         int found;
 146
 147 next_attr:
 148         skip_space(e);
 149         if (end_of_tag(*e) || !atchr(*e)) goto parse_error;
 150         n = name;
 151         name_start = e;
 152
 153         while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++;
 154         found = !*n && !atchr(*e);
 155
 156         if (found && (flags & HTML_ATTR_TEST)) return name_start;
 157
 158         while (atchr(*e)) e++;
 159         skip_space(e);
 160         if (*e != '=') {
 161                 if (found) goto found_endattr;
 162                 goto next_attr;
 163         }
 164         e++;
 165         skip_space(e);
 166
 167         if (found) {
 168                 if (!isquote(*e)) {
 169                         while (!isspace(*e) && !end_of_tag(*e)) {
 170                                 if (!*e) goto parse_error;
 171                                 add_chr(attr, attrlen, *e);
 172                                 e++;
 173                         }
 174                 } else {
 175                         unsigned char quote = *e;
 176
 177 /* parse_quoted_value: */
 178                         while (*(++e) != quote) {
 179                                 if (*e == ASCII_CR) continue;
 180                                 if (!*e) goto parse_error;
 181                                 if (*e != ASCII_TAB && *e != ASCII_LF)
 182                                         add_chr(attr, attrlen, *e);
 183                                 else if (!(flags & HTML_ATTR_EAT_NL))
 184                                         add_chr(attr, attrlen, ' ');
 185                         }
 186                         e++;
 187                         /* The following apparently handles the case of <foo
 188                          * id="a""b">, however that is very rare and probably
 189                          * not conforming. More frequent (and mishandling it
 190                          * more fatal) is probably the typo of <foo id="a""> -
 191                          * we can handle it as long as this is commented out.
 192                          * --pasky */
 193 #if 0
 194                         if (*e == quote) {
 195                                 add_chr(attr, attrlen, *e);
 196                                 goto parse_quoted_value;
 197                         }
 198 #endif
 199                 }
 200
 201 found_endattr:
 202                 add_chr(attr, attrlen, '\0');
 203                 attrlen--;
 204
 205                 if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */
 206                     memchr(attr, '&', attrlen)) {
 207                         unsigned char *saved_attr = attr;
 208
 209                         attr = convert_string(NULL, saved_attr, attrlen, cp,
 210                                               CSM_QUERY, NULL, NULL, NULL);
 211                         mem_free(saved_attr);
 212                 }
 213
 214                 set_mem_comment(attr, name, strlen(name));
 215                 return attr;
 216
 217         } else {
 218                 if (!isquote(*e)) {
 219                         while (!isspace(*e) && !end_of_tag(*e)) {
 220                                 if (!*e) goto parse_error;
 221                                 e++;
 222                         }
 223                 } else {
 224                         unsigned char quote = *e;
 225
 226                         do {
 227                                 while (*(++e) != quote)
 228                                         if (!*e) goto parse_error;
 229                                 e++;
 230                         } while (/* See above. *e == quote */ 0);
 231                 }
 232         }
 233
 234         goto next_attr;
 235
 236 parse_error:
 237         mem_free_if(attr);
 238         return NULL;
 239 }
 240
 241 #undef add_chr
 242
 243
 244 /* Extract numerical value of attribute @name.
 245  * It will return a positive integer value on success,
 246  * or -1 on error. */
 247 int
 248 get_num(unsigned char *a, unsigned char *name, int cp)
 249 {
 250         unsigned char *al = get_attr_val(a, name, cp);
 251         int result = -1;
 252
 253         if (al) {
 254                 unsigned char *end;
 255                 long num;
 256
 257                 errno = 0;
 258                 num = strtol(al, (char **) &end, 10);
 259                 if (!errno && *al && !*end && num >= 0 && num <= INT_MAX)
 260                         result = (int) num;
 261
 262                 mem_free(al);
 263         }
 264
 265         return result;
 266 }
 267
 268 /* Parse 'width[%],....'-like attribute @name of element @a.  If @limited is
 269  * set, it will limit the width value to the current usable width. Note that
 270  * @limited must be set to be able to parse percentage widths. */
 271 /* The function returns width in characters or -1 in case of error. */
 272 int
 273 get_width(unsigned char *a, unsigned char *name, int limited,
 274           struct html_context *html_context)
 275 {
 276         unsigned char *value = get_attr_val(a, name, html_context->options->cp);
 277         unsigned char *str = value;
 278         unsigned char *end;
 279         int percentage = 0;
 280         int len;
 281         long width;
 282
 283         if (!value) return -1;
 284
 285         /* Skip spaces at start of string if any. */
 286         skip_space(str);
 287
 288         /* Search for end of string or ',' character (ie. in "100,200") */
 289         for (len = 0; str[len] && str[len] != ','; len++);
 290
 291         /* Go back, and skip spaces after width if any. */
 292         while (len && isspace(str[len - 1])) len--;
 293         if (!len) { mem_free(value); return -1; } /* Nothing to parse. */
 294
 295         /* Is this a percentage ? */
 296         if (str[len - 1] == '%') len--, percentage = 1;
 297
 298         /* Skip spaces between width number and percentage if any. */
 299         while (len && isspace(str[len - 1])) len--;
 300         if (!len) { mem_free(value); return -1; } /* Nothing to parse. */
 301
 302         /* Shorten the string a bit, so strtoul() will work on useful
 303          * part of it. */
 304         str[len] = '\0';
 305
 306         /* Convert to number if possible. */
 307         errno = 0;
 308         width = strtoul((char *) str, (char **) &end, 10);
 309
 310         /* @end points into the @value string so check @end position
 311          * before freeing @value. */
 312         /* We will accept floats but ceil() them. */
 313         if (errno || (*end && *end != '.') || width >= INT_MAX) {
 314                 /* Not a valid number. */
 315                 mem_free(value);
 316                 return -1;
 317         }
 318
 319         mem_free(value);
 320
 321 #define WIDTH_PIXELS2CHARS(width) ((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH;
 322
 323         if (limited) {
 324                 int maxwidth = get_html_max_width();
 325
 326                 if (percentage) {
 327                         /* Value is a percentage. */
 328                         width = width * maxwidth / 100;
 329                 } else {
 330                         /* Value is a number of pixels, makes an approximation. */
 331                         width = WIDTH_PIXELS2CHARS(width);
 332                 }
 333
 334                 if (width > maxwidth)
 335                         width = maxwidth;
 336
 337         } else {
 338                 if (percentage) {
 339                         /* No sense, we need @limited and @maxwidth for percentage. */
 340                         return -1;
 341                 } else {
 342                         /* Value is a number of pixels, makes an approximation,
 343                          * no limit here */
 344                         width = WIDTH_PIXELS2CHARS(width);
 345                 }
 346         }
 347
 348 #undef WIDTH_PIXELS2CHARS
 349
 350         if (width < 0)
 351                 width = 0;
 352
 353         return width;
 354 }
 355
 356
 357 unsigned char *
 358 skip_comment(unsigned char *html, unsigned char *eof)
 359 {
 360         if (html + 4 <= eof && html[2] == '-' && html[3] == '-') {
 361                 html += 4;
 362                 while (html < eof) {
 363                         if (html + 2 <= eof && html[0] == '-' && html[1] == '-') {
 364                                 html += 2;
 365                                 while (html < eof && *html == '-') html++;
 366                                 while (html < eof && isspace(*html)) html++;
 367                                 if (html >= eof) return eof;
 368                                 if (*html == '>') return html + 1;
 369                                 continue;
 370                         }
 371                         html++;
 372                 }
 373
 374         } else {
 375                 html += 2;
 376                 while (html < eof) {
 377                         if (html[0] == '>') return html + 1;
 378                         html++;
 379                 }
 380         }
 381
 382         return eof;
 383 }
 384
 385
 386
 387
/* Category of an element, stored in struct element_info.type for every
 * entry of the elements[] table. How each category is treated is decided by
 * code that is not part of this excerpt.
 * NOTE(review): going by the names, the categories describe whether an
 * element may contain another instance of itself, whether it has a closing
 * tag at all, and the special case of <li> - confirm against the code that
 * consumes the type. */
enum element_type {
        ET_NESTABLE,
        ET_NON_NESTABLE,
        ET_NON_PAIRABLE,
        ET_LI,
};
 394
/* Description of one known HTML element; see the elements[] table. */
struct element_info {
        /* Element name, uppercase. */
        unsigned char *name;

        /* Element handler. This does the relevant arguments processing and
         * formatting (by calling renderer hooks). Note that in a few cases,
         * this is just a placeholder and the element is given special care
         * in start_element() (which is also where we call these handlers). */
        element_handler_T *open;

        /* Optional second handler; NULL for most elements in the table.
         * NOTE(review): presumably called when the element is closed -
         * confirm against the code that uses it. */
        element_handler_T *close;

        /* How many line-breaks to ensure we have before and after an element.
         * Value of 1 means the element will be on a line on its own, value
         * of 2 means that it will also have empty lines before and after.
         * Note that this does not add up - it just ensures that there is
         * at least so many linebreaks, but does not add more if that is the
         * case. Therefore, something like e.g. </pre></p> will add only two
         * linebreaks, not four. */
        /* In some stack killing logic, we use some weird heuristic based on
         * whether an element is block or inline. That is determined from
         * whether this attribute is zero or non-zero. */
        int linebreak;

        /* Category of the element, one of the ET_* values. */
        enum element_type type;
};
 421
/* Table of all elements the parser knows about. The columns are: name, open
 * handler, close handler, linebreak count and element type (see struct
 * element_info). The table ends with an entry whose name is NULL.
 * The entries are listed in alphabetical order of the name.
 * NOTE(review): the build without USE_FASTFIND presumably looks elements up
 * with a binary search using compar(), which would make this ordering
 * mandatory - confirm before inserting an entry out of order. */
static struct element_info elements[] = {
 {"A",           html_a,           NULL,                 0, ET_NON_NESTABLE},
 {"ABBR",        html_italic,      NULL,                 0, ET_NESTABLE    },
 {"ADDRESS",     html_address,     NULL,                 2, ET_NESTABLE    },
 {"APPLET",      html_applet,      NULL,                 1, ET_NON_PAIRABLE},
 {"B",           html_bold,        NULL,                 0, ET_NESTABLE    },
 {"BASE",        html_base,        NULL,                 0, ET_NON_PAIRABLE},
 {"BASEFONT",    html_font,        NULL,                 0, ET_NON_PAIRABLE},
 {"BLOCKQUOTE",  html_blockquote,  NULL,                 2, ET_NESTABLE    },
 {"BODY",        html_body,        NULL,                 0, ET_NESTABLE    },
 {"BR",          html_br,          NULL,                 1, ET_NON_PAIRABLE},
 {"BUTTON",      html_button,      NULL,                 0, ET_NESTABLE    },
 {"CAPTION",     html_center,      NULL,                 1, ET_NESTABLE    },
 {"CENTER",      html_center,      NULL,                 1, ET_NESTABLE    },
 {"CODE",        html_fixed,       NULL,                 0, ET_NESTABLE    },
 {"DD",          html_dd,          NULL,                 1, ET_NON_PAIRABLE},
 {"DFN",         html_bold,        NULL,                 0, ET_NESTABLE    },
 {"DIR",         html_ul,          NULL,                 2, ET_NESTABLE    },
 {"DIV",         html_linebrk,     NULL,                 1, ET_NESTABLE    },
 {"DL",          html_dl,          NULL,                 2, ET_NESTABLE    },
 {"DT",          html_dt,          NULL,                 1, ET_NON_PAIRABLE},
 {"EM",          html_italic,      NULL,                 0, ET_NESTABLE    },
 {"EMBED",       html_embed,       NULL,                 0, ET_NON_PAIRABLE},
 {"FIXED",       html_fixed,       NULL,                 0, ET_NESTABLE    },
 {"FONT",        html_font,        NULL,                 0, ET_NESTABLE    },
 {"FORM",        html_form,        NULL,                 1, ET_NESTABLE    },
 {"FRAME",       html_frame,       NULL,                 1, ET_NON_PAIRABLE},
 {"FRAMESET",    html_frameset,    NULL,                 1, ET_NESTABLE    },
 {"H1",          html_h1,          NULL,                 2, ET_NON_NESTABLE},
 {"H2",          html_h2,          NULL,                 2, ET_NON_NESTABLE},
 {"H3",          html_h3,          NULL,                 2, ET_NON_NESTABLE},
 {"H4",          html_h4,          NULL,                 2, ET_NON_NESTABLE},
 {"H5",          html_h5,          NULL,                 2, ET_NON_NESTABLE},
 {"H6",          html_h6,          NULL,                 2, ET_NON_NESTABLE},
 {"HEAD",        html_head,        NULL,                 0, ET_NESTABLE    },
 {"HR",          html_hr,          NULL,                 2, ET_NON_PAIRABLE},
 {"HTML",        html_html,        html_html_close,      0, ET_NESTABLE    },
 {"I",           html_italic,      NULL,                 0, ET_NESTABLE    },
 {"IFRAME",      html_iframe,      NULL,                 1, ET_NON_PAIRABLE},
 {"IMG",         html_img,         NULL,                 0, ET_NON_PAIRABLE},
 {"INPUT",       html_input,       NULL,                 0, ET_NON_PAIRABLE},
 {"LI",          html_li,          NULL,                 1, ET_LI          },
 {"LINK",        html_link,        NULL,                 1, ET_NON_PAIRABLE},
 {"LISTING",     html_pre,         NULL,                 2, ET_NESTABLE    },
 {"MENU",        html_ul,          NULL,                 2, ET_NESTABLE    },
 {"META",        html_meta,        NULL,                 0, ET_NON_PAIRABLE},
 {"NOFRAMES",    html_noframes,    NULL,                 0, ET_NESTABLE    },
 {"NOSCRIPT",    html_noscript,    NULL,                 0, ET_NESTABLE    },
 {"OBJECT",      html_object,      NULL,                 1, ET_NON_PAIRABLE},
 {"OL",          html_ol,          NULL,                 2, ET_NESTABLE    },
 {"OPTION",      html_option,      NULL,                 1, ET_NON_PAIRABLE},
 {"P",           html_p,           NULL,                 2, ET_NON_NESTABLE},
 {"PRE",         html_pre,         NULL,                 2, ET_NESTABLE    },
 {"Q",           html_quote,       html_quote_close,     0, ET_NESTABLE    },
 {"S",           html_underline,   NULL,                 0, ET_NESTABLE    },
 {"SCRIPT",      html_script,      NULL,                 0, ET_NESTABLE    },
 {"SELECT",      html_select,      NULL,                 0, ET_NESTABLE    },
 {"SPAN",        html_span,        NULL,                 0, ET_NESTABLE    },
 {"STRIKE",      html_underline,   NULL,                 0, ET_NESTABLE    },
 {"STRONG",      html_bold,        NULL,                 0, ET_NESTABLE    },
 {"STYLE",       html_style,       html_style_close,     0, ET_NESTABLE    },
 {"SUB",         html_subscript,   html_subscript_close, 0, ET_NESTABLE    },
 {"SUP",         html_superscript, NULL,                 0, ET_NESTABLE    },
 {"TABLE",       html_table,       NULL,                 2, ET_NESTABLE    },
 {"TD",          html_td,          NULL,                 0, ET_NESTABLE    },
 {"TEXTAREA",    html_textarea,    NULL,                 0, ET_NON_PAIRABLE},
 {"TH",          html_th,          NULL,                 0, ET_NESTABLE    },
 {"TITLE",       html_title,       NULL,                 0, ET_NESTABLE    },
 {"TR",          html_tr,          NULL,                 1, ET_NESTABLE    },
 {"TT",          html_tt,          NULL,                 0, ET_NON_NESTABLE},
 {"U",           html_underline,   NULL,                 0, ET_NESTABLE    },
 {"UL",          html_ul,          NULL,                 2, ET_NESTABLE    },
 {"XMP",         html_xmp,         html_xmp_close,       2, ET_NESTABLE    },
 {NULL,          NULL,             NULL,                 0, ET_NESTABLE    },
};

/* Number of real entries in elements[], not counting the terminating entry
 * with the NULL name. */
#define NUMBER_OF_TAGS (sizeof_array(elements) - 1)
 499
 500
 501 #ifndef USE_FASTFIND
 502
 503 static int
 504 compar(const void *a, const void *b)
 505 {
 506         return strcasecmp(((struct element_info *) a)->name,
 507                           ((struct element_info *) b)->name);
 508 }
 509
 510 #else
 511
/* Cursor into elements[] shared by tags_list_reset() and tags_list_next();
 * it points to the entry that tags_list_next() will hand out next. */
static struct element_info *internal_pointer;

/* Reset internal list pointer */
/* Rewinds the cursor to the first entry of elements[]. Registered as the
 * reset callback of ff_tags_index. */
static void
tags_list_reset(void)
{
        internal_pointer = elements;
}
 520
 521 /* Returns a pointer to a struct that contains
 522  * current key and data pointers and increment
 523  * internal pointer.
 524  * It returns NULL when key is NULL. */
 525 static struct fastfind_key_value *
 526 tags_list_next(void)
 527 {
 528         static struct fastfind_key_value kv;
 529
 530         if (!internal_pointer->name) return NULL;
 531
 532         kv.key = internal_pointer->name;
 533         kv.data = internal_pointer;
 534
 535         internal_pointer++;
 536
 537         return &kv;
 538 }
 539
/* Fastfind index over the element names, named "tags_lookup". Its keys are
 * enumerated through tags_list_reset() and tags_list_next(). It is set up
 * by init_tags_lookup() and released by free_tags_lookup(). */
static struct fastfind_index ff_tags_index
        = INIT_FASTFIND_INDEX("tags_lookup", tags_list_reset, tags_list_next);
 542
 543 #endif /* USE_FASTFIND */
 544
 545
/* Prepare the element name lookup. With USE_FASTFIND this builds the
 * fastfind index over the element names (requesting FF_COMPRESS); without
 * it there is nothing to do. */
void
init_tags_lookup(void)
{
#ifdef USE_FASTFIND
        fastfind_index(&ff_tags_index, FF_COMPRESS);
#endif
}
 553
/* Counterpart of init_tags_lookup(): with USE_FASTFIND it passes the
 * fastfind index to fastfind_done(); without it there is nothing to do. */
void
free_tags_lookup(void)
{
#ifdef USE_FASTFIND
        fastfind_done(&ff_tags_index);
#endif
}
 561
 562
/* Forward declaration; the function is used before its definition, which is
 * further down in this file. */
static unsigned char *process_element(unsigned char *name, int namelen, int endingtag,
                unsigned char *html, unsigned char *prev_html,
                unsigned char *eof, unsigned char *attr,
                struct html_context *html_context);
 567
 568 /* Count the consecutive newline entity references (e.g. "&#13;") at
 569  * the beginning of the range from @html to @eof.  Store the number of
 570  * newlines to *@newlines_out and return the address where they end.
 571  *
 572  * This function currently requires a semicolon at the end of any
 573  * entity reference, and does not support U+2028 LINE SEPARATOR and
 574  * U+2029 PARAGRAPH SEPARATOR.  */
 575 static const unsigned char *
 576 count_newline_entities(const unsigned char *html, const unsigned char *eof,
 577                        int *newlines_out)
 578 {
 579         int newlines = 0;
 580         int prev_was_cr = 0; /* treat CRLF as one newline, not two */
 581
 582         while ((html + 5 < eof && html[0] == '&' && html[1] == '#')) {
 583                 const unsigned char *peek = html + 2;
 584                 int this_is_cr;
 585
 586                 if (*peek == 'x' || *peek == 'X') {
 587                         ++peek;
 588                         while (peek < eof && *peek == '0')
 589                                 ++peek;
 590                         if (peek == eof)
 591                                 break;
 592                         else if (*peek == 'a' || *peek == 'A')
 593                                 this_is_cr = 0;
 594                         else if (*peek == 'd' || *peek == 'D')
 595                                 this_is_cr = 1;
 596                         else
 597                                 break;
 598                         ++peek;
 599                 } else {
 600                         while (peek < eof && *peek == '0')
 601                                 ++peek;
 602                         if (eof - peek < 2 || *peek != '1')
 603                                 break;
 604                         else if (peek[1] == '0')
 605                                 this_is_cr = 0;
 606                         else if (peek[1] == '3')
 607                                 this_is_cr = 1;
 608                         else
 609                                 break;
 610                         peek += 2;
 611                 }
 612                 /* @peek should now be pointing to the semicolon of
 613                  * e.g. "&#00013;" or "&#x00a;".  Or more digits might
 614                  * follow.  */
 615                 if (peek == eof || *peek != ';')
 616                         break;
 617                 ++peek;
 618
 619                 if (this_is_cr || !prev_was_cr)
 620                         ++newlines;
 621                 prev_was_cr = this_is_cr;
 622                 html = peek;
 623         }
 624
 625         *newlines_out = newlines;
 626         return html;
 627 }
 628
 629 void
 630 parse_html(unsigned char *html, unsigned char *eof,
 631            struct part *part, unsigned char *head,
 632            struct html_context *html_context)
 633 {
 634         unsigned char *base_pos = html;
 635         int noupdate = 0;
 636
 637         html_context->putsp = HTML_SPACE_SUPPRESS;
 638         html_context->line_breax = html_context->table_level ? 2 : 1;
 639         html_context->position = 0;
 640         html_context->was_br = 0;
 641         html_context->was_li = 0;
 642         html_context->was_body = 0;
 643 /*      html_context->was_body_background = 0; */
 644         html_context->part = part;
 645         html_context->eoff = eof;
 646         if (head) process_head(html_context, head);
 647
 648 main_loop:
 649         while (html < eof) {
 650                 unsigned char *name, *attr, *end;
 651                 int namelen, endingtag;
 652                 int dotcounter = 0;
 653
 654                 if (!noupdate) {
 655                         html_context->part = part;
 656                         html_context->eoff = eof;
 657                         base_pos = html;
 658                 } else {
 659                         noupdate = 0;
 660                 }
 661
 662                 if (isspace(*html) && !html_is_preformatted()) {
 663                         unsigned char *h = html;
 664
 665                         while (h < eof && isspace(*h))
 666                                 h++;
 667                         if (h + 1 < eof && h[0] == '<' && h[1] == '/') {
 668                                 if (!parse_element(h, eof, &name, &namelen, &attr, &end)) {
 669                                         put_chrs(html_context, base_pos, html - base_pos);
 670                                         base_pos = html = h;
 671                                         html_context->putsp = HTML_SPACE_ADD;
 672                                         goto element;
 673                                 }
 674                         }
 675                         html++;
 676                         if (!(html_context->position + (html - base_pos - 1)))
 677                                 goto skip_w; /* ??? */
 678                         if (*(html - 1) == ' ') {       /* Do not replace with isspace() ! --Zas */
 679                                 /* BIG performance win; not sure if it doesn't cause any bug */
 680                                 if (html < eof && !isspace(*html)) {
 681                                         noupdate = 1;
 682                                         continue;
 683                                 }
 684                                 put_chrs(html_context, base_pos, html - base_pos);
 685                         } else {
 686                                 put_chrs(html_context, base_pos, html - base_pos - 1);
 687                                 put_chrs(html_context, " ", 1);
 688                         }
 689
 690 skip_w:
 691                         while (html < eof && isspace(*html))
 692                                 html++;
 693                         continue;
 694                 }
 695
 696                 if (html_is_preformatted()) {
 697                         html_context->putsp = HTML_SPACE_NORMAL;
 698                         if (*html == ASCII_TAB) {
 699                                 put_chrs(html_context, base_pos, html - base_pos);
 700                                 put_chrs(html_context, "        ",
 701                                          8 - (html_context->position % 8));
 702                                 html++;
 703                                 continue;
 704
 705                         } else if (*html == ASCII_CR || *html == ASCII_LF) {
 706                                 put_chrs(html_context, base_pos, html - base_pos);
 707                                 if (html - base_pos == 0 && html_context->line_breax > 0)
 708                                         html_context->line_breax--;
 709 next_break:
 710                                 if (*html == ASCII_CR && html < eof - 1
 711                                     && html[1] == ASCII_LF)
 712                                         html++;
 713                                 ln_break(html_context, 1);
 714                                 html++;
 715                                 if (*html == ASCII_CR || *html == ASCII_LF) {
 716                                         html_context->line_breax = 0;
 717                                         goto next_break;
 718                                 }
 719                                 continue;
 720
 721                         } else if (html + 5 < eof && *html == '&') {
 722                                 /* Really nasty hack to make &#13; handling in
 723                                  * <pre>-tags lynx-compatible. It works around
 724                                  * the entity handling done in the renderer,
 725                                  * since checking #13 value there would require
 726                                  * something along the lines of NBSP_CHAR or
 727                                  * checking for '\n's in AT_PREFORMATTED text. */
 728                                 /* See bug 52 and 387 for more info. */
 729                                 int length = html - base_pos;
 730                                 int newlines;
 731
 732                                 html = (unsigned char *) count_newline_entities(html, eof, &newlines);
 733                                 if (newlines) {
 734                                         put_chrs(html_context, base_pos, length);
 735                                         ln_break(html_context, newlines);
 736                                         continue;
 737                                 }
 738                         }
 739                 }
 740
 741                 while (*html < ' ') {
 742                         if (html - base_pos)
 743                                 put_chrs(html_context, base_pos, html - base_pos);
 744
 745                         dotcounter++;
 746                         base_pos = ++html;
 747                         if (*html >= ' ' || isspace(*html) || html >= eof) {
 748                                 unsigned char *dots = fmem_alloc(dotcounter);
 749
 750                                 if (dots) {
 751                                         memset(dots, '.', dotcounter);
 752                                         put_chrs(html_context, dots, dotcounter);
 753                                         fmem_free(dots);
 754                                 }
 755                                 goto main_loop;
 756                         }
 757                 }
 758
 759                 if (html + 2 <= eof && html[0] == '<' && (html[1] == '!' || html[1] == '?')
 760                     && !(html_context->was_xmp || html_context->was_style)) {
 761                         put_chrs(html_context, base_pos, html - base_pos);
 762                         html = skip_comment(html, eof);
 763                         continue;
 764                 }
 765
 766                 if (*html != '<' || parse_element(html, eof, &name, &namelen, &attr, &end)) {
 767                         html++;
 768                         noupdate = 1;
 769                         continue;
 770                 }
 771
 772 element:
 773                 endingtag = *name == '/'; name += endingtag; namelen -= endingtag;
 774                 if (!endingtag && html_context->putsp == HTML_SPACE_ADD && !html_top->invisible)
 775                         put_chrs(html_context, " ", 1);
 776                 put_chrs(html_context, base_pos, html - base_pos);
 777                 if (!html_is_preformatted() && !endingtag && html_context->putsp == HTML_SPACE_NORMAL) {
 778                         unsigned char *ee = end;
 779                         unsigned char *nm;
 780
 781                         while (!parse_element(ee, eof, &nm, NULL, NULL, &ee))
 782                                 if (*nm == '/')
 783                                         goto ng;
 784                         if (ee < eof && isspace(*ee)) {
 785                                 put_chrs(html_context, " ", 1);
 786                         }
 787                 }
 788 ng:
 789                 html = process_element(name, namelen, endingtag, end, html, eof, attr, html_context);
 790         }
 791
 792         if (noupdate) put_chrs(html_context, base_pos, html - base_pos);
 793         ln_break(html_context, 1);
 794         /* Restore the part in case the html_context was trashed in the last
 795          * iteration so that when destroying the stack in the caller we still
 796          * get the right part pointer. */
 797         html_context->part = part;
 798         html_context->putsp = HTML_SPACE_SUPPRESS;
 799         html_context->position = 0;
 800         html_context->was_br = 0;
 801 }
 802
 803 static unsigned char *
 804 start_element(struct element_info *ei,
 805               unsigned char *name, int namelen,
 806               unsigned char *html,
 807               unsigned char *eof, unsigned char *attr,
 808               struct html_context *html_context)
 809 {
 810 #define ELEMENT_RENDER_PROLOGUE \
 811         ln_break(html_context, ei->linebreak); \
 812         a = get_attr_val(attr, "id", html_context->options->cp); \
 813         if (a) { \
 814                 html_context->special_f(html_context, SP_TAG, a); \
 815                 mem_free(a); \
 816         }
 817
 818         unsigned char *a;
 819         struct par_attrib old_format;
 820         int restore_format;
 821 #ifdef CONFIG_CSS
 822         struct css_selector *selector = NULL;
 823 #endif
 824
 825         if (html_top->type == ELEMENT_WEAK) {
 826                 pop_html_element(html_context);
 827         }
 828
 829         /* We try to process nested <script> if we didn't process the parent
 830          * one. */
 831         if (html_top->invisible
 832             && (ei->open != html_script || html_top->invisible < 2)) {
 833                 ELEMENT_RENDER_PROLOGUE
 834                 return html;
 835         }
 836
 837         restore_format = html_is_preformatted();
 838         old_format = par_format;
 839
 840         /* Support for <meta refresh="..."> inside <body>. (bug 700) */
 841         if (ei->open == html_meta && html_context->was_body) {
 842                 html_handle_body_meta(html_context, name - 1, eof);
 843                 html_context->was_body = 0;
 844         }
 845
 846 #ifdef CONFIG_CSS
 847         if (ei->open == html_style && html_context->options->css_enable) {
 848                 css_parse_stylesheet(&html_context->css_styles,
 849                                      html_context->base_href, html, eof);
 850         }
 851 #endif
 852
 853         if (ei->type == ET_NON_NESTABLE || ei->type == ET_LI) {
 854                 struct html_element *e;
 855
 856                 if (ei->type == ET_NON_NESTABLE) {
 857                         foreach (e, html_context->stack) {
 858                                 if (e->type < ELEMENT_KILLABLE) break;
 859                                 if (is_block_element(e) || is_inline_element(ei)) break;
 860                         }
 861                 } else {
 862                         foreach (e, html_context->stack) {
 863                                 if (is_block_element(e) && is_inline_element(ei)) break;
 864                                 if (e->type < ELEMENT_KILLABLE) break;
 865                                 if (!strlcasecmp(e->name, e->namelen, name, namelen)) break;
 866                         }
 867                 }
 868
 869                 if (!strlcasecmp(e->name, e->namelen, name, namelen)) {
 870                         while (e->prev != (void *) &html_context->stack)
 871                                 kill_html_stack_item(html_context, e->prev);
 872
 873                         if (e->type > ELEMENT_IMMORTAL)
 874                                 kill_html_stack_item(html_context, e);
 875                 }
 876         }
 877
 878         if (ei->type != ET_NON_PAIRABLE) {
 879                 html_stack_dup(html_context, ELEMENT_KILLABLE);
 880                 html_top->name = name;
 881                 html_top->namelen = namelen;
 882                 html_top->options = attr;
 883                 html_top->linebreak = ei->linebreak;
 884
 885 #ifdef CONFIG_ECMASCRIPT
 886                 if (has_attr(attr, "onClick", html_context->options->cp)) {
 887                         /* XXX: Put something better to format.link. --pasky */
 888                         mem_free_set(&format.link, stracpy("javascript:void(0);"));
 889                         mem_free_set(&format.target, stracpy(html_context->base_target));
 890                         format.style.fg = format.clink;
 891                         html_top->pseudo_class = ELEMENT_LINK;
 892                         mem_free_set(&format.title, stracpy("onClick placeholder"));
 893                         /* Er. I know. Well, double html_focusable()s shouldn't
 894                          * really hurt. */
 895                         html_focusable(html_context, attr);
 896                 }
 897 #endif
 898         }
 899
 900 #ifdef CONFIG_CSS
 901         if (html_top->options && html_context->options->css_enable) {
 902                 /* XXX: We should apply CSS otherwise as well, but that'll need
 903                  * some deeper changes in order to have options filled etc.
 904                  * Probably just applying CSS from more places, since we
 905                  * usually have type != ET_NESTABLE when we either (1)
 906                  * rescan on your own from somewhere else (2) html_stack_dup()
 907                  * in our own way.  --pasky */
 908                 /* Call it now to gain some of the stuff which might affect
 909                  * formatting of some elements. */
 910                 /* FIXME: The caching of the CSS selector is broken, since t can
 911                  * lead to wrong styles being applied to following elements, so
 912                  * disabled for now. */
 913                 selector = get_css_selector_for_element(html_context, html_top,
 914                                                         &html_context->css_styles,
 915                                                         &html_context->stack);
 916
 917                 if (selector) {
 918                         apply_css_selector_style(html_context, html_top, selector);
 919                         done_css_selector(selector);
 920                 }
 921         }
 922         /* Now this was the reason for this whole funny ELEMENT_RENDER_PROLOGUE
 923          * bussiness. Only now we have the definitive linebreak value, since
 924          * that's what the display: property plays with. */
 925 #endif
 926         ELEMENT_RENDER_PROLOGUE
 927         if (ei->open) ei->open(html_context, attr, html, eof, &html);
 928 #ifdef CONFIG_CSS
 929         if (selector && html_top->options) {
 930                 /* Call it now to override default colors of the elements. */
 931                 selector = get_css_selector_for_element(html_context, html_top,
 932                                                         &html_context->css_styles,
 933                                                         &html_context->stack);
 934
 935                 if (selector) {
 936                         apply_css_selector_style(html_context, html_top, selector);
 937                         done_css_selector(selector);
 938                 }
 939         }
 940 #endif
 941
 942         if (ei->open != html_br) html_context->was_br = 0;
 943
 944         if (restore_format) par_format = old_format;
 945
 946         return html;
 947 #undef ELEMENT_RENDER_PROLOGUE
 948 }
 949
 950 static unsigned char *
 951 end_element(struct element_info *ei,
 952             unsigned char *name, int namelen,
 953             unsigned char *html,
 954             unsigned char *eof, unsigned char *attr,
 955             struct html_context *html_context)
 956 {
 957         struct html_element *e, *elt;
 958         int lnb = 0;
 959         int kill = 0;
 960
 961         html_context->was_br = 0;
 962         if (ei->type == ET_NON_PAIRABLE || ei->type == ET_LI)
 963                 return html;
 964
 965         if (ei->close) ei->close(html_context, attr, html, eof, &html);
 966
 967         /* dump_html_stack(html_context); */
 968         foreach (e, html_context->stack) {
 969                 if (is_block_element(e) && is_inline_element(ei)) kill = 1;
 970                 if (strlcasecmp(e->name, e->namelen, name, namelen)) {
 971                         if (e->type < ELEMENT_KILLABLE)
 972                                 break;
 973                         else
 974                                 continue;
 975                 }
 976                 if (kill) {
 977                         kill_html_stack_item(html_context, e);
 978                         break;
 979                 }
 980                 for (elt = e;
 981                      elt != (void *) &html_context->stack;
 982                      elt = elt->prev)
 983                         if (elt->linebreak > lnb)
 984                                 lnb = elt->linebreak;
 985
 986                 /* This hack forces a line break after a list end. It is needed
 987                  * when ending a list with the last <li> having no text the
 988                  * line_breax is 2 so the ending list's linebreak will be
 989                  * ignored when calling ln_break(). */
 990                 if (html_context->was_li)
 991                         html_context->line_breax = 0;
 992
 993                 ln_break(html_context, lnb);
 994                 while (e->prev != (void *) &html_context->stack)
 995                         kill_html_stack_item(html_context, e->prev);
 996                 kill_html_stack_item(html_context, e);
 997                 break;
 998         }
 999         /* dump_html_stack(html_context); */
1000
1001         return html;
1002 }
1003
1004 static unsigned char *
1005 process_element(unsigned char *name, int namelen, int endingtag,
1006                 unsigned char *html, unsigned char *prev_html,
1007                 unsigned char *eof, unsigned char *attr,
1008                 struct html_context *html_context)
1009
1010 {
1011         struct element_info *ei;
1012
1013 #ifndef USE_FASTFIND
1014         {
1015                 struct element_info elem;
1016                 unsigned char tmp;
1017
1018                 tmp = name[namelen];
1019                 name[namelen] = '\0';
1020
1021                 elem.name = name;
1022                 ei = bsearch(&elem, elements, NUMBER_OF_TAGS, sizeof(elem), compar);
1023                 name[namelen] = tmp;
1024         }
1025 #else
1026         ei = (struct element_info *) fastfind_search(&ff_tags_index, name, namelen);
1027 #endif
1028         if (html_context->was_xmp || html_context->was_style) {
1029                 if (!ei || (ei->open != html_xmp && ei->open != html_style) || !endingtag) {
1030                         put_chrs(html_context, "<", 1);
1031                         return prev_html + 1;
1032                 }
1033         }
1034
1035         if (!ei) return html;
1036
1037         if (!endingtag) {
1038                 return start_element(ei, name, namelen, html, eof, attr, html_context);
1039         } else {
1040                 return end_element(ei, name, namelen, html, eof, attr, html_context);
1041         }
1042 }
1043
1044 void
1045 scan_http_equiv(unsigned char *s, unsigned char *eof, struct string *head,
1046                 struct string *title, struct document_options *options)
1047 {
1048         unsigned char *name, *attr, *he, *c;
1049         int namelen;
1050
1051         if (title && !init_string(title)) return;
1052
1053         add_char_to_string(head, '\n');
1054
1055 se:
1056         while (s < eof && *s != '<') {
1057 sp:
1058                 s++;
1059         }
1060         if (s >= eof) return;
1061         if (s + 2 <= eof && (s[1] == '!' || s[1] == '?')) {
1062                 s = skip_comment(s, eof);
1063                 goto se;
1064         }
1065         if (parse_element(s, eof, &name, &namelen, &attr, &s)) goto sp;
1066
1067 ps:
1068         if (!strlcasecmp(name, namelen, "HEAD", 4)) goto se;
1069         if (!strlcasecmp(name, namelen, "/HEAD", 5)) return;
1070         if (!strlcasecmp(name, namelen, "BODY", 4)) return;
1071         if (title && !title->length && !strlcasecmp(name, namelen, "TITLE", 5)) {
1072                 unsigned char *s1;
1073
1074 xse:
1075                 s1 = s;
1076                 while (s < eof && *s != '<') {
1077 xsp:
1078                         s++;
1079                 }
1080                 if (s - s1)
1081                         add_bytes_to_string(title, s1, s - s1);
1082                 if (s >= eof) goto se;
1083                 if (s + 2 <= eof && (s[1] == '!' || s[1] == '?')) {
1084                         s = skip_comment(s, eof);
1085                         goto xse;
1086                 }
1087                 if (parse_element(s, eof, &name, &namelen, &attr, &s)) {
1088                         s1 = s;
1089                         goto xsp;
1090                 }
1091                 clr_spaces(title->source);
1092                 goto ps;
1093         }
1094         if (strlcasecmp(name, namelen, "META", 4)) goto se;
1095
1096         he = get_attr_val(attr, "charset", options->cp);
1097         if (he) {
1098                 add_to_string(head, "Charset: ");
1099                 add_to_string(head, he);
1100                 mem_free(he);
1101         }
1102
1103         he = get_attr_val(attr, "http-equiv", options->cp);
1104         if (!he) goto se;
1105
1106         add_to_string(head, he);
1107         mem_free(he);
1108
1109         c = get_attr_val(attr, "content", options->cp);
1110         if (c) {
1111                 add_to_string(head, ": ");
1112                 add_to_string(head, c);
1113                 mem_free(c);
1114         }
1115
1116         add_crlf_to_string(head);
1117         goto se;
1118 }