libyelp/yelp-info-parser.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil -*- */
   2 /*
   3  * Copyright (C) 2005 Davyd Madeley <davyd@madeley.id.au>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public
  16  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  17  *
  18  * Author: Davyd Madeley  <davyd@madeley.id.au>
  19  */
  20
  21 #ifdef HAVE_CONFIG_H
  22 #include <config.h>
  23 #endif
  24
  25 #include <glib.h>
  26 #include <gtk/gtk.h>
  27 #include <string.h>
  28
  29 #include "yelp-info-parser.h"
  30 #include "yelp-magic-decompressor.h"
  31 #include "yelp-debug.h"
  32
  33
  34 static GtkTreeIter *  find_real_top                      (GtkTreeModel *model,
  35                                                           GtkTreeIter *it);
  36 static GtkTreeIter *  find_real_sibling                  (GtkTreeModel *model,
  37                                                           GtkTreeIter *it,
  38                                                           GtkTreeIter *comp);
  39 static xmlNodePtr     yelp_info_parse_menu               (GtkTreeStore *tree,
  40                                                           xmlNodePtr *node,
  41                                                           gchar *page_content,
  42                                                           gboolean notes);
  43 static gboolean       get_menuoptions                    (gchar *line,
  44                                                           gchar **title,
  45                                                           gchar **ref,
  46                                                           gchar **desc,
  47                                                           gchar **xref);
  48 static gboolean       resolve_frag_id                    (GtkTreeModel *model,
  49                                                           GtkTreePath *path,
  50                                                           GtkTreeIter *iter,
  51                                                           gpointer data);
  52 static void           info_process_text_notes            (xmlNodePtr *node,
  53                                                           gchar *content,
  54                                                           GtkTreeStore
  55                                                           *tree);
  56
  57 /*
  58   Used to output the correct <heading level="?" /> tag.
  59  */
  60 static const gchar* level_headings[] = { NULL, "1", "2", "3" };
  61
  62 static GHashTable *
  63 info_image_get_attributes (gchar const* string)
  64 {
  65   GMatchInfo *match_info;
  66   GRegex *regex;
  67   GHashTable *h;
  68
  69   h = 0;
  70   regex = g_regex_new ("([^\\s][^\\s=]+)=(?:([^\\s \"]+)|(?:\"((?:[^\\\"]|\\\\[\\\\\"])*)\"))", 0, 0, NULL);
  71   g_regex_match (regex, string, 0, &match_info);
  72   while (g_match_info_matches (match_info))
  73     {
  74       gchar *key;
  75       gchar *value;
  76
  77       if (!h)
  78         h = g_hash_table_new (g_str_hash, g_str_equal);
  79       key = g_match_info_fetch (match_info, 1);
  80       value = g_match_info_fetch (match_info, 2);
  81       if (!*value)
  82         value = g_match_info_fetch (match_info, 3);
  83       g_hash_table_insert (h, key, value);
  84       g_match_info_next (match_info, NULL);
  85     }
  86   g_match_info_free (match_info);
  87   g_regex_unref (regex);
  88
  89   return h;
  90 }
  91
  /*
    Markers used for info elements. According to the format notes kept
    here, an element looks like \0\b[<TAGNAME>\0\b] and carries
    attribute=value pairs, e.g. for an image:
      \0\b[image src="foo.png" \0\b]

    The *_RE variants are the same markers written for use inside a
    regular expression (the bracket is wrapped in a character class).

    NOTE(review): INFO_TAG_OPEN and INFO_TAG_CLOSE expand to a string
    containing INFO_TAG_1 twice (once directly, once through the *_2
    macro). Within this part of the file only their first two
    characters are used (in open_info_file) -- confirm before relying
    on the full expansion.
  */

  /* The two raw marker bytes. */
  #define INFO_TAG_0          "\0"
  #define INFO_TAG_1          "\b"

  /* Second half of an opening / closing marker. */
  #define INFO_TAG_OPEN_2     INFO_TAG_1 "["
  #define INFO_TAG_CLOSE_2    INFO_TAG_1 "]"
  #define INFO_TAG_OPEN_2_RE  INFO_TAG_1 "[[]"
  #define INFO_TAG_CLOSE_2_RE INFO_TAG_1 "[]]"

  /* Markers as they appear in the raw file. */
  #define INFO_TAG_OPEN       INFO_TAG_0 INFO_TAG_1 INFO_TAG_OPEN_2
  #define INFO_TAG_CLOSE      INFO_TAG_0 INFO_TAG_1 INFO_TAG_CLOSE_2
  #define INFO_TAG_OPEN_RE    INFO_TAG_0 INFO_TAG_1 INFO_TAG_OPEN_2_RE
  #define INFO_TAG_CLOSE_RE   INFO_TAG_0 INFO_TAG_1 INFO_TAG_CLOSE_2_RE

  /* C strings and glib cannot carry an embedded \0, so the leading \0
     of a marker is replaced by '@' once the file is in memory. The
     INFO_C_* macros describe the markers after that replacement. */
  #define INFO_C_TAG_0             "@"
  #define INFO_C_TAG_OPEN          INFO_C_TAG_0 INFO_TAG_OPEN_2
  #define INFO_C_TAG_CLOSE         INFO_C_TAG_0 INFO_TAG_CLOSE_2
  #define INFO_C_TAG_OPEN_RE       INFO_C_TAG_0 INFO_TAG_OPEN_2_RE
  #define INFO_C_TAG_CLOSE_RE      INFO_C_TAG_0 INFO_TAG_CLOSE_2_RE
  #define INFO_C_IMAGE_TAG_OPEN    INFO_C_TAG_OPEN "image"
  #define INFO_C_IMAGE_TAG_OPEN_RE INFO_C_TAG_OPEN_RE "image"
 114
 115 static xmlNodePtr
 116 info_insert_image (xmlNodePtr parent, GMatchInfo *match_info)
 117 {
 118   gchar *title;
 119   gchar *text;
 120   gchar *alt;
 121   xmlNodePtr img;
 122   GHashTable *h = info_image_get_attributes (g_match_info_fetch (match_info, 1));
 123   gchar *source;
 124   if (h)
 125     source = (gchar*)g_hash_table_lookup (h, "src");
 126
 127   if (!h || !source || !*source)
 128     return xmlNewTextChild (parent, NULL, BAD_CAST "para",
 129                             BAD_CAST "[broken image]");
 130
 131   title = (gchar*)g_hash_table_lookup (h, "title");
 132   text = (gchar*)g_hash_table_lookup (h, "text");
 133   alt = (gchar*)g_hash_table_lookup (h, "alt");
 134   g_hash_table_destroy (h);
 135   img = xmlNewChild (parent, NULL, BAD_CAST "img", NULL);
 136   xmlNewProp (img, BAD_CAST "src", BAD_CAST source);
 137   xmlNewProp (img, BAD_CAST "title", BAD_CAST (title ? title : ""));
 138   xmlNewProp (img, BAD_CAST "text", BAD_CAST (text ? text : ""));
 139   xmlNewProp (img, BAD_CAST "alt", BAD_CAST (alt ? alt : ""));
 140   g_free (source);
 141   g_free (title);
 142   g_free (alt);
 143   return parent;
 144 }
 145
 146 /*
 147   If every element of `str' is `ch' then return TRUE, else FALSE.
 148  */
 149 static gboolean
 150 string_all_char_p (const gchar* str, gchar ch)
 151 {
 152   for (; *str; str++) {
 153     if (*str != ch) return FALSE;
 154   }
 155   return TRUE;
 156 }
 157
 158 /*
 159   If `line' is a line of '*', '=' or '-', return 1,2,3 respectively
 160   for the heading level. If it's anything else, return 0.
 161  */
 162 static int
 163 header_underline_level (const gchar* line)
 164 {
 165   if (*line != '*' && *line != '=' && *line != '-')
 166     return 0;
 167
 168   if (string_all_char_p (line, '*')) return 1;
 169   if (string_all_char_p (line, '=')) return 2;
 170   if (string_all_char_p (line, '-')) return 3;
 171
 172   return 0;
 173 }
 174
 175 /*
 176   Use g_strjoinv to join up the strings from `strings', but they might
 177   not actually be a null-terminated array. `end' should be strings+n,
 178   where I want the first n strings (strings+0, ..., strings+(n-1)). It
 179   shouldn't point outside of the array allocated, but it can point at
 180   the null string at the end.
 181  */
 182 static gchar*
 183 join_strings_subset (const gchar *separator,
 184                      gchar** strings, gchar** end)
 185 {
 186   gchar *ptr;
 187   gchar *glob;
 188
 189   g_assert(end > strings);
 190
 191   ptr = *end;
 192   *end = NULL;
 193
 194   glob = g_strjoinv (separator, strings);
 195   *end = ptr;
 196   return glob;
 197 }
 198
 199 /*
 200   Create a text node, child of `parent', with the lines strictly
 201   between `first' and `last'.
 202 */
 203 static void
 204 lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
 205                          gchar** first, gchar** last)
 206 {
 207   /* TODO? Currently we're copying the split strings again, which is
 208      less efficient than somehow storing lengths and using a sort of
 209      window on `content'. But that's much more difficult, so unless
 210      there's a problem, let's go with the stupid approach. */
 211   gchar *glob;
 212
 213   if (last > first) {
 214     glob = join_strings_subset ("\n", first, last);
 215     xmlAddChild (parent, xmlNewText (BAD_CAST glob));
 216     g_free (glob);
 217   }
 218 }
 219
 220 /*
 221   Convert body text CONTENT to xml nodes. This function is responsible
 222   for spotting headings etc and splitting them out correctly.
 223
 224   paragraph is as described in info_body_text, but cannot be null.
 225
 226   If `inline_p' is true, end with a <para1> tag. Otherwise, end with a
 227   <para> tag.
 228
 229   TODO: IWBN add a regex match for *Note: here and call the *Note ==>
 230   <a href> logic of info_process_text_notes from here.
 231  */
 232 static void
 233 info_body_parse_text (xmlNodePtr parent, xmlNodePtr *paragraph,
 234                       xmlNsPtr ns,
 235                       gboolean inline_p, const gchar *content)
 236 {
 237   /* The easiest things to spot are headings: they look like a line of
 238    * '*','=' or '-', corresponding to heading levels 1,2 or 3. To spot
 239    * them, we split content into single lines and work with them. */
 240   gchar **lines = g_strsplit (content, "\n", 0);
 241   gchar **first = lines, **last = lines;
 242   int header_level;
 243   xmlNodePtr header_node;
 244
 245   /* Deal with the possibility that `content' is empty */
 246   if (*lines == NULL) {
 247     if (!inline_p) {
 248       xmlNewTextChild (parent, NULL, BAD_CAST "para", BAD_CAST "");
 249     }
 250     return;
 251   }
 252
 253   /* Use a pair of pointers, first and last, which point to two lines,
 254    * the chunk of the body we're displaying (inclusive) */
 255   for (; *last; last++) {
 256
 257     /* Check for a blank line */
 258     if (**last == '\0') {
 259       if (last != first) {
 260         if (!*paragraph) {
 261           *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
 262         }
 263         lines_subset_text_child (*paragraph, ns, first, last);
 264       }
 265       /* On the next iteration, last==first both pointing at the next
 266          line. */
 267       first = last+1;
 268       *paragraph = NULL;
 269
 270       continue;
 271     }
 272
 273     /* Check for a header */
 274     header_level = header_underline_level (*last);
 275     if (header_level) {
 276       /* Write out any lines beforehand */
 277       lines_subset_text_child (parent, ns, first, last-1);
 278       /* Now write out the actual header line */
 279       header_node = xmlNewTextChild (parent, ns, BAD_CAST "header",
 280                                      BAD_CAST *(last-1));
 281       xmlNewProp (header_node, BAD_CAST "level",
 282                   BAD_CAST level_headings[header_level]);
 283
 284       first = last+1;
 285       last = first-1;
 286     }
 287   }
 288
 289   /* Write out any lines left */
 290   if (!*paragraph) {
 291     *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
 292   }
 293   lines_subset_text_child (*paragraph, ns, first, last);
 294
 295   g_strfreev (lines);
 296 }
 297
 298 /*
 299   info_body_text is responsible for taking a hunk of the info page's
 300   body and turning it into paragraph tags. It searches out images and
 301   marks them up properly if necessary.
 302
 303   parent should be the node in which we're currently storing text and
 304   paragraph a pointer to a <para> tag or NULL. At blank lines, we
 305   finish with the current para tag and switch to a new one.
 306
 307   It uses info_body_parse_text to mark up the actual bits of text.
 308  */
 309 static void
 310 info_body_text (xmlNodePtr parent, xmlNodePtr *paragraph, xmlNsPtr ns,
 311                 gboolean inline_p, gchar const *content)
 312 {
 313   xmlNodePtr thepara = NULL;
 314   gint content_len;
 315   gint pos;
 316   GRegex *regex;
 317   GMatchInfo *match_info;
 318   gchar *after;
 319   if (paragraph == NULL) paragraph = &thepara;
 320
 321   if (!strstr (content, INFO_C_IMAGE_TAG_OPEN)) {
 322     info_body_parse_text (parent, paragraph, ns, inline_p, content);
 323     return;
 324   }
 325
 326   content_len = strlen (content);
 327   pos = 0;
 328   regex = g_regex_new ("(" INFO_C_IMAGE_TAG_OPEN_RE "((?:[^" INFO_TAG_1 "]|[^" INFO_C_TAG_0 "]+" INFO_TAG_1 ")*)" INFO_C_TAG_CLOSE_RE ")", 0, 0, NULL);
 329
 330   g_regex_match (regex, content, 0, &match_info);
 331   while (g_match_info_matches (match_info))
 332     {
 333       gint image_start;
 334       gint image_end;
 335       gboolean image_found = g_match_info_fetch_pos (match_info, 0,
 336                                                      &image_start, &image_end);
 337       gchar *before = g_strndup (&content[pos], image_start - pos);
 338       pos = image_end + 1;
 339       info_body_parse_text (parent, paragraph, NULL, TRUE, before);
 340       g_free (before);
 341
 342       /* End the paragraph that was before */
 343       *paragraph = NULL;
 344
 345       if (image_found)
 346         info_insert_image (parent, match_info);
 347       g_match_info_next (match_info, NULL);
 348     }
 349   after = g_strndup (&content[pos], content_len - pos);
 350   info_body_parse_text (parent, paragraph, NULL, TRUE, after);
 351   g_free (after);
 352 }
 353
 354 /* Part 1: Parse File Into Tree Store */
 355
 /* Kinds of page distinguished by page_type(). */
 enum
 {
   PAGE_TAG_TABLE = 0, /* starts with "Tag Table:\n" */
   PAGE_NODE,          /* starts with "File:" or "Node:" */
   PAGE_INDIRECT,      /* starts with "Indirect:\n" */
   PAGE_OTHER          /* anything else */
 };
 363
 364 static int
 365 page_type (char *page)
 366 {
 367   if (g_ascii_strncasecmp (page, "Tag Table:\n", 11) == 0)
 368     return PAGE_TAG_TABLE;
 369   else if (g_ascii_strncasecmp (page, "Indirect:\n", 10) == 0)
 370     return PAGE_INDIRECT;
 371   else if (g_ascii_strncasecmp (page, "File:", 5) == 0 ||
 372            g_ascii_strncasecmp (page, "Node:", 5) == 0)
 373     return PAGE_NODE;
 374
 375   else
 376     return PAGE_OTHER;
 377 }
 378
 379 static char
 380 *open_info_file (const gchar *file)
 381 {
 382     GFile *gfile;
 383     GConverter *converter;
 384     GFileInputStream *file_stream;
 385     GInputStream *stream;
 386     gchar buf[1024];
 387     gssize bytes;
 388     GString *string;
 389     gchar *str;
 390     gsize i;
 391
 392     gfile = g_file_new_for_path (file);
 393     file_stream = g_file_read (gfile, NULL, NULL);
 394     converter = (GConverter *) yelp_magic_decompressor_new ();
 395     stream = g_converter_input_stream_new ((GInputStream *) file_stream, converter);
 396     string = g_string_new (NULL);
 397
 398     while ((bytes = g_input_stream_read (stream, buf, 1024, NULL, NULL)) > 0)
 399         g_string_append_len (string, buf, bytes);
 400
 401     g_object_unref (stream);
 402
 403     str = string->str;
 404
 405     /* C/glib * cannot really handle \0 in strings, convert. */
 406     for (i = 0; i < (string->len - 1); i++)
 407         if (str[i] == INFO_TAG_OPEN[0] && str[i+1] == INFO_TAG_OPEN[1])
 408             str[i] = INFO_C_TAG_OPEN[0];
 409
 410     g_string_free (string, FALSE);
 411
 412     return str;
 413 }
 414
 415 static gchar *
 416 find_info_part (gchar *part_name, const gchar *base)
 417 {
 418   /* New and improved.  We now assume that all parts are
 419    * in the same subdirectory as the base file.  Makes
 420    * life much simpler and is (afaict) always true
 421    */
 422   gchar *path;
 423   gchar *tmp;
 424   gchar *bzfname, *gzfname, *lzfd, *fname;
 425   gchar *uri = NULL;
 426   tmp = g_strrstr (base, "/");
 427   path = g_strndup (base, tmp-base);
 428
 429   bzfname = g_strconcat (path, "/", part_name, ".bz2", NULL);
 430   gzfname = g_strconcat (path, "/", part_name, ".gz", NULL);
 431   lzfd = g_strconcat (path, "/", part_name, ".lzma", NULL);
 432   fname = g_strconcat (path, "/", part_name, NULL);
 433
 434   if (g_file_test (bzfname, G_FILE_TEST_EXISTS))
 435     uri = g_strdup (bzfname);
 436   else if (g_file_test (gzfname, G_FILE_TEST_EXISTS))
 437     uri = g_strdup (gzfname);
 438   else if (g_file_test (lzfd, G_FILE_TEST_EXISTS))
 439     uri = g_strdup (lzfd);
 440   else if (g_file_test (fname, G_FILE_TEST_EXISTS))
 441     uri = g_strdup (fname);
 442
 443   g_free (bzfname);
 444   g_free (gzfname);
 445   g_free (lzfd);
 446   g_free (fname);
 447   g_free (path);
 448   return uri;
 449
 450 }
 451
 452 static char
 453 *process_indirect_map (char *page, const gchar *file)
 454 {
 455         char **lines;
 456         char **ptr;
 457         char *composite = NULL;
 458         size_t composite_len = 0;
 459
 460         lines = g_strsplit (page, "\n", 0);
 461
 462         /*
 463           Go backwards down the list so that we allocate composite
 464           big enough the first time around.
 465         */
 466         for (ptr = lines + 1; *ptr != NULL; ptr++);
 467         for (ptr--; ptr != lines; ptr--)
 468         {
 469                 char **items;
 470                 char *filename;
 471                 char *str;
 472                 char **pages;
 473                 gsize offset;
 474                 gsize plength;
 475
 476                 debug_print (DB_DEBUG, "Line: %s\n", *ptr);
 477                 items = g_strsplit (*ptr, ": ", 2);
 478
 479                 if (items[0])
 480                 {
 481                   filename = find_info_part (items[0], file);
 482                   str = open_info_file (filename);
 483                   if (!str) {
 484                         g_strfreev (items);
 485                         continue;
 486                   }
 487                         pages = g_strsplit (str, "\x1f", 2);
 488                         g_free (str);
 489                         if (!pages[1]) {
 490                                 g_strfreev (items);
 491                                 g_strfreev (pages);
 492                                 continue;
 493                         }
 494
 495                         offset = (gsize) atoi (items[1]);
 496                         plength = strlen(pages[1]);
 497
 498                         debug_print (DB_DEBUG, "Need to make string %s+%i bytes = %i\n",
 499                                     items[1], plength,
 500                                     offset + plength);
 501
 502                         if (!composite) /* not yet created, malloc it */
 503                         {
 504                                 composite_len = offset + plength;
 505                                 composite = g_malloc (sizeof (char) *
 506                                                       (composite_len + 1));
 507                                 memset (composite, '-', composite_len);
 508                                 composite[composite_len] = '\0';
 509                         }
 510
 511                         /* Because we're going down the list
 512                          * backwards, plength should always be short
 513                          * enough to fit in the memory allocated. But
 514                          * in case something's broken/malicious, we
 515                          * should check anyway.
 516                          */
 517                         if (offset > composite_len)
 518                           continue;
 519                         if (plength + offset + 1 > composite_len)
 520                           plength = composite_len - offset - 1;
 521
 522                         composite[offset] = '\x1f';
 523                         memcpy (composite + offset + 1, pages[1], plength);
 524
 525                         g_free (filename);
 526                         g_strfreev (pages);
 527                 }
 528
 529                 g_strfreev (items);
 530         }
 531
 532         g_strfreev (lines);
 533
 534         return composite;
 535 }
 536
 537 /*
 538   Open up the relevant info file and read it all into memory. If there
 539   is an indirect table thingy, we resolve that as we go.
 540
 541   Returns a NULL-terminated list of pointers to pages on success and
 542   NULL otherwise.
 543  */
 544 static gchar**
 545 expanded_info_file (const gchar *file)
 546 {
 547   gchar *slurp = open_info_file (file);
 548   gchar **page_list;
 549   gchar **page;
 550
 551   if (!slurp) return NULL;
 552
 553   /* TODO: There's a lot of copying of bits of memory here. With a bit
 554    * more effort we could avoid it. Either we should fix this or
 555    * measure the time taken and decide it's irrelevant...
 556    *
 557    * Note: \x1f\n is ^_\n
 558    */
 559   page_list = g_strsplit (slurp, "\x1f\n", 0);
 560
 561   g_free (slurp);
 562
 563   for (page = page_list; *page != NULL; page++) {
 564     if (page_type (*page) == PAGE_INDIRECT) {
 565
 566       slurp = process_indirect_map (*page, file);
 567       g_strfreev (page_list);
 568
 569       if (!slurp)
 570         return NULL;
 571
 572       page_list = g_strsplit (slurp, "\x1f\n", 0);
 573       g_free (slurp);
 574       break;
 575     }
 576   }
 577
 578   return page_list;
 579 }
 580
 /*
   Extract the value that follows `key' in `source'. For example,
   "blah" is extracted from "Node: blah," when the key is "Node: ".

   The value runs from just behind the first occurrence of `key' up to
   (not including) the first character that appears in `end', or to
   the end of the string if there is none. If a character from
   `cancel' shows up before that point, the lookup is abandoned.
   `cancel' may be NULL to disable that test.

   Returns a newly allocated copy of the value, or NULL if `key' is
   not present or the lookup was cancelled.
  */
 static char*
 get_value_after_ext (const char *source, const char *key,
                      const char *end, const char *cancel)
 {
   const char *value;
   size_t value_len;

   value = strstr (source, key);
   if (value == NULL)
     return NULL;
   value += strlen (key);

   value_len = strcspn (value, end);

   if (cancel != NULL && strcspn (value, cancel) < value_len)
     return NULL;

   return g_strndup (value, value_len);
 }
 613
 /*
   Extract the value following `key' in `source', where a value ends at
   a comma. The lookup gives up (returning NULL) if a newline or a
   \x7f character comes first. See get_value_after_ext.
  */
 static char*
 get_value_after (const char* source, const char *key)
 {
   static const char end_chars[] = ",";
   static const char cancel_chars[] = "\n\x7f";

   return get_value_after_ext (source, key, end_chars, cancel_chars);
 }
 619
 620 static int
 621 node2page (GHashTable *nodes2pages, char *node)
 622 {
 623   gpointer p;
 624
 625   if (g_hash_table_lookup_extended (nodes2pages, node,
 626                                     NULL, &p))
 627     return GPOINTER_TO_INT(p);
 628
 629   /* This shouldn't happen: we should only ever have to look up pages
 630    * that exist. */
 631   g_return_val_if_reached (0);
 632 }
 633
 634 static GtkTreeIter
 635 *node2iter (GHashTable *nodes2iters, char *node)
 636 {
 637         GtkTreeIter *iter;
 638
 639         iter = g_hash_table_lookup (nodes2iters, node);
 640         d (if (!iter) debug_print (DB_WARN, "Could not retrieve iter for node !%s!\n", node));
 641         return iter;
 642 }
 643
 644 GtkTreeIter
 645 *find_real_top (GtkTreeModel *model, GtkTreeIter *it)
 646 {
 647   GtkTreeIter *r = NULL;
 648   GtkTreeIter *tmp = NULL;
 649
 650   if (!it)
 651     return NULL;
 652
 653   r = gtk_tree_iter_copy (it);
 654   tmp = g_malloc0 (sizeof (GtkTreeIter));
 655   while (gtk_tree_model_iter_parent (model, tmp, r)) {
 656     gtk_tree_iter_free (r);
 657     r = gtk_tree_iter_copy (tmp);
 658   }
 659   g_free (tmp);
 660
 661   return r;
 662 }
 663
 664 GtkTreeIter * find_real_sibling (GtkTreeModel *model,
 665                                  GtkTreeIter *it, GtkTreeIter *comp)
 666 {
 667   GtkTreeIter *r;
 668   GtkTreeIter *tmp = NULL;
 669   gboolean result = FALSE;
 670   gchar *title;
 671   gchar *reftitle;
 672
 673   if (!it) {
 674     return NULL;
 675   }
 676
 677   r = gtk_tree_iter_copy (it);
 678   tmp = gtk_tree_iter_copy (it);
 679
 680   reftitle = gtk_tree_model_get_string_from_iter (model, comp);
 681
 682   result = gtk_tree_model_iter_parent (model, r, it);
 683   if (!result)
 684     return it;
 685
 686   title = gtk_tree_model_get_string_from_iter (model, r);
 687
 688   while (!g_str_equal (title, reftitle) && result) {
 689     gtk_tree_iter_free (tmp);
 690     tmp = gtk_tree_iter_copy (r);
 691     result = gtk_tree_model_iter_parent (model, r, tmp);
 692     if (result)
 693       title = gtk_tree_model_get_string_from_iter (model, r);
 694   }
 695
 696   if (!g_str_equal (title, reftitle))
 697     {
 698       gtk_tree_iter_free (tmp);
 699       tmp = NULL;
 700     }
 701
 702   gtk_tree_iter_free (r);
 703   g_free (title);
 704   g_free (reftitle);
 705   return tmp;
 706
 707 }
 708
 709 static void
 710 process_page (GtkTreeStore *tree,
 711               GHashTable *nodes2pages, GHashTable *nodes2iters,
 712               int *processed_table, char **page_list, char *page_text)
 713 {
 714         GtkTreeIter *iter;
 715
 716         char **parts;
 717         char *node;
 718         char *up;
 719         char *prev;
 720         char *next;
 721         gchar *tmp;
 722
 723         int page;
 724
 725         /* split out the header line and the text */
 726         parts = g_strsplit (page_text, "\n", 3);
 727
 728         node = get_value_after (parts[0], "Node: ");
 729         up = get_value_after (parts[0], "Up: ");
 730         prev = get_value_after (parts[0], "Prev: ");
 731         next = get_value_after (parts[0], "Next: ");
 732
 733         if (next && g_str_equal (next, "Top")) {
 734           g_free (next);
 735           next = NULL;
 736         }
 737         if (g_str_equal (node, "Top") && prev != NULL) {
 738           g_free (prev);
 739           prev = NULL;
 740         }
 741
 742         /* check to see if this page has been processed already */
 743         page = node2page (nodes2pages, node);
 744         if (processed_table[page]) {
 745                 return;
 746         }
 747         processed_table[page] = 1;
 748
 749         debug_print (DB_DEBUG, "-- Processing Page %s\n\tParent: %s\n", node, up);
 750
 751         iter = g_slice_alloc0 (sizeof (GtkTreeIter));
 752         /* check to see if we need to process our parent and siblings */
 753         if (up && g_ascii_strncasecmp (up, "(dir)", 5) && strcmp (up, "Top"))
 754         {
 755                 page = node2page (nodes2pages, up);
 756                 if (!processed_table[page])
 757                 {
 758                   debug_print (DB_DEBUG, "%% Processing Node %s\n", up);
 759                   process_page (tree, nodes2pages,
 760                                 nodes2iters, processed_table, page_list,
 761                                 page_list[page]);
 762                 }
 763         }
 764         if (prev && g_ascii_strncasecmp (prev, "(dir)", 5))
 765           {
 766             if (strncmp (node, "Top", 3)) {
 767               /* Special case the Top node to always appear first */
 768             } else {
 769               page = node2page (nodes2pages, prev);
 770               if (!processed_table[page])
 771                 {
 772                   debug_print (DB_DEBUG, "%% Processing Node %s\n", prev);
 773                   process_page (tree, nodes2pages,
 774                                 nodes2iters, processed_table, page_list,
 775                                 page_list[page]);
 776                 }
 777             }
 778           }
 779
 780         /* by this point our parent and older sibling should be processed */
 781         if (!up || !g_ascii_strcasecmp (up, "(dir)"))
 782         {
 783           debug_print (DB_DEBUG, "\t> no parent\n");
 784                 if (!prev || !g_ascii_strcasecmp (prev, "(dir)"))
 785                 {
 786                   debug_print (DB_DEBUG, "\t> no previous\n");
 787                         gtk_tree_store_append (tree, iter, NULL);
 788                 }
 789                 else if (prev) {
 790                   GtkTreeIter *real;
 791                   real = find_real_top (GTK_TREE_MODEL (tree),
 792                                         node2iter (nodes2iters, prev));
 793                   if (real) {
 794                     gtk_tree_store_insert_after (tree, iter, NULL,
 795                                                  real);
 796                     gtk_tree_iter_free (real);
 797                   }
 798                   else
 799                     gtk_tree_store_append (tree, iter, NULL);
 800                 }
 801         }
 802         else if (!prev || !g_ascii_strcasecmp (prev, "(dir)") || !strcmp (prev, up))
 803         {
 804           debug_print (DB_DEBUG, "\t> no previous\n");
 805                 gtk_tree_store_append (tree, iter,
 806                         node2iter (nodes2iters, up));
 807         }
 808         else if (up && prev)
 809         {
 810           GtkTreeIter *upit = node2iter (nodes2iters, up);
 811           GtkTreeIter *previt = node2iter (nodes2iters, prev);
 812           GtkTreeIter *nit = NULL;
 813           debug_print (DB_DEBUG, "+++ Parent: %s Previous: %s\n", up, prev);
 814
 815           d (if (upit) debug_print (DB_DEBUG, "++++ Have parent node!\n"));
 816           d (if (previt) debug_print (DB_DEBUG, "++++ Have previous node!\n"));
 817           nit = find_real_sibling (GTK_TREE_MODEL (tree), previt, upit);
 818           if (nit) {
 819             gtk_tree_store_insert_after (tree, iter,
 820                                          upit,
 821                                          nit);
 822             gtk_tree_iter_free (nit);
 823           }
 824           else
 825             gtk_tree_store_append (tree, iter, upit);
 826         }
 827         else
 828         {
 829           debug_print (DB_DEBUG, "# node %s was not put in tree\n", node);
 830           return;
 831         }
 832
 833         d (if (iter) debug_print (DB_DEBUG, "Have a valid iter, storing for %s\n", node));
 834
 835         g_hash_table_insert (nodes2iters, g_strdup (node), iter);
 836         debug_print (DB_DEBUG, "size: %i\n", g_hash_table_size (nodes2iters));
 837
 838         /*tmp = g_strdup_printf ("%i",
 839           node2page (nodes2pages, node));*/
 840         tmp = g_strdup (node);
 841         tmp = g_strdelimit (tmp, " ", '_');
 842         gtk_tree_store_set (tree, iter,
 843                             INFO_PARSER_COLUMN_PAGE_NO, tmp,
 844                             INFO_PARSER_COLUMN_PAGE_NAME, node,
 845                             INFO_PARSER_COLUMN_PAGE_CONTENT, parts[2],
 846                             -1);
 847
 848         g_free (tmp);
 849         g_free (node);
 850         g_free (up);
 851         g_free (prev);
 852         g_free (next);
 853         g_strfreev (parts);
 854 }
 855
 856 struct TagTableFix {
 857   GHashTable *nodes2pages; /* Build this... */
 858   GHashTable *pages2nodes; /* ... using this. */
 859 };
 860
 861 static void
 862 use_offset2page (gpointer o, gpointer p, gpointer ud)
 863 {
 864   struct TagTableFix* ttf = (struct TagTableFix*)ud;
 865
 866   const gchar* node = g_hash_table_lookup (ttf->pages2nodes, p);
 867   if (node) {
 868     g_hash_table_insert (ttf->nodes2pages, g_strdup (node), p);
 869   }
 870 }
 871
 872 /*
 873   We had a nodes2offsets hash table, but sometimes these things
 874   lie. How terribly rude. Anyway, use offsets2pages and pages2nodes
 875   (and injectivity!) to construct the nodes2pages hash table.
 876 */
 877 static GHashTable *
 878 make_nodes2pages (GHashTable* offsets2pages,
 879                   GHashTable* pages2nodes)
 880 {
 881   struct TagTableFix ttf;
 882
 883   ttf.nodes2pages =
 884     g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
 885   ttf.pages2nodes = pages2nodes;
 886
 887   g_hash_table_foreach (offsets2pages, use_offset2page, &ttf);
 888
 889   return ttf.nodes2pages;
 890 }
 891
 892 /**
 893  * Parse file into a GtkTreeStore containing useful information that we can
 894  * later convert into a nice XML document or something else.
 895  */
 896 GtkTreeStore
 897 *yelp_info_parser_parse_file (char *file)
 898 {
 899         gchar **page_list;
 900         char **ptr;
 901         int pages;
 902         int offset;
 903         GHashTable *offsets2pages = NULL;
 904         GHashTable *pages2nodes = NULL;
 905         GHashTable *nodes2pages = NULL;
 906         GHashTable *nodes2iters = NULL;
 907         int *processed_table;
 908         GtkTreeStore *tree;
 909         int pt;
 910
 911         page_list = expanded_info_file (file);
 912         if (!page_list)
 913           return NULL;
 914
 915         pages = 0;
 916         offset = 0;
 917
 918         offsets2pages = g_hash_table_new_full (g_str_hash, g_str_equal, g_free,
 919                                                NULL);
 920         pages2nodes = g_hash_table_new_full (g_direct_hash, g_direct_equal, NULL,
 921                                              g_free);
 922
 923         for (ptr = page_list; *ptr != NULL; ptr++)
 924         {
 925           gchar *name = NULL;
 926
 927           g_hash_table_insert (offsets2pages,
 928                                g_strdup_printf ("%i", offset),
 929                                GINT_TO_POINTER (pages));
 930
 931           name = get_value_after (*ptr, "Node: ");
 932           if (name)
 933             g_hash_table_insert (pages2nodes,
 934                                  GINT_TO_POINTER (pages), name);
 935
 936           offset += strlen (*ptr);
 937           if (pages) offset += 2;
 938           pages++;
 939
 940           pt = page_type (*ptr);
 941           if (pt == PAGE_INDIRECT) {
 942             g_warning ("Found an indirect page in a file "
 943                        "we thought we'd expanded.");
 944           }
 945         }
 946
 947         /* Now consolidate (and correct) the two hash tables */
 948         nodes2pages = make_nodes2pages (offsets2pages, pages2nodes);
 949
 950         g_hash_table_destroy (offsets2pages);
 951         g_hash_table_destroy (pages2nodes);
 952
 953         processed_table = g_malloc0 (pages * sizeof (int));
 954         tree = gtk_tree_store_new (INFO_PARSER_N_COLUMNS, G_TYPE_STRING, G_TYPE_STRING,
 955                         G_TYPE_STRING);
 956         nodes2iters = g_hash_table_new_full (g_str_hash, g_str_equal, g_free,
 957                                              (GDestroyNotify) gtk_tree_iter_free);
 958
 959         pages = 0;
 960         for (ptr = page_list; *ptr != NULL; ptr++)
 961         {
 962           if (page_type (*ptr) != PAGE_NODE) continue;
 963           process_page (tree, nodes2pages, nodes2iters,
 964                         processed_table, page_list, *ptr);
 965         }
 966
 967         g_strfreev (page_list);
 968
 969         g_hash_table_destroy (nodes2iters);
 970         g_hash_table_destroy (nodes2pages);
 971
 972         g_free (processed_table);
 973
 974         return tree;
 975 }
 976
 977 /* End Part 1 */
 978 /* Part 2: Parse Tree into XML */
 979 static void
 980 parse_tree_level (GtkTreeStore *tree, xmlNodePtr *node, GtkTreeIter iter)
 981 {
 982     GtkTreeIter children, parent;
 983         xmlNodePtr newnode;
 984
 985         char *page_no = NULL;
 986         char *page_name = NULL;
 987         char *page_content = NULL;
 988         gboolean notes = FALSE;
 989
 990         debug_print (DB_DEBUG, "Decended\n");
 991         do
 992         {
 993                 gtk_tree_model_get (GTK_TREE_MODEL (tree), &iter,
 994                                 INFO_PARSER_COLUMN_PAGE_NO, &page_no,
 995                                 INFO_PARSER_COLUMN_PAGE_NAME, &page_name,
 996                                 INFO_PARSER_COLUMN_PAGE_CONTENT, &page_content,
 997                                 -1);
 998                 debug_print (DB_DEBUG, "Got Section: %s\n", page_name);
 999                 if (strstr (page_content, "*Note") ||
1000                     strstr (page_content, "*note")) {
1001                   notes = TRUE;
1002                 }
1003                 if (strstr (page_content, "* Menu:")) {
1004                   newnode = yelp_info_parse_menu (tree, node, page_content, notes);
1005                 } else {
1006                   newnode = xmlNewTextChild (*node, NULL,
1007                                              BAD_CAST "Section",
1008                                              NULL);
1009                   if (!notes)
1010                     info_body_text (newnode, NULL, NULL, FALSE, page_content);
1011
1012                   else {
1013                     /* Handle notes here */
1014                     info_process_text_notes (&newnode, page_content, tree);
1015                   }
1016                 }
1017                 /* if we free the page content, now it's in the XML, we can
1018                  * save some memory */
1019                 g_free (page_content);
1020                 page_content = NULL;
1021
1022                 if (gtk_tree_model_iter_parent (GTK_TREE_MODEL (tree), &parent, &iter)) {
1023                     gchar *parent_id;
1024                     gtk_tree_model_get (GTK_TREE_MODEL (tree), &parent,
1025                                         INFO_PARSER_COLUMN_PAGE_NO, &parent_id,
1026                                         -1);
1027                     xmlNewProp (newnode, BAD_CAST "up", BAD_CAST parent_id);
1028                     g_free (parent_id);
1029                 }
1030
1031                 xmlNewProp (newnode, BAD_CAST "id",
1032                             BAD_CAST page_no);
1033                 xmlNewProp (newnode, BAD_CAST "name",
1034                             BAD_CAST page_name);
1035                 if (gtk_tree_model_iter_children (GTK_TREE_MODEL (tree),
1036                                 &children,
1037                                 &iter))
1038                   parse_tree_level (tree, &newnode, children);
1039                 g_free (page_no);
1040                 g_free (page_name);
1041         }
1042         while (gtk_tree_model_iter_next (GTK_TREE_MODEL (tree), &iter));
1043         debug_print (DB_DEBUG, "Ascending\n");
1044 }
1045
1046 xmlDocPtr
1047 yelp_info_parser_parse_tree (GtkTreeStore *tree)
1048 {
1049         xmlDocPtr doc;
1050         xmlNodePtr node;
1051         GtkTreeIter iter;
1052
1053         /*
1054         xmlChar *xmlbuf;
1055         int bufsiz;
1056         */
1057
1058         doc = xmlNewDoc (BAD_CAST "1.0");
1059         node = xmlNewNode (NULL, BAD_CAST "Info");
1060         xmlDocSetRootElement (doc, node);
1061
1062         /* functions I will want:
1063         gtk_tree_model_get_iter_first;
1064         gtk_tree_model_iter_next;
1065         gtk_tree_model_iter_children;
1066         */
1067
1068         if (gtk_tree_model_get_iter_first (GTK_TREE_MODEL (tree), &iter))
1069                 parse_tree_level (tree, &node, iter);
1070         d (else debug_print (DB_DEBUG, "Empty tree?\n"));
1071
1072         /*
1073         xmlDocDumpFormatMemory (doc, &xmlbuf, &bufsiz, 1);
1074         g_print ("XML follows:\n%s\n", xmlbuf);
1075         */
1076
1077         return doc;
1078 }
1079
1080 gboolean
1081 resolve_frag_id (GtkTreeModel *model, GtkTreePath *path, GtkTreeIter *iter,
1082                  gpointer data)
1083 {
1084   gchar *page_no = NULL;
1085   gchar *page_name = NULL;
1086   gchar **xref = data;
1087
1088   gtk_tree_model_get (GTK_TREE_MODEL (model), iter,
1089                       INFO_PARSER_COLUMN_PAGE_NO, &page_no,
1090                       INFO_PARSER_COLUMN_PAGE_NAME, &page_name,
1091                       -1);
1092   if (g_str_equal (page_name, *xref)) {
1093     g_free (*xref);
1094     *xref = g_strdup (page_name);
1095     *xref = g_strdelimit (*xref, " ", '_');
1096
1097     g_free (page_name);
1098     g_free (page_no);
1099     return TRUE;
1100   }
1101   g_free (page_name);
1102   g_free (page_no);
1103
1104   return FALSE;
1105 }
1106
1107 gboolean
1108 get_menuoptions (gchar *line, gchar **title, gchar **ref, gchar **desc,
1109                  gchar **xref)
1110 {
1111   /* Since info is actually braindead and allows .s in
1112    * its references, we gotta carefully extract things
1113    * as .s can be in either the title or desc
1114    */
1115   gchar *tmp = line;
1116   gchar *tfind = NULL;
1117
1118   if (!g_str_has_prefix (line, "* "))
1119     return FALSE;
1120
1121   tfind = strchr (tmp, ':');
1122
1123   if (!tfind) /* No : on the line, bail out */
1124     return FALSE;
1125
1126   (*title) = g_strndup (tmp, tfind-tmp);
1127
1128   if (tfind[1] == ':') { /* This happens if the title and ref are the same
1129                          * Most menus are of this type
1130                          */
1131
1132     (*ref) = NULL; /* There is no second part.  The rest is description */
1133
1134     tmp++;
1135     (*xref) = g_strndup (tmp, tfind-tmp);
1136     g_strstrip (*xref);
1137
1138     tfind+=2;
1139     (*desc) = g_strdup (tfind);
1140   } else { /* The other type of menu option */
1141     gchar *td = NULL;
1142
1143     tfind++;
1144     td = strchr (tfind, '.');
1145     if (!td)
1146       return FALSE;
1147     (*ref) = g_strndup (tfind, td-tfind);
1148     (*xref) = g_strdup (*ref);
1149     g_strstrip (*xref);
1150
1151     td++;
1152     (*desc) = g_strdup (td);
1153   }
1154   return TRUE;
1155 }
1156
1157 /* Find the first non whitespace character in str or return pointer to the
1158  * '\0' if there isn't one. */
1159 static gchar*
1160 first_non_space (gchar* str)
1161 {
1162   /* As long as str is null terminated, this is ok! */
1163   while (g_ascii_isspace (*str)) str++;
1164   return str;
1165 }
1166
1167 static xmlNodePtr
1168 yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
1169                       gchar *page_content, gboolean notes)
1170 {
1171   gchar **split;
1172   gchar **menuitems;
1173   gchar *tmp = NULL;
1174   xmlNodePtr newnode, menu_node, mholder = NULL;
1175   int i=0;
1176
1177   split = g_strsplit (page_content, "* Menu:", 2);
1178
1179   newnode = xmlNewChild (*node, NULL,
1180                          BAD_CAST "Section", NULL);
1181
1182
1183   if (!notes)
1184     info_body_text (newnode, NULL, NULL, FALSE, split[0]);
1185   else {
1186     info_process_text_notes (&newnode, split[0], tree);
1187   }
1188
1189   menuitems = g_strsplit (split[1], "\n", -1);
1190   g_strfreev (split);
1191
1192   /* The output xml should look something like the following:
1193
1194      <menu>
1195        <menuholder>
1196          <a href="xref:Help-Inv">Help-Inv</a>
1197          <para1>Invisible text in Emacs Info.</para1>
1198        </menuholder>
1199        <menuholder>
1200          <a href="xref:Help-M">Help-M</a>
1201          <para1>Menus.</para1>
1202        </menuholder>
1203        ...
1204      </menu>
1205
1206      (from the top page of info:info). Note the absence of *'s and
1207      ::'s on the links.
1208
1209      If there's a line with no "* Blah::", it looks like a child of
1210      the previous menu item so (for i > 0) deal with that correctly by
1211      not "closing" the <menuholder> tag until we find the next
1212      start.
1213   */
1214
1215   if (menuitems[0] != NULL) {
1216     /* If there are any menu items, make the <menu> node */
1217     menu_node = xmlNewChild (newnode, NULL, BAD_CAST "menu", NULL);
1218   }
1219
1220   while (menuitems[i] != NULL) {
1221     gboolean menu = FALSE;
1222     gchar *title = NULL;
1223     gchar *ref = NULL;
1224     gchar *desc = NULL;
1225     gchar *xref = NULL;
1226     gchar *link_text = NULL;
1227     xmlNodePtr ref1;
1228
1229     menu = get_menuoptions (menuitems[i], &title, &ref, &desc, &xref);
1230
1231     if (menu && (*title == '\0' || *(title + 1) == '\0')) {
1232       g_warning ("Info title unexpectedly short for menu item (%s)",
1233                  menuitems[i]);
1234       menu = FALSE;
1235     }
1236
1237     if (menu) {
1238       mholder = xmlNewChild (menu_node, NULL, BAD_CAST "menuholder", NULL);
1239       gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &xref);
1240
1241       if (ref == NULL) { /* A standard type menu */
1242         /* title+2 skips the "* ". We know we haven't jumped over the
1243            end of the string because strlen (title) >= 3 */
1244         link_text = g_strdup (title+2);
1245
1246         ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
1247                                 BAD_CAST link_text);
1248
1249         tmp = g_strconcat ("xref:", xref, NULL);
1250         xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
1251         g_free (tmp);
1252       } else { /* Indexy type menu  - we gotta do a  little work to fix the
1253                 * spacing
1254                 */
1255         gchar *spacing = ref;
1256         gint c=0;
1257         gchar *sp = NULL;
1258
1259         while (*spacing ==' ') {
1260           c++;
1261           spacing++;
1262         }
1263         sp = g_strndup (ref, c);
1264
1265         link_text = g_strdup (title);
1266
1267         ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
1268                                 BAD_CAST link_text);
1269         tmp = g_strconcat ("xref:", xref, NULL);
1270         xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
1271         g_free (tmp);
1272         xmlNewTextChild (mholder, NULL, BAD_CAST "spacing",
1273                          BAD_CAST sp);
1274         tmp = g_strconcat (g_strstrip(ref), ".", NULL);
1275         ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
1276                                 BAD_CAST tmp);
1277         g_free (tmp);
1278         tmp = g_strconcat ("xref:", xref, NULL);
1279         xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
1280
1281         g_free (tmp);
1282         g_free (sp);
1283       }
1284
1285       tmp = g_strconcat ("\n", first_non_space (desc), NULL);
1286
1287       /*
1288         Don't print the link text a second time, because that looks
1289         really stupid.
1290
1291         We don't do a straight check for equality because lots of
1292         .info files have something like
1293
1294           * Foo::    Foo.
1295
1296         Obviously if the longer explanation has more afterwards, we
1297         don't want to omit it, which is why there's the strlen test.
1298       */
1299       if (strncmp (link_text, tmp + 1, strlen (link_text)) ||
1300           strlen (link_text) + 1 < strlen (tmp + 1)) {
1301         xmlNewTextChild (mholder, NULL,
1302                          BAD_CAST "para1", BAD_CAST tmp);
1303       }
1304
1305       g_free (tmp);
1306       g_free (link_text);
1307     }
1308     else if (*(menuitems[i]) != '\0') {
1309       tmp = g_strconcat ("\n", first_non_space (menuitems[i]), NULL);
1310       xmlNewTextChild (mholder ? mholder : menu_node,
1311                        NULL, BAD_CAST "para1",
1312                        BAD_CAST tmp);
1313       g_free (tmp);
1314     }
1315     i++;
1316     g_free (title);
1317     g_free (ref);
1318     g_free (desc);
1319     g_free (xref);
1320
1321   }
1322   g_strfreev (menuitems);
1323
1324   return newnode;
1325 }
1326
1327 void
1328 info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
1329 {
1330   gchar **notes;
1331   gchar **current;
1332   xmlNodePtr ref1;
1333   xmlNodePtr paragraph = NULL;
1334   gboolean first = TRUE;
1335
1336   /*
1337     Split using the regular expression
1338
1339       \*[Nn]ote(?!_)
1340
1341     which deals with either case and the last bit is a lookahead so
1342     that we don't split on things of the form *Note:_, which aren't
1343     real notes.
1344   */
1345   notes = g_regex_split_simple ("\\*[Nn]ote(?!_)", content, 0, 0);
1346
1347   for (current = notes; *current != NULL; current++) {
1348     gchar *url, **urls;
1349     gchar *append;
1350     gchar *alt_append, *alt_append1;
1351     gchar *link_text;
1352     gchar *href = NULL;
1353     gchar *break_point = NULL;
1354     gboolean broken = FALSE;
1355     if (first) {
1356       /* The first node is special.  It doesn't have a note ref at the
1357        * start, so we can just add it and forget about it.
1358        */
1359       first = FALSE;
1360       info_body_text (*node, &paragraph, NULL, TRUE, (*current));
1361       continue;
1362     }
1363
1364     /* If we got to here, we now gotta parse the note reference */
1365     append = strchr (*current, ':');
1366     if (!append) {
1367       info_body_text (*node, &paragraph, NULL, TRUE, *current);
1368       continue;
1369     }
1370     append++;
1371     alt_append = append;
1372     alt_append1 = alt_append;
1373     append = strchr (append, ':');
1374     alt_append = strchr (alt_append, '.');
1375     if (alt_append && g_str_has_prefix (alt_append, ".info")) {
1376       broken = TRUE;
1377       alt_append++;
1378       alt_append = strchr (alt_append, '.');
1379     }
1380     alt_append1 = strchr (alt_append1, ',');
1381     if (!append && !alt_append && !alt_append1) {
1382       info_body_text (*node, &paragraph, NULL, TRUE, *current);
1383       continue;
1384     }
1385     if (!append || alt_append || alt_append1) {
1386       if (!append) {
1387         if (alt_append) append = alt_append;
1388         else append = alt_append1;
1389       }
1390       if ((alt_append && alt_append < append))
1391         append = alt_append;
1392       if (alt_append1 && alt_append1 < append)
1393         append = alt_append1;
1394     }
1395     append++;
1396     url = g_strndup (*current, append - (*current));
1397
1398     /* Save a copy of the unadulterated link text for later. */
1399     link_text = g_strconcat ("*Note", url, NULL);
1400
1401     /* By now, we got 2 things.  First, is append which is the (hopefully)
1402      * non-link text.  Second, we got a url.
1403      * The url can be in several forms:
1404      * 1. linkend::
1405      * 2. linkend:(infofile)Linkend.
1406      * 3. Title: Linkend.
1407      * 4. Title: Linkend, (pretty sure this is just broken)
1408      * 5. Title: (infofile.info)Linkend.
1409      * All possibilities should have been picked up.
1410      * Here:
1411      * Clean up the split.  Should be left with a real url and
1412      * a list of fragments that should be linked
1413      * Also goes through and removes extra spaces, leaving only one
1414      * space in place of many
1415      */
1416     urls = g_strsplit (url, "\n", -1);
1417     break_point = strchr (url, '\n');
1418     while (break_point) {
1419       *break_point = ' ';
1420       break_point = strchr (++break_point, '\n');
1421     }
1422     break_point = strchr (url, ' ');
1423     while (break_point) {
1424       if (*(break_point+1) == ' ') {
1425         /* Massive space.  Fix. */
1426         gchar *next = break_point;
1427         gchar *url_copy;
1428         gchar *old = url;
1429         while (*next == ' ')
1430           next++;
1431         next--;
1432         url_copy = g_strndup (url, break_point-url);
1433         url = g_strconcat (url_copy, next, NULL);
1434         g_free (old);
1435         break_point = strchr (url, ' ');
1436         g_free (url_copy);
1437       } else {
1438         break_point++;
1439         break_point = strchr (break_point, ' ');
1440       }
1441     }
1442     if (url[strlen(url)-1] == '.') { /* The 2nd or 3rd sort of link */
1443       gchar *stop = NULL;
1444       gchar *lurl = NULL;
1445       gchar *zloc = NULL;
1446       stop = strchr (url, ':');
1447       lurl = strchr (stop, '(');
1448       if (!lurl) { /* 3rd type of link */
1449         gchar *link;
1450         gint length;
1451         stop++;
1452         link = g_strdup (stop);
1453         link = g_strstrip (link);
1454         length = strlen (link) - 1;
1455         link[length] = '\0';
1456         href = g_strconcat ("xref:", link, NULL);
1457         link[length] = 'a';
1458         g_free (link);
1459
1460
1461       } else { /* 2nd type of link.  Easy. Provided .info is neglected ;) */
1462         if (broken) {
1463           gchar *new_url;
1464           gchar *info;
1465           gchar *stripped;
1466
1467           new_url = g_strdup (lurl);
1468           info = strstr (new_url, ".info)");
1469           stripped = g_strndup (new_url, info-new_url);
1470           info +=5;
1471           lurl = g_strconcat (stripped, info, NULL);
1472           g_free (stripped);
1473           g_free (new_url);
1474         }
1475         zloc = &(lurl[strlen(lurl)-1]);
1476         *zloc = '\0';
1477         href = g_strconcat ("info:", lurl, NULL);
1478         *zloc = 'a';
1479       }
1480     } else { /* First kind of link */
1481       gchar *tmp1;
1482       gchar *frag;
1483
1484       tmp1 = strchr (url, ':');
1485       if (!tmp1)
1486         frag = g_strdup (url);
1487       else
1488         frag = g_strndup (url, tmp1 - url);
1489       g_strstrip (frag);
1490       gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &frag);
1491       href = g_strconcat ("xref:", frag, NULL);
1492       g_free (frag);
1493     }
1494
1495     /* Check we've got a valid paragraph node */
1496     if (!paragraph) {
1497       paragraph = xmlNewChild (*node, NULL, BAD_CAST "para", NULL);
1498     }
1499
1500     /*
1501       Now we're supposed to actually render the link. I have a list of
1502       bits of URL and actually this is really easy - I want to have
1503       the link *text* exactly the same as it appeared in the .info
1504       file, so don't use the list of strings urls, instead use the
1505       whole lot: url (complete with embedded newlines etc.)
1506     */
1507     ref1 = xmlNewTextChild (paragraph, NULL, BAD_CAST "a",
1508                             BAD_CAST link_text);
1509     g_free (link_text);
1510     xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
1511
1512     g_strfreev (urls);
1513
1514     /* Finally, we can add the following text as required */
1515     info_body_text (*node, &paragraph, NULL, TRUE, append);
1516
1517     g_free (url);
1518     g_free (href);
1519   }
1520   g_strfreev (notes);
1521 }