libyelp/yelp-info-parser.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil -*- */
   2 /*
   3  * Copyright (C) 2005 Davyd Madeley <davyd@madeley.id.au>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public
  16  * License along with this program; if not, write to the
  17  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18  * Boston, MA 02111-1307, USA.
  19  *
  20  * Author: Davyd Madeley  <davyd@madeley.id.au>
  21  */
  22
  23 #ifdef HAVE_CONFIG_H
  24 #include <config.h>
  25 #endif
  26
  27 #include <glib.h>
  28 #include <gtk/gtk.h>
  29 #include <string.h>
  30
  31 #include "yelp-info-parser.h"
  32 #include "yelp-magic-decompressor.h"
  33 #include "yelp-debug.h"
  34
  35
  36 GtkTreeIter *         find_real_top                      (GtkTreeModel *model,
  37                                                           GtkTreeIter *it);
  38 GtkTreeIter *         find_real_sibling                  (GtkTreeModel *model,
  39                                                           GtkTreeIter *it,
  40                                                           GtkTreeIter *comp);
  41 xmlNodePtr            yelp_info_parse_menu               (GtkTreeStore *tree,
  42                                                           xmlNodePtr *node,
  43                                                           gchar *page_content,
  44                                                           gboolean notes);
  45 gboolean              get_menuoptions                    (gchar *line,
  46                                                           gchar **title,
  47                                                           gchar **ref,
  48                                                           gchar **desc,
  49                                                           gchar **xref);
  50 gboolean              resolve_frag_id                    (GtkTreeModel *model,
  51                                                           GtkTreePath *path,
  52                                                           GtkTreeIter *iter,
  53                                                           gpointer data);
  54 void                  info_process_text_notes            (xmlNodePtr *node,
  55                                                           gchar *content,
  56                                                           GtkTreeStore
  57                                                           *tree);
  58
  59 /*
  60   Used to output the correct <heading level="?" /> tag.
  61  */
  62 static const gchar* level_headings[] = { NULL, "1", "2", "3" };
  63
  64 static GHashTable *
  65 info_image_get_attributes (gchar const* string)
  66 {
  67   GMatchInfo *match_info;
  68   GRegex *regex;
  69   GHashTable *h;
  70
  71   h = 0;
  72   regex = g_regex_new ("([^\\s][^\\s=]+)=(?:([^\\s \"]+)|(?:\"((?:[^\\\"]|\\\\[\\\\\"])*)\"))", 0, 0, NULL);
  73   g_regex_match (regex, string, 0, &match_info);
  74   while (g_match_info_matches (match_info))
  75     {
  76       gchar *key;
  77       gchar *value;
  78
  79       if (!h)
  80         h = g_hash_table_new (g_str_hash, g_str_equal);
  81       key = g_match_info_fetch (match_info, 1);
  82       value = g_match_info_fetch (match_info, 2);
  83       if (!*value)
  84         value = g_match_info_fetch (match_info, 3);
  85       g_hash_table_insert (h, key, value);
  86       g_match_info_next (match_info, NULL);
  87     }
  88   g_match_info_free (match_info);
  89   g_regex_unref (regex);
  90
  91   return h;
  92 }
  93
  94 /*
  95   info elements look like \0\b[<TAGNAME>\0\b] and take attribute=value
  96   pairs, i.e. for image: \0\b[image src="foo.png" \0\b]
  97 */
  98 #define INFO_TAG_0 "\0"
  99 #define INFO_TAG_1 "\b"
 100 #define INFO_TAG_OPEN_2 INFO_TAG_1 "["
 101 #define INFO_TAG_CLOSE_2 INFO_TAG_1 "]"
 102 #define INFO_TAG_OPEN_2_RE INFO_TAG_1 "[[]"
 103 #define INFO_TAG_CLOSE_2_RE INFO_TAG_1 "[]]"
 104 #define INFO_TAG_OPEN INFO_TAG_0 INFO_TAG_1 INFO_TAG_OPEN_2
 105 #define INFO_TAG_CLOSE INFO_TAG_0 INFO_TAG_1 INFO_TAG_CLOSE_2
 106 #define INFO_TAG_OPEN_RE INFO_TAG_0 INFO_TAG_1 INFO_TAG_OPEN_2_RE
 107 #define INFO_TAG_CLOSE_RE INFO_TAG_0 INFO_TAG_1 INFO_TAG_CLOSE_2_RE
 108 /* C/glib * cannot really handle \0 in strings, convert to '@' */
 109 #define INFO_C_TAG_0 "@"
 110 #define INFO_C_TAG_OPEN INFO_C_TAG_0 INFO_TAG_OPEN_2
 111 #define INFO_C_TAG_CLOSE INFO_C_TAG_0 INFO_TAG_CLOSE_2
 112 #define INFO_C_TAG_OPEN_RE INFO_C_TAG_0 INFO_TAG_OPEN_2_RE
 113 #define INFO_C_TAG_CLOSE_RE INFO_C_TAG_0 INFO_TAG_CLOSE_2_RE
 114 #define INFO_C_IMAGE_TAG_OPEN INFO_C_TAG_OPEN "image"
 115 #define INFO_C_IMAGE_TAG_OPEN_RE INFO_C_TAG_OPEN_RE "image"
 116
 117 static xmlNodePtr
 118 info_insert_image (xmlNodePtr parent, GMatchInfo *match_info)
 119 {
 120   GHashTable *h = info_image_get_attributes (g_match_info_fetch (match_info, 1));
 121   gchar *source;
 122   if (h)
 123     source = (gchar*)g_hash_table_lookup (h, "src");
 124
 125   if (!h || !source || !*source)
 126     return xmlNewTextChild (parent, NULL, BAD_CAST "para",
 127                             BAD_CAST "[broken image]");
 128
 129   gchar *title = (gchar*)g_hash_table_lookup (h, "title");
 130   gchar *text = (gchar*)g_hash_table_lookup (h, "text");
 131   gchar *alt = (gchar*)g_hash_table_lookup (h, "alt");
 132   g_hash_table_destroy (h);
 133   xmlNodePtr img = xmlNewChild (parent, NULL, BAD_CAST "img", NULL);
 134   xmlNewProp (img, BAD_CAST "src", BAD_CAST source);
 135   xmlNewProp (img, BAD_CAST "title", BAD_CAST (title ? title : ""));
 136   xmlNewProp (img, BAD_CAST "text", BAD_CAST (text ? text : ""));
 137   xmlNewProp (img, BAD_CAST "alt", BAD_CAST (alt ? alt : ""));
 138   g_free (source);
 139   g_free (title);
 140   g_free (alt);
 141   return parent;
 142 }
 143
 144 /*
 145   If every element of `str' is `ch' then return TRUE, else FALSE.
 146  */
 147 static gboolean
 148 string_all_char_p (const gchar* str, gchar ch)
 149 {
 150   for (; *str; str++) {
 151     if (*str != ch) return FALSE;
 152   }
 153   return TRUE;
 154 }
 155
 156 /*
 157   If `line' is a line of '*', '=' or '-', return 1,2,3 respectively
 158   for the heading level. If it's anything else, return 0.
 159  */
 160 static int
 161 header_underline_level (const gchar* line)
 162 {
 163   if (*line != '*' && *line != '=' && *line != '-')
 164     return 0;
 165
 166   if (string_all_char_p (line, '*')) return 1;
 167   if (string_all_char_p (line, '=')) return 2;
 168   if (string_all_char_p (line, '-')) return 3;
 169
 170   return 0;
 171 }
 172
 173 /*
 174   Use g_strjoinv to join up the strings from `strings', but they might
 175   not actually be a null-terminated array. `end' should be strings+n,
 176   where I want the first n strings (strings+0, ..., strings+(n-1)). It
 177   shouldn't point outside of the array allocated, but it can point at
 178   the null string at the end.
 179  */
 180 static gchar*
 181 join_strings_subset (const gchar *separator,
 182                      gchar** strings, gchar** end)
 183 {
 184   g_assert(end > strings);
 185
 186   gchar *ptr = *end;
 187   *end = NULL;
 188
 189   gchar *glob = g_strjoinv (separator, strings);
 190   *end = ptr;
 191   return glob;
 192 }
 193
 194 /*
 195   Create a text node, child of `parent', with the lines strictly
 196   between `first' and `last'.
 197 */
 198 static void
 199 lines_subset_text_child (xmlNodePtr parent, xmlNsPtr ns,
 200                          gchar** first, gchar** last)
 201 {
 202   /* TODO? Currently we're copying the split strings again, which is
 203      less efficient than somehow storing lengths and using a sort of
 204      window on `content'. But that's much more difficult, so unless
 205      there's a problem, let's go with the stupid approach. */
 206   gchar *glob;
 207
 208   if (last > first) {
 209     glob = join_strings_subset ("\n", first, last);
 210     xmlAddChild (parent, xmlNewText (BAD_CAST glob));
 211     g_free (glob);
 212   }
 213 }
 214
 215 /*
 216   Convert body text CONTENT to xml nodes. This function is responsible
 217   for spotting headings etc and splitting them out correctly.
 218
 219   paragraph is as described in info_body_text, but cannot be null.
 220
 221   If `inline_p' is true, end with a <para1> tag. Otherwise, end with a
 222   <para> tag.
 223
 224   TODO: IWBN add a regex match for *Note: here and call the *Note ==>
 225   <a href> logic of info_process_text_notes from here.
 226  */
 227 static void
 228 info_body_parse_text (xmlNodePtr parent, xmlNodePtr *paragraph,
 229                       xmlNsPtr ns,
 230                       gboolean inline_p, const gchar *content)
 231 {
 232   /* The easiest things to spot are headings: they look like a line of
 233    * '*','=' or '-', corresponding to heading levels 1,2 or 3. To spot
 234    * them, we split content into single lines and work with them. */
 235   gchar **lines = g_strsplit (content, "\n", 0);
 236   gchar **first = lines, **last = lines;
 237   int header_level;
 238   xmlNodePtr header_node;
 239
 240   /* Deal with the possibility that `content' is empty */
 241   if (*lines == NULL) {
 242     if (!inline_p) {
 243       xmlNewTextChild (parent, NULL, BAD_CAST "para", BAD_CAST "");
 244     }
 245     return;
 246   }
 247
 248   /* Use a pair of pointers, first and last, which point to two lines,
 249    * the chunk of the body we're displaying (inclusive) */
 250   for (; *last; last++) {
 251
 252     /* Check for a blank line */
 253     if (**last == '\0') {
 254       if (last != first) {
 255         if (!*paragraph) {
 256           *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
 257         }
 258         lines_subset_text_child (*paragraph, ns, first, last);
 259       }
 260       /* On the next iteration, last==first both pointing at the next
 261          line. */
 262       first = last+1;
 263       *paragraph = NULL;
 264
 265       continue;
 266     }
 267
 268     /* Check for a header */
 269     header_level = header_underline_level (*last);
 270     if (header_level) {
 271       /* Write out any lines beforehand */
 272       lines_subset_text_child (parent, ns, first, last-1);
 273       /* Now write out the actual header line */
 274       header_node = xmlNewTextChild (parent, ns, BAD_CAST "header",
 275                                      BAD_CAST *(last-1));
 276       xmlNewProp (header_node, BAD_CAST "level",
 277                   BAD_CAST level_headings[header_level]);
 278
 279       first = last+1;
 280       last = first-1;
 281     }
 282   }
 283
 284   /* Write out any lines left */
 285   if (!*paragraph) {
 286     *paragraph = xmlNewChild (parent, ns, BAD_CAST "para", NULL);
 287   }
 288   lines_subset_text_child (*paragraph, ns, first, last);
 289
 290   g_strfreev (lines);
 291 }
 292
 293 /*
 294   info_body_text is responsible for taking a hunk of the info page's
 295   body and turning it into paragraph tags. It searches out images and
 296   marks them up properly if necessary.
 297
 298   parent should be the node in which we're currently storing text and
 299   paragraph a pointer to a <para> tag or NULL. At blank lines, we
 300   finish with the current para tag and switch to a new one.
 301
 302   It uses info_body_parse_text to mark up the actual bits of text.
 303  */
 304 static void
 305 info_body_text (xmlNodePtr parent, xmlNodePtr *paragraph, xmlNsPtr ns,
 306                 gboolean inline_p, gchar const *content)
 307 {
 308   xmlNodePtr thepara = NULL;
 309   if (paragraph == NULL) paragraph = &thepara;
 310
 311   if (!strstr (content, INFO_C_IMAGE_TAG_OPEN)) {
 312     info_body_parse_text (parent, paragraph, ns, inline_p, content);
 313     return;
 314   }
 315
 316   gint content_len = strlen (content);
 317   gint pos = 0;
 318   GRegex *regex = g_regex_new ("(" INFO_C_IMAGE_TAG_OPEN_RE "((?:[^" INFO_TAG_1 "]|[^" INFO_C_TAG_0 "]+" INFO_TAG_1 ")*)" INFO_C_TAG_CLOSE_RE ")", 0, 0, NULL);
 319   GMatchInfo *match_info;
 320
 321   g_regex_match (regex, content, 0, &match_info);
 322   while (g_match_info_matches (match_info))
 323     {
 324       gint image_start;
 325       gint image_end;
 326       gboolean image_found = g_match_info_fetch_pos (match_info, 0,
 327                                                      &image_start, &image_end);
 328       gchar *before = g_strndup (&content[pos], image_start - pos);
 329       pos = image_end + 1;
 330       info_body_parse_text (parent, paragraph, NULL, TRUE, before);
 331       g_free (before);
 332
 333       /* End the paragraph that was before */
 334       *paragraph = NULL;
 335
 336       if (image_found)
 337         info_insert_image (parent, match_info);
 338       g_match_info_next (match_info, NULL);
 339     }
 340   gchar *after = g_strndup (&content[pos], content_len - pos);
 341   info_body_parse_text (parent, paragraph, NULL, TRUE, after);
 342   g_free (after);
 343 }
 344
 345 /* Part 1: Parse File Into Tree Store */
 346
 347 enum
 348 {
 349         PAGE_TAG_TABLE,
 350         PAGE_NODE,
 351         PAGE_INDIRECT,
 352         PAGE_OTHER
 353 };
 354
 355 static int
 356 page_type (char *page)
 357 {
 358   if (g_ascii_strncasecmp (page, "Tag Table:\n", 11) == 0)
 359     return PAGE_TAG_TABLE;
 360   else if (g_ascii_strncasecmp (page, "Indirect:\n", 10) == 0)
 361     return PAGE_INDIRECT;
 362   else if (g_ascii_strncasecmp (page, "File:", 5) == 0 ||
 363            g_ascii_strncasecmp (page, "Node:", 5) == 0)
 364     return PAGE_NODE;
 365
 366   else
 367     return PAGE_OTHER;
 368 }
 369
 370 static char
 371 *open_info_file (const gchar *file)
 372 {
 373     GFile *gfile;
 374     GConverter *converter;
 375     GFileInputStream *file_stream;
 376     GInputStream *stream;
 377     gchar buf[1024];
 378     gssize bytes;
 379     GString *string;
 380     gchar *str;
 381     int i;
 382
 383     gfile = g_file_new_for_path (file);
 384     file_stream = g_file_read (gfile, NULL, NULL);
 385     converter = (GConverter *) yelp_magic_decompressor_new ();
 386     stream = g_converter_input_stream_new ((GInputStream *) file_stream, converter);
 387     string = g_string_new (NULL);
 388
 389     while ((bytes = g_input_stream_read (stream, buf, 1024, NULL, NULL)) > 0)
 390         g_string_append_len (string, buf, bytes);
 391
 392     g_object_unref (stream);
 393
 394     str = string->str;
 395
 396     /* C/glib * cannot really handle \0 in strings, convert. */
 397     for (i = 0; i < (string->len - 1); i++)
 398         if (str[i] == INFO_TAG_OPEN[0] && str[i+1] == INFO_TAG_OPEN[1])
 399             str[i] = INFO_C_TAG_OPEN[0];
 400
 401     g_string_free (string, FALSE);
 402
 403     return str;
 404 }
 405
 406 static gchar *
 407 find_info_part (gchar *part_name, const gchar *base)
 408 {
 409   /* New and improved.  We now assume that all parts are
 410    * in the same subdirectory as the base file.  Makes
 411    * life much simpler and is (afaict) always true
 412    */
 413   gchar *path;
 414   gchar *tmp;
 415   gchar *bzfname, *gzfname, *lzfd, *fname;
 416   gchar *uri = NULL;
 417   tmp = g_strrstr (base, "/");
 418   path = g_strndup (base, tmp-base);
 419
 420   bzfname = g_strconcat (path, "/", part_name, ".bz2", NULL);
 421   gzfname = g_strconcat (path, "/", part_name, ".gz", NULL);
 422   lzfd = g_strconcat (path, "/", part_name, ".lzma", NULL);
 423   fname = g_strconcat (path, "/", part_name, NULL);
 424
 425   if (g_file_test (bzfname, G_FILE_TEST_EXISTS))
 426     uri = g_strdup (bzfname);
 427   else if (g_file_test (gzfname, G_FILE_TEST_EXISTS))
 428     uri = g_strdup (gzfname);
 429   else if (g_file_test (lzfd, G_FILE_TEST_EXISTS))
 430     uri = g_strdup (lzfd);
 431   else if (g_file_test (fname, G_FILE_TEST_EXISTS))
 432     uri = g_strdup (fname);
 433
 434   g_free (bzfname);
 435   g_free (gzfname);
 436   g_free (lzfd);
 437   g_free (fname);
 438   g_free (path);
 439   return uri;
 440
 441 }
 442
 443 static char
 444 *process_indirect_map (char *page, const gchar *file)
 445 {
 446         char **lines;
 447         char **ptr;
 448         char *composite = NULL;
 449         size_t composite_len = 0;
 450
 451         lines = g_strsplit (page, "\n", 0);
 452
 453         /*
 454           Go backwards down the list so that we allocate composite
 455           big enough the first time around.
 456         */
 457         for (ptr = lines + 1; *ptr != NULL; ptr++);
 458         for (ptr--; ptr != lines; ptr--)
 459         {
 460                 char **items;
 461                 char *filename;
 462                 char *str;
 463                 char **pages;
 464                 int offset;
 465                 int plength;
 466
 467                 debug_print (DB_DEBUG, "Line: %s\n", *ptr);
 468                 items = g_strsplit (*ptr, ": ", 2);
 469
 470                 if (items[0])
 471                 {
 472                   filename = find_info_part (items[0], file);
 473                   str = open_info_file (filename);
 474                   if (!str) {
 475                         g_strfreev (items);
 476                         continue;
 477                   }
 478                         pages = g_strsplit (str, "\x1f", 2);
 479                         g_free (str);
 480                         if (!pages[1]) {
 481                                 g_strfreev (items);
 482                                 g_strfreev (pages);
 483                                 continue;
 484                         }
 485
 486                         offset =  atoi(items[1]);
 487                         plength = strlen(pages[1]);
 488
 489                         debug_print (DB_DEBUG, "Need to make string %s+%i bytes = %i\n",
 490                                     items[1], plength,
 491                                     offset + plength);
 492
 493                         if (!composite) /* not yet created, malloc it */
 494                         {
 495                                 composite_len = offset + plength;
 496                                 composite = g_malloc (sizeof (char) *
 497                                                       (composite_len + 1));
 498                                 memset (composite, '-', composite_len);
 499                                 composite[composite_len] = '\0';
 500                         }
 501
 502                         /* Because we're going down the list
 503                          * backwards, plength should always be short
 504                          * enough to fit in the memory allocated. But
 505                          * in case something's broken/malicious, we
 506                          * should check anyway.
 507                          */
 508                         if (offset > composite_len)
 509                           continue;
 510                         if (plength + offset + 1 > composite_len)
 511                           plength = composite_len - offset - 1;
 512
 513                         composite[offset] = '\x1f';
 514                         memcpy (composite + offset + 1, pages[1], plength);
 515
 516                         g_free (filename);
 517                         g_strfreev (pages);
 518                 }
 519
 520                 g_strfreev (items);
 521         }
 522
 523         g_strfreev (lines);
 524
 525         return composite;
 526 }
 527
 528 /*
 529   Open up the relevant info file and read it all into memory. If there
 530   is an indirect table thingy, we resolve that as we go.
 531
 532   Returns a NULL-terminated list of pointers to pages on success and
 533   NULL otherwise.
 534  */
 535 static gchar**
 536 expanded_info_file (const gchar *file)
 537 {
 538   gchar *slurp = open_info_file (file);
 539   gchar **page_list;
 540   gchar **page;
 541
 542   if (!slurp) return NULL;
 543
 544   /* TODO: There's a lot of copying of bits of memory here. With a bit
 545    * more effort we could avoid it. Either we should fix this or
 546    * measure the time taken and decide it's irrelevant...
 547    *
 548    * Note: \x1f\n is ^_\n
 549    */
 550   page_list = g_strsplit (slurp, "\x1f\n", 0);
 551
 552   g_free (slurp);
 553
 554   for (page = page_list; *page != NULL; page++) {
 555     if (page_type (*page) == PAGE_INDIRECT) {
 556
 557       slurp = process_indirect_map (*page, file);
 558       g_strfreev (page_list);
 559
 560       if (!slurp)
 561         return NULL;
 562
 563       page_list = g_strsplit (slurp, "\x1f\n", 0);
 564       g_free (slurp);
 565       break;
 566     }
 567   }
 568
 569   return page_list;
 570 }
 571
 572 /*
 573   Look for strings in source by key. For example, we extract "blah"
 574   from "Node: blah," when the key is "Node: ". To know when to stop,
 575   there are two strings: end and cancel.
 576
 577   If we find a character from end first, return a copy of the string
 578   up to (not including) that character. If we find a character of
 579   cancel first, return NULL. If we find neither, return the rest of
 580   the string.
 581
 582   cancel can be NULL, in which case, we don't do its test.
 583  */
 584 static char*
 585 get_value_after_ext (const char *source, const char *key,
 586                      const char *end, const char *cancel)
 587 {
 588   char *start;
 589   size_t not_end, not_cancel;
 590
 591   start = strstr (source, key);
 592   if (!start) return NULL;
 593
 594   start += strlen (key);
 595
 596   not_end = strcspn (start, end);
 597   not_cancel = (cancel) ? strcspn (start, cancel) : not_end + 1;
 598
 599   if (not_cancel < not_end)
 600     return NULL;
 601
 602   return g_strndup (start, not_end);
 603 }
 604
 605 static char*
 606 get_value_after (const char* source, const char *key)
 607 {
 608   return get_value_after_ext (source, key, ",", "\n\x7f");
 609 }
 610
 611 static int
 612 node2page (GHashTable *nodes2pages, char *node)
 613 {
 614   gpointer p;
 615
 616   if (g_hash_table_lookup_extended (nodes2pages, node,
 617                                     NULL, &p))
 618     return GPOINTER_TO_INT(p);
 619
 620   /* This shouldn't happen: we should only ever have to look up pages
 621    * that exist. */
 622   g_return_val_if_reached (0);
 623 }
 624
 625 static GtkTreeIter
 626 *node2iter (GHashTable *nodes2iters, char *node)
 627 {
 628         GtkTreeIter *iter;
 629
 630         iter = g_hash_table_lookup (nodes2iters, node);
 631         d (if (!iter) debug_print (DB_WARN, "Could not retrieve iter for node !%s!\n", node));
 632         return iter;
 633 }
 634
 635 GtkTreeIter
 636 *find_real_top (GtkTreeModel *model, GtkTreeIter *it)
 637 {
 638   GtkTreeIter *r = NULL;
 639   GtkTreeIter *tmp = NULL;
 640
 641   if (!it)
 642     return NULL;
 643
 644   r = gtk_tree_iter_copy (it);
 645   tmp = g_malloc0 (sizeof (GtkTreeIter));
 646   while (gtk_tree_model_iter_parent (model, tmp, r)) {
 647     gtk_tree_iter_free (r);
 648     r = gtk_tree_iter_copy (tmp);
 649   }
 650   g_free (tmp);
 651
 652   return r;
 653 }
 654
 655 GtkTreeIter * find_real_sibling (GtkTreeModel *model,
 656                                  GtkTreeIter *it, GtkTreeIter *comp)
 657 {
 658   GtkTreeIter *r;
 659   GtkTreeIter *tmp = NULL;
 660   gboolean result = FALSE;
 661   gchar *title;
 662   gchar *reftitle;
 663
 664   if (!it) {
 665     return NULL;
 666   }
 667
 668   r = gtk_tree_iter_copy (it);
 669   tmp = gtk_tree_iter_copy (it);
 670
 671   reftitle = gtk_tree_model_get_string_from_iter (model, comp);
 672
 673   result = gtk_tree_model_iter_parent (model, r, it);
 674   if (!result)
 675     return it;
 676
 677   title = gtk_tree_model_get_string_from_iter (model, r);
 678
 679   while (!g_str_equal (title, reftitle) && result) {
 680     gtk_tree_iter_free (tmp);
 681     tmp = gtk_tree_iter_copy (r);
 682     result = gtk_tree_model_iter_parent (model, r, tmp);
 683     if (result)
 684       title = gtk_tree_model_get_string_from_iter (model, r);
 685   }
 686
 687   if (!g_str_equal (title, reftitle))
 688     {
 689       gtk_tree_iter_free (tmp);
 690       tmp = NULL;
 691     }
 692
 693   gtk_tree_iter_free (r);
 694   g_free (title);
 695   g_free (reftitle);
 696   return tmp;
 697
 698 }
 699
 700 static void
 701 process_page (GtkTreeStore *tree,
 702               GHashTable *nodes2pages, GHashTable *nodes2iters,
 703               int *processed_table, char **page_list, char *page_text)
 704 {
 705         GtkTreeIter *iter;
 706
 707         char **parts;
 708         char *node;
 709         char *up;
 710         char *prev;
 711         char *next;
 712         gchar *tmp;
 713
 714         int page;
 715
 716         /* split out the header line and the text */
 717         parts = g_strsplit (page_text, "\n", 3);
 718
 719         node = get_value_after (parts[0], "Node: ");
 720         up = get_value_after (parts[0], "Up: ");
 721         prev = get_value_after (parts[0], "Prev: ");
 722         next = get_value_after (parts[0], "Next: ");
 723
 724         if (next && g_str_equal (next, "Top")) {
 725           g_free (next);
 726           next = NULL;
 727         }
 728         if (g_str_equal (node, "Top") && prev != NULL) {
 729           g_free (prev);
 730           prev = NULL;
 731         }
 732
 733         /* check to see if this page has been processed already */
 734         page = node2page (nodes2pages, node);
 735         if (processed_table[page]) {
 736                 return;
 737         }
 738         processed_table[page] = 1;
 739
 740         debug_print (DB_DEBUG, "-- Processing Page %s\n\tParent: %s\n", node, up);
 741
 742         iter = g_slice_alloc0 (sizeof (GtkTreeIter));
 743         /* check to see if we need to process our parent and siblings */
 744         if (up && g_ascii_strncasecmp (up, "(dir)", 5) && strcmp (up, "Top"))
 745         {
 746                 page = node2page (nodes2pages, up);
 747                 if (!processed_table[page])
 748                 {
 749                   debug_print (DB_DEBUG, "%% Processing Node %s\n", up);
 750                   process_page (tree, nodes2pages,
 751                                 nodes2iters, processed_table, page_list,
 752                                 page_list[page]);
 753                 }
 754         }
 755         if (prev && g_ascii_strncasecmp (prev, "(dir)", 5))
 756           {
 757             if (strncmp (node, "Top", 3)) {
 758               /* Special case the Top node to always appear first */
 759             } else {
 760               page = node2page (nodes2pages, prev);
 761               if (!processed_table[page])
 762                 {
 763                   debug_print (DB_DEBUG, "%% Processing Node %s\n", prev);
 764                   process_page (tree, nodes2pages,
 765                                 nodes2iters, processed_table, page_list,
 766                                 page_list[page]);
 767                 }
 768             }
 769           }
 770
 771         /* by this point our parent and older sibling should be processed */
 772         if (!up || !g_ascii_strcasecmp (up, "(dir)"))
 773         {
 774           debug_print (DB_DEBUG, "\t> no parent\n");
 775                 if (!prev || !g_ascii_strcasecmp (prev, "(dir)"))
 776                 {
 777                   debug_print (DB_DEBUG, "\t> no previous\n");
 778                         gtk_tree_store_append (tree, iter, NULL);
 779                 }
 780                 else if (prev) {
 781                   GtkTreeIter *real;
 782                   real = find_real_top (GTK_TREE_MODEL (tree),
 783                                         node2iter (nodes2iters, prev));
 784                   if (real) {
 785                     gtk_tree_store_insert_after (tree, iter, NULL,
 786                                                  real);
 787                     gtk_tree_iter_free (real);
 788                   }
 789                   else
 790                     gtk_tree_store_append (tree, iter, NULL);
 791                 }
 792         }
 793         else if (!prev || !g_ascii_strcasecmp (prev, "(dir)") || !strcmp (prev, up))
 794         {
 795           debug_print (DB_DEBUG, "\t> no previous\n");
 796                 gtk_tree_store_append (tree, iter,
 797                         node2iter (nodes2iters, up));
 798         }
 799         else if (up && prev)
 800         {
 801           GtkTreeIter *upit = node2iter (nodes2iters, up);
 802           GtkTreeIter *previt = node2iter (nodes2iters, prev);
 803           GtkTreeIter *nit = NULL;
 804           debug_print (DB_DEBUG, "+++ Parent: %s Previous: %s\n", up, prev);
 805
 806           d (if (upit) debug_print (DB_DEBUG, "++++ Have parent node!\n"));
 807           d (if (previt) debug_print (DB_DEBUG, "++++ Have previous node!\n"));
 808           nit = find_real_sibling (GTK_TREE_MODEL (tree), previt, upit);
 809           if (nit) {
 810             gtk_tree_store_insert_after (tree, iter,
 811                                          upit,
 812                                          nit);
 813             gtk_tree_iter_free (nit);
 814           }
 815           else
 816             gtk_tree_store_append (tree, iter, upit);
 817         }
 818         else
 819         {
 820           debug_print (DB_DEBUG, "# node %s was not put in tree\n", node);
 821           return;
 822         }
 823
 824         d (if (iter) debug_print (DB_DEBUG, "Have a valid iter, storing for %s\n", node));
 825
 826         g_hash_table_insert (nodes2iters, g_strdup (node), iter);
 827         debug_print (DB_DEBUG, "size: %i\n", g_hash_table_size (nodes2iters));
 828
 829         /*tmp = g_strdup_printf ("%i",
 830           node2page (nodes2pages, node));*/
 831         tmp = g_strdup (node);
 832         tmp = g_strdelimit (tmp, " ", '_');
 833         gtk_tree_store_set (tree, iter,
 834                             INFO_PARSER_COLUMN_PAGE_NO, tmp,
 835                             INFO_PARSER_COLUMN_PAGE_NAME, node,
 836                             INFO_PARSER_COLUMN_PAGE_CONTENT, parts[2],
 837                             -1);
 838
 839         g_free (tmp);
 840         g_free (node);
 841         g_free (up);
 842         g_free (prev);
 843         g_free (next);
 844         g_strfreev (parts);
 845 }
 846
 847 struct TagTableFix {
 848   GHashTable *nodes2pages; /* Build this... */
 849   GHashTable *pages2nodes; /* ... using this. */
 850 };
 851
 852 static void
 853 use_offset2page (gpointer o, gpointer p, gpointer ud)
 854 {
 855   struct TagTableFix* ttf = (struct TagTableFix*)ud;
 856
 857   const gchar* node = g_hash_table_lookup (ttf->pages2nodes, p);
 858   if (node) {
 859     g_hash_table_insert (ttf->nodes2pages, g_strdup (node), p);
 860   }
 861 }
 862
 863 /*
 864   We had a nodes2offsets hash table, but sometimes these things
 865   lie. How terribly rude. Anyway, use offsets2pages and pages2nodes
 866   (and injectivity!) to construct the nodes2pages hash table.
 867 */
 868 static GHashTable *
 869 make_nodes2pages (GHashTable* offsets2pages,
 870                   GHashTable* pages2nodes)
 871 {
 872   struct TagTableFix ttf;
 873
 874   ttf.nodes2pages =
 875     g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);
 876   ttf.pages2nodes = pages2nodes;
 877
 878   g_hash_table_foreach (offsets2pages, use_offset2page, &ttf);
 879
 880   return ttf.nodes2pages;
 881 }
 882
 883 /**
 884  * Parse file into a GtkTreeStore containing useful information that we can
 885  * later convert into a nice XML document or something else.
 886  */
 887 GtkTreeStore
 888 *yelp_info_parser_parse_file (char *file)
 889 {
 890         gchar **page_list;
 891         char **ptr;
 892         int pages;
 893         int offset;
 894         GHashTable *offsets2pages = NULL;
 895         GHashTable *pages2nodes = NULL;
 896         GHashTable *nodes2pages = NULL;
 897         GHashTable *nodes2iters = NULL;
 898         int *processed_table;
 899         GtkTreeStore *tree;
 900         int pt;
 901
 902         page_list = expanded_info_file (file);
 903         if (!page_list)
 904           return NULL;
 905
 906         pages = 0;
 907         offset = 0;
 908
 909         offsets2pages = g_hash_table_new_full (g_str_hash, g_str_equal, g_free,
 910                                                NULL);
 911         pages2nodes = g_hash_table_new_full (g_direct_hash, g_direct_equal, NULL,
 912                                              g_free);
 913
 914         for (ptr = page_list; *ptr != NULL; ptr++)
 915         {
 916           gchar *name = NULL;
 917
 918           g_hash_table_insert (offsets2pages,
 919                                g_strdup_printf ("%i", offset),
 920                                GINT_TO_POINTER (pages));
 921
 922           name = get_value_after (*ptr, "Node: ");
 923           if (name)
 924             g_hash_table_insert (pages2nodes,
 925                                  GINT_TO_POINTER (pages), name);
 926
 927           offset += strlen (*ptr);
 928           if (pages) offset += 2;
 929           pages++;
 930
 931           pt = page_type (*ptr);
 932           if (pt == PAGE_INDIRECT) {
 933             g_warning ("Found an indirect page in a file "
 934                        "we thought we'd expanded.");
 935           }
 936         }
 937
 938         /* Now consolidate (and correct) the two hash tables */
 939         nodes2pages = make_nodes2pages (offsets2pages, pages2nodes);
 940
 941         g_hash_table_destroy (offsets2pages);
 942         g_hash_table_destroy (pages2nodes);
 943
 944         processed_table = g_malloc0 (pages * sizeof (int));
 945         tree = gtk_tree_store_new (INFO_PARSER_N_COLUMNS, G_TYPE_STRING, G_TYPE_STRING,
 946                         G_TYPE_STRING);
 947         nodes2iters = g_hash_table_new_full (g_str_hash, g_str_equal, g_free,
 948                                              (GDestroyNotify) gtk_tree_iter_free);
 949
 950         pages = 0;
 951         for (ptr = page_list; *ptr != NULL; ptr++)
 952         {
 953           if (page_type (*ptr) != PAGE_NODE) continue;
 954           process_page (tree, nodes2pages, nodes2iters,
 955                         processed_table, page_list, *ptr);
 956         }
 957
 958         g_strfreev (page_list);
 959
 960         g_hash_table_destroy (nodes2iters);
 961         g_hash_table_destroy (nodes2pages);
 962
 963         g_free (processed_table);
 964
 965         return tree;
 966 }
 967
 968 /* End Part 1 */
 969 /* Part 2: Parse Tree into XML */
 970 static void
 971 parse_tree_level (GtkTreeStore *tree, xmlNodePtr *node, GtkTreeIter iter)
 972 {
 973     GtkTreeIter children, parent;
 974         xmlNodePtr newnode;
 975
 976         char *page_no = NULL;
 977         char *page_name = NULL;
 978         char *page_content = NULL;
 979         gboolean notes = FALSE;
 980
 981         debug_print (DB_DEBUG, "Decended\n");
 982         do
 983         {
 984                 gtk_tree_model_get (GTK_TREE_MODEL (tree), &iter,
 985                                 INFO_PARSER_COLUMN_PAGE_NO, &page_no,
 986                                 INFO_PARSER_COLUMN_PAGE_NAME, &page_name,
 987                                 INFO_PARSER_COLUMN_PAGE_CONTENT, &page_content,
 988                                 -1);
 989                 debug_print (DB_DEBUG, "Got Section: %s\n", page_name);
 990                 if (strstr (page_content, "*Note") ||
 991                     strstr (page_content, "*note")) {
 992                   notes = TRUE;
 993                 }
 994                 if (strstr (page_content, "* Menu:")) {
 995                   newnode = yelp_info_parse_menu (tree, node, page_content, notes);
 996                 } else {
 997                   newnode = xmlNewTextChild (*node, NULL,
 998                                              BAD_CAST "Section",
 999                                              NULL);
1000                   if (!notes)
1001                     info_body_text (newnode, NULL, NULL, FALSE, page_content);
1002
1003                   else {
1004                     /* Handle notes here */
1005                     info_process_text_notes (&newnode, page_content, tree);
1006                   }
1007                 }
1008                 /* if we free the page content, now it's in the XML, we can
1009                  * save some memory */
1010                 g_free (page_content);
1011                 page_content = NULL;
1012
1013                 if (gtk_tree_model_iter_parent (GTK_TREE_MODEL (tree), &parent, &iter)) {
1014                     gchar *parent_id;
1015                     gtk_tree_model_get (GTK_TREE_MODEL (tree), &parent,
1016                                         INFO_PARSER_COLUMN_PAGE_NO, &parent_id,
1017                                         -1);
1018                     xmlNewProp (newnode, BAD_CAST "up", BAD_CAST parent_id);
1019                     g_free (parent_id);
1020                 }
1021
1022                 xmlNewProp (newnode, BAD_CAST "id",
1023                             BAD_CAST page_no);
1024                 xmlNewProp (newnode, BAD_CAST "name",
1025                             BAD_CAST page_name);
1026                 if (gtk_tree_model_iter_children (GTK_TREE_MODEL (tree),
1027                                 &children,
1028                                 &iter))
1029                   parse_tree_level (tree, &newnode, children);
1030                 g_free (page_no);
1031                 g_free (page_name);
1032         }
1033         while (gtk_tree_model_iter_next (GTK_TREE_MODEL (tree), &iter));
1034         debug_print (DB_DEBUG, "Ascending\n");
1035 }
1036
1037 xmlDocPtr
1038 yelp_info_parser_parse_tree (GtkTreeStore *tree)
1039 {
1040         xmlDocPtr doc;
1041         xmlNodePtr node;
1042         GtkTreeIter iter;
1043
1044         /*
1045         xmlChar *xmlbuf;
1046         int bufsiz;
1047         */
1048
1049         doc = xmlNewDoc (BAD_CAST "1.0");
1050         node = xmlNewNode (NULL, BAD_CAST "Info");
1051         xmlDocSetRootElement (doc, node);
1052
1053         /* functions I will want:
1054         gtk_tree_model_get_iter_first;
1055         gtk_tree_model_iter_next;
1056         gtk_tree_model_iter_children;
1057         */
1058
1059         if (gtk_tree_model_get_iter_first (GTK_TREE_MODEL (tree), &iter))
1060                 parse_tree_level (tree, &node, iter);
1061         d (else debug_print (DB_DEBUG, "Empty tree?\n"));
1062
1063         /*
1064         xmlDocDumpFormatMemory (doc, &xmlbuf, &bufsiz, 1);
1065         g_print ("XML follows:\n%s\n", xmlbuf);
1066         */
1067
1068         return doc;
1069 }
1070
1071 gboolean
1072 resolve_frag_id (GtkTreeModel *model, GtkTreePath *path, GtkTreeIter *iter,
1073                  gpointer data)
1074 {
1075   gchar *page_no = NULL;
1076   gchar *page_name = NULL;
1077   gchar **xref = data;
1078
1079   gtk_tree_model_get (GTK_TREE_MODEL (model), iter,
1080                       INFO_PARSER_COLUMN_PAGE_NO, &page_no,
1081                       INFO_PARSER_COLUMN_PAGE_NAME, &page_name,
1082                       -1);
1083   if (g_str_equal (page_name, *xref)) {
1084     g_free (*xref);
1085     *xref = g_strdup (page_name);
1086     *xref = g_strdelimit (*xref, " ", '_');
1087
1088     g_free (page_name);
1089     g_free (page_no);
1090     return TRUE;
1091   }
1092   g_free (page_name);
1093   g_free (page_no);
1094
1095   return FALSE;
1096 }
1097
1098 gboolean
1099 get_menuoptions (gchar *line, gchar **title, gchar **ref, gchar **desc,
1100                  gchar **xref)
1101 {
1102   /* Since info is actually braindead and allows .s in
1103    * its references, we gotta carefully extract things
1104    * as .s can be in either the title or desc
1105    */
1106   gchar *tmp = line;
1107   gchar *tfind = NULL;
1108
1109   if (!g_str_has_prefix (line, "* "))
1110     return FALSE;
1111
1112   tfind = strchr (tmp, ':');
1113
1114   if (!tfind) /* No : on the line, bail out */
1115     return FALSE;
1116
1117   (*title) = g_strndup (tmp, tfind-tmp);
1118
1119   if (tfind[1] == ':') { /* This happens if the title and ref are the same
1120                          * Most menus are of this type
1121                          */
1122
1123     (*ref) = NULL; /* There is no second part.  The rest is description */
1124
1125     tmp++;
1126     (*xref) = g_strndup (tmp, tfind-tmp);
1127     g_strstrip (*xref);
1128
1129     tfind+=2;
1130     (*desc) = g_strdup (tfind);
1131   } else { /* The other type of menu option */
1132     gchar *td = NULL;
1133
1134     tfind++;
1135     td = strchr (tfind, '.');
1136     if (!td)
1137       return FALSE;
1138     (*ref) = g_strndup (tfind, td-tfind);
1139     (*xref) = g_strdup (*ref);
1140     g_strstrip (*xref);
1141
1142     td++;
1143     (*desc) = g_strdup (td);
1144   }
1145   return TRUE;
1146 }
1147
1148 /* Find the first non whitespace character in str or return pointer to the
1149  * '\0' if there isn't one. */
1150 static gchar*
1151 first_non_space (gchar* str)
1152 {
1153   /* As long as str is null terminated, this is ok! */
1154   while (g_ascii_isspace (*str)) str++;
1155   return str;
1156 }
1157
1158 xmlNodePtr
1159 yelp_info_parse_menu (GtkTreeStore *tree, xmlNodePtr *node,
1160                       gchar *page_content, gboolean notes)
1161 {
1162   gchar **split;
1163   gchar **menuitems;
1164   gchar *tmp = NULL;
1165   xmlNodePtr newnode, menu_node, mholder = NULL;
1166   int i=0;
1167
1168   split = g_strsplit (page_content, "* Menu:", 2);
1169
1170   newnode = xmlNewChild (*node, NULL,
1171                          BAD_CAST "Section", NULL);
1172
1173
1174   if (!notes)
1175     info_body_text (newnode, NULL, NULL, FALSE, split[0]);
1176   else {
1177     info_process_text_notes (&newnode, split[0], tree);
1178   }
1179
1180   menuitems = g_strsplit (split[1], "\n", -1);
1181   g_strfreev (split);
1182
1183   /* The output xml should look something like the following:
1184
1185      <menu>
1186        <menuholder>
1187          <a href="xref:Help-Inv">Help-Inv</a>
1188          <para1>Invisible text in Emacs Info.</para1>
1189        </menuholder>
1190        <menuholder>
1191          <a href="xref:Help-M">Help-M</a>
1192          <para1>Menus.</para1>
1193        </menuholder>
1194        ...
1195      </menu>
1196
1197      (from the top page of info:info). Note the absence of *'s and
1198      ::'s on the links.
1199
1200      If there's a line with no "* Blah::", it looks like a child of
1201      the previous menu item so (for i > 0) deal with that correctly by
1202      not "closing" the <menuholder> tag until we find the next
1203      start.
1204   */
1205
1206   if (menuitems[0] != NULL) {
1207     /* If there are any menu items, make the <menu> node */
1208     menu_node = xmlNewChild (newnode, NULL, BAD_CAST "menu", NULL);
1209   }
1210
1211   while (menuitems[i] != NULL) {
1212     gboolean menu = FALSE;
1213     gchar *title = NULL;
1214     gchar *ref = NULL;
1215     gchar *desc = NULL;
1216     gchar *xref = NULL;
1217     gchar *link_text = NULL;
1218     xmlNodePtr ref1;
1219
1220     menu = get_menuoptions (menuitems[i], &title, &ref, &desc, &xref);
1221
1222     if (menu && (*title == '\0' || *(title + 1) == '\0')) {
1223       g_warning ("Info title unexpectedly short for menu item (%s)",
1224                  menuitems[i]);
1225       menu = FALSE;
1226     }
1227
1228     if (menu) {
1229       mholder = xmlNewChild (menu_node, NULL, BAD_CAST "menuholder", NULL);
1230       gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &xref);
1231
1232       if (ref == NULL) { /* A standard type menu */
1233         /* title+2 skips the "* ". We know we haven't jumped over the
1234            end of the string because strlen (title) >= 3 */
1235         link_text = g_strdup (title+2);
1236
1237         ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
1238                                 BAD_CAST link_text);
1239
1240         tmp = g_strconcat ("xref:", xref, NULL);
1241         xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
1242         g_free (tmp);
1243       } else { /* Indexy type menu  - we gotta do a  little work to fix the
1244                 * spacing
1245                 */
1246         gchar *spacing = ref;
1247         gint c=0;
1248         gchar *sp = NULL;
1249
1250         while (*spacing ==' ') {
1251           c++;
1252           spacing++;
1253         }
1254         sp = g_strndup (ref, c);
1255
1256         link_text = g_strdup (title);
1257
1258         ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
1259                                 BAD_CAST link_text);
1260         tmp = g_strconcat ("xref:", xref, NULL);
1261         xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
1262         g_free (tmp);
1263         xmlNewTextChild (mholder, NULL, BAD_CAST "spacing",
1264                          BAD_CAST sp);
1265         tmp = g_strconcat (g_strstrip(ref), ".", NULL);
1266         ref1 = xmlNewTextChild (mholder, NULL, BAD_CAST "a",
1267                                 BAD_CAST tmp);
1268         g_free (tmp);
1269         tmp = g_strconcat ("xref:", xref, NULL);
1270         xmlNewProp (ref1, BAD_CAST "href", BAD_CAST tmp);
1271
1272         g_free (tmp);
1273         g_free (sp);
1274       }
1275
1276       tmp = g_strconcat ("\n", first_non_space (desc), NULL);
1277
1278       /*
1279         Don't print the link text a second time, because that looks
1280         really stupid.
1281
1282         We don't do a straight check for equality because lots of
1283         .info files have something like
1284
1285           * Foo::    Foo.
1286
1287         Obviously if the longer explanation has more afterwards, we
1288         don't want to omit it, which is why there's the strlen test.
1289       */
1290       if (strncmp (link_text, tmp + 1, strlen (link_text)) ||
1291           strlen (link_text) + 1 < strlen (tmp + 1)) {
1292         xmlNewTextChild (mholder, NULL,
1293                          BAD_CAST "para1", BAD_CAST tmp);
1294       }
1295
1296       g_free (tmp);
1297       g_free (link_text);
1298     }
1299     else if (*(menuitems[i]) != '\0') {
1300       tmp = g_strconcat ("\n", first_non_space (menuitems[i]), NULL);
1301       xmlNewTextChild (mholder ? mholder : menu_node,
1302                        NULL, BAD_CAST "para1",
1303                        BAD_CAST tmp);
1304       g_free (tmp);
1305     }
1306     i++;
1307     g_free (title);
1308     g_free (ref);
1309     g_free (desc);
1310     g_free (xref);
1311
1312   }
1313   g_strfreev (menuitems);
1314
1315   return newnode;
1316 }
1317
1318 void
1319 info_process_text_notes (xmlNodePtr *node, gchar *content, GtkTreeStore *tree)
1320 {
1321   gchar **notes;
1322   gchar **current;
1323   xmlNodePtr ref1;
1324   xmlNodePtr paragraph = NULL;
1325   gboolean first = TRUE;
1326
1327   /*
1328     Split using the regular expression
1329
1330       \*[Nn]ote(?!_)
1331
1332     which deals with either case and the last bit is a lookahead so
1333     that we don't split on things of the form *Note:_, which aren't
1334     real notes.
1335   */
1336   notes = g_regex_split_simple ("\\*[Nn]ote(?!_)", content, 0, 0);
1337
1338   for (current = notes; *current != NULL; current++) {
1339     gchar *url, **urls, **ulink;
1340     gchar *append;
1341     gchar *alt_append, *alt_append1;
1342     gchar *link_text;
1343     gchar *href = NULL;
1344     gchar *break_point = NULL;
1345     gboolean broken = FALSE;
1346     if (first) {
1347       /* The first node is special.  It doesn't have a note ref at the
1348        * start, so we can just add it and forget about it.
1349        */
1350       first = FALSE;
1351       info_body_text (*node, &paragraph, NULL, TRUE, (*current));
1352       continue;
1353     }
1354
1355     /* If we got to here, we now gotta parse the note reference */
1356     append = strchr (*current, ':');
1357     if (!append) {
1358       info_body_text (*node, &paragraph, NULL, TRUE, *current);
1359       continue;
1360     }
1361     append++;
1362     alt_append = append;
1363     alt_append1 = alt_append;
1364     append = strchr (append, ':');
1365     alt_append = strchr (alt_append, '.');
1366     if (alt_append && g_str_has_prefix (alt_append, ".info")) {
1367       broken = TRUE;
1368       alt_append++;
1369       alt_append = strchr (alt_append, '.');
1370     }
1371     alt_append1 = strchr (alt_append1, ',');
1372     if (!append && !alt_append && !alt_append1) {
1373       info_body_text (*node, &paragraph, NULL, TRUE, *current);
1374       continue;
1375     }
1376     if (!append || alt_append || alt_append1) {
1377       if (!append) {
1378         if (alt_append) append = alt_append;
1379         else append = alt_append1;
1380       }
1381       if ((alt_append && alt_append < append))
1382         append = alt_append;
1383       if (alt_append1 && alt_append1 < append)
1384         append = alt_append1;
1385     }
1386     append++;
1387     url = g_strndup (*current, append - (*current));
1388
1389     /* Save a copy of the unadulterated link text for later. */
1390     link_text = g_strconcat ("*Note", url, NULL);
1391
1392     /* By now, we got 2 things.  First, is append which is the (hopefully)
1393      * non-link text.  Second, we got a url.
1394      * The url can be in several forms:
1395      * 1. linkend::
1396      * 2. linkend:(infofile)Linkend.
1397      * 3. Title: Linkend.
1398      * 4. Title: Linkend, (pretty sure this is just broken)
1399      * 5. Title: (infofile.info)Linkend.
1400      * All possibilities should have been picked up.
1401      * Here:
1402      * Clean up the split.  Should be left with a real url and
1403      * a list of fragments that should be linked
1404      * Also goes through and removes extra spaces, leaving only one
1405      * space in place of many
1406      */
1407     urls = g_strsplit (url, "\n", -1);
1408     break_point = strchr (url, '\n');
1409     while (break_point) {
1410       *break_point = ' ';
1411       break_point = strchr (++break_point, '\n');
1412     }
1413     break_point = strchr (url, ' ');
1414     while (break_point) {
1415       if (*(break_point+1) == ' ') {
1416         /* Massive space.  Fix. */
1417         gchar *next = break_point;
1418         gchar *url_copy;
1419         gchar *old = url;
1420         while (*next == ' ')
1421           next++;
1422         next--;
1423         url_copy = g_strndup (url, break_point-url);
1424         url = g_strconcat (url_copy, next, NULL);
1425         g_free (old);
1426         break_point = strchr (url, ' ');
1427         g_free (url_copy);
1428       } else {
1429         break_point++;
1430         break_point = strchr (break_point, ' ');
1431       }
1432     }
1433     if (url[strlen(url)-1] == '.') { /* The 2nd or 3rd sort of link */
1434       gchar *stop = NULL;
1435       gchar *lurl = NULL;
1436       gchar *zloc = NULL;
1437       stop = strchr (url, ':');
1438       lurl = strchr (stop, '(');
1439       if (!lurl) { /* 3rd type of link */
1440         gchar *link;
1441         gint length;
1442         stop++;
1443         link = g_strdup (stop);
1444         link = g_strstrip (link);
1445         length = strlen (link) - 1;
1446         link[length] = '\0';
1447         href = g_strconcat ("xref:", link, NULL);
1448         link[length] = 'a';
1449         g_free (link);
1450
1451
1452       } else { /* 2nd type of link.  Easy. Provided .info is neglected ;) */
1453         if (broken) {
1454           gchar *new_url;
1455           gchar *info;
1456           gchar *stripped;
1457
1458           new_url = g_strdup (lurl);
1459           info = strstr (new_url, ".info)");
1460           stripped = g_strndup (new_url, info-new_url);
1461           info +=5;
1462           lurl = g_strconcat (stripped, info, NULL);
1463           g_free (stripped);
1464           g_free (new_url);
1465         }
1466         zloc = &(lurl[strlen(lurl)-1]);
1467         *zloc = '\0';
1468         href = g_strconcat ("info:", lurl, NULL);
1469         *zloc = 'a';
1470       }
1471     } else { /* First kind of link */
1472       gchar *tmp1;
1473       gchar *frag;
1474
1475       tmp1 = strchr (url, ':');
1476       if (!tmp1)
1477         frag = g_strdup (url);
1478       else
1479         frag = g_strndup (url, tmp1 - url);
1480       g_strstrip (frag);
1481       gtk_tree_model_foreach (GTK_TREE_MODEL (tree), resolve_frag_id, &frag);
1482       href = g_strconcat ("xref:", frag, NULL);
1483       g_free (frag);
1484     }
1485
1486     /* Check we've got a valid paragraph node */
1487     if (!paragraph) {
1488       paragraph = xmlNewChild (*node, NULL, BAD_CAST "para", NULL);
1489     }
1490
1491     /*
1492       Now we're supposed to actually render the link. I have a list of
1493       bits of URL and actually this is really easy - I want to have
1494       the link *text* exactly the same as it appeared in the .info
1495       file, so don't use the list of strings urls, instead use the
1496       whole lot: url (complete with embedded newlines etc.)
1497     */
1498     ref1 = xmlNewTextChild (paragraph, NULL, BAD_CAST "a",
1499                             BAD_CAST link_text);
1500     g_free (link_text);
1501     xmlNewProp (ref1, BAD_CAST "href", BAD_CAST href);
1502
1503     g_strfreev (urls);
1504
1505     /* Finally, we can add the following text as required */
1506     info_body_text (*node, &paragraph, NULL, TRUE, append);
1507
1508     g_free (url);
1509     g_free (href);
1510   }
1511   g_strfreev (notes);
1512 }