libyelp/yelp-man-parser.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 4 -*- */
   2 /*
   3  * Copyright (C) 2003-2010 Shaun McCance <shaunm@gnome.org>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public
  16  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  17  *
  18  * Author: Shaun McCance <shaunm@gnome.org>
  19  */
  20
  21 #ifdef HAVE_CONFIG_H
  22 #include <config.h>
  23 #endif
  24
  25 #include <glib.h>
  26 #include <glib/gi18n.h>
  27 #include <libxml/tree.h>
  28 #include <libxml/xpath.h>
  29 #include <gio/gio.h>
  30 #include <gio/gunixinputstream.h>
  31 #include <string.h>
  32 #include <math.h>
  33
  34 #include "yelp-error.h"
  35 #include "yelp-man-parser.h"
  36
  37 #define MAN_FONTS 8
  38
  39 /* The format has two copies of the title like MAN(1) at the top,
  40  * possibly with a string of text in between for the collection.
  41  *
  42  * Start with the parser on START, then HAVE_TITLE when we've read the
  43  * first word with parentheses. At that point, stick new words into
  44  * the "collection" tag. Then finally switch to BODY when we've seen
  45  * the second copy of the one with parentheses.
  46  */
typedef enum ManParserState
{
    START,       /* Initial state: still accumulating the first "Name(section)" title */
    HAVE_TITLE,  /* First title seen; text now accumulates as the collection string */
    BODY         /* Header finished (switched on a newline); parsing body text */
} ManParserState;
  53
  54 /* See parse_body_text for how this is used. */
typedef enum ManParserSectionState
{
    SECTION_TITLE,  /* Accumulating a section title until the next newline */
    SECTION_BODY    /* Inside the body of a section (or ready to start a new one) */
} ManParserSectionState;
  60
/* All of the state carried while converting one stream of groff
 * intermediate output into an XML document. Instances are created
 * zero-filled by yelp_man_parser_new, so every field starts at 0/NULL
 * apart from the accumulator.
 */
struct _YelpManParser {
    xmlDocPtr     doc;           /* The top-level XML document */
    xmlNodePtr    header;        /* The header node */
    xmlNodePtr    section_node;  /* The current section */
    xmlNodePtr    sheet_node;    /* The current sheet */

    GDataInputStream *stream;    /* The GIO input stream to read from */
    gchar            *buffer;    /* The buffer, line at a time */
    gsize             length;    /* The buffer length */

    gchar            *section;   /* The name of the current section */

    /* The width and height of a character according to troff (read
     * from the second line of the output, see parse_prologue_line). */
    guint char_width;
    guint char_height;

    /* Count the number of lines we've parsed (needed to get prologue) */
    guint line_no;

    /* The "x f k name" command sets the k'th register to be name.
     * Each entry is an owned copy of the name, or NULL if unset. */
    gchar* font_registers[MAN_FONTS];

    /* The current font. Should be the index of one of the
     * font_registers. Starts at 0 (of course!)
     */
    guint current_font;

    /* See description of ManParserState above */
    ManParserState state;

    /* Vertical and horizontal position as far as the troff output is
     * concerned. (Measured from top-left).
     */
    guint vpos, hpos;

    /* Text accumulator (needed since it comes through in dribs &
     * drabs...) */
    GString *accumulator;

    /* See parse_body_text for how this is used. */
    ManParserSectionState section_state;

    /* The indent of the current sheet */
    guint sheet_indent;

    /* Set to TRUE if there's been a newline since the last text was
     * parsed. */
    gboolean newline;

    /* Count the number of 'N' lines we've seen since the last h
     * command. This is because for some reason N doesn't
     * automatically move the position forward. Thus immediately after
     * one, you see a h24 or the like. Unless there's a space. Then it
     * might be wh48. This is set in parse_N (and also bumped in
     * parse_C) and used in parse_h.
     */
    guint N_count;

    /* Keep track of whether the last character was a space. We can't
     * just do this by looking at the last char of accumulator,
     * because if there's a font change, it gets zeroed. This gets set
     * to TRUE by parse_w and is FALSE the rest of the time.
     */
    gboolean last_char_was_space;

    /* Keep track of the size of the last vertical jump - used to tell
     * whether we need to insert extra space above a line.
     */
    gint last_vertical_jump;

    /* The title we read earlier (eg 'Foo(2)') */
    gchar *title_str;
};
 134
/* Dispatches the line currently held in parser->buffer. */
static gboolean parser_parse_line (YelpManParser *parser, GError **error);
/* Handles the first three lines of output (the troff prologue). */
static gboolean parse_prologue_line (YelpManParser *parser, GError **error);

/* Parsers for different types of line. Each one reads parser->buffer
 * and returns FALSE (with *error set) when the line can't be parsed.
 */
typedef gboolean (*LineParser)(YelpManParser *, GError **);
/* Note that the expansion already ends in a semicolon, so the
 * invocations below must not add one. */
#define DECLARE_LINE_PARSER(name) \
    static gboolean (name) (YelpManParser *parser, GError **error);

DECLARE_LINE_PARSER (parse_xf)
DECLARE_LINE_PARSER (parse_f)
DECLARE_LINE_PARSER (parse_V)
DECLARE_LINE_PARSER (parse_H)
DECLARE_LINE_PARSER (parse_v)
DECLARE_LINE_PARSER (parse_h)
DECLARE_LINE_PARSER (parse_text)
DECLARE_LINE_PARSER (parse_w)
DECLARE_LINE_PARSER (parse_body_text)
DECLARE_LINE_PARSER (parse_n)
DECLARE_LINE_PARSER (parse_N)
DECLARE_LINE_PARSER (parse_C)
DECLARE_LINE_PARSER (parse_p)
 156
 157 /* Declare a sort of alist registry of parsers for different lines. */
/* Associates a line prefix with the function that parses such lines. */
struct LineParsePair
{
    const gchar *prefix;   /* Literal text the line must start with */
    LineParser handler;    /* Parser to call for a matching line */
};
/* Searched in order by parser_parse_line; the first entry whose prefix
 * matches the start of the line wins. The { NULL, NULL } entry
 * terminates the table.
 */
static struct LineParsePair line_parsers[] = {
    { "x f", parse_xf }, { "f", parse_f },
    { "V", parse_V }, { "H", parse_H },
    { "v", parse_v }, { "h", parse_h },
    { "t", parse_text },
    { "w", parse_w },
    { "n", parse_n },
    { "N", parse_N },
    { "C", parse_C },
    { "p", parse_p },
    { NULL, NULL }
};
 175
 176 /******************************************************************************/
 177 /* Parser helper functions (managing the state of the various parsing
 178  * bits) */
/* Flushes the accumulator into a <span> on the current sheet. */
static void finish_span (YelpManParser *parser);
/* Converts a horizontal distance in troff units to a character count. */
static guint dx_to_em_count (YelpManParser *parser, guint dx);
/* Appends k UTF-8 non-breaking spaces to the accumulator. */
static void append_nbsps (YelpManParser *parser, guint k);
/* Starts a new sheet where needed when text begins a new line. */
static void deal_with_newlines (YelpManParser *parser);
static void new_sheet (YelpManParser *parser);
static void register_title (YelpManParser *parser,
                            const gchar* name, const gchar* section);
static void right_truncate_common (gchar *dst, const gchar *src);
/* Used by parse_N and parse_C to feed a synthesised piece of text
 * back through the text parser. */
static gboolean cheeky_call_parse_line (YelpManParser *parser,
                                        GError **error,
                                        gchar first_char,
                                        const gchar *text);
/* Called once by yelp_man_parser_parse_file after the last line. */
static void cleanup_parsed_page (YelpManParser *parser);
static gboolean parse_last_line (YelpManParser *parser, gchar* line);
static void unicode_strstrip (gchar *str);
 194
 195 /*
 196   A link_inserter takes
 197     (1) an array of offsets for the different spans within the string
 198     (2) the match info from the regex match
 199
 200   It's then responsible for mangling the XML tree to insert the actual
 201   link. Finally, it should return the offset into the string of the
 202   end of what it's just dealt with. If necessary, it should also fix
 203   up offsets to point correctly at the last node inserted.
 204  */
/* One span of text within the string being searched for links.
 * NOTE(review): start/end are presumably offsets into that string and
 * elt the XML node holding the span - confirm against fixup_links.
 */
typedef struct {
    gsize      start, end;
    xmlNodePtr elt;
} offset_elt_pair;

/* Callback type described in the comment above: receives the span
 * offsets and the regex match, returns the offset it finished at. */
typedef gsize (*link_inserter)(offset_elt_pair *,
                               const GMatchInfo *);

static void fixup_links (YelpManParser *parser,
                         const GRegex *matcher,
                         link_inserter inserter);

/* The two link_inserter implementations declared for this file. */
static gsize man_link_inserter (offset_elt_pair *offsets,
                                const GMatchInfo *match_info);
static gsize http_link_inserter (offset_elt_pair *offsets,
                                 const GMatchInfo *match_info);
 221
 222 /******************************************************************************/
 223 /* Translations for the 'C' command. This is indeed hackish, but the
 224  * -Tutf8 output doesn't seem to give include files so we can do this
 225  * at runtime :-(
 226  *
 227  * On my machine, this data's at /usr/share/groff/current/tmac/ in
 228  * latin1.tmac, unicode.tmac and I worked out the lq and rq from
 229  * running man: I'm not sure where that comes from!
 230  */
/* Maps a troff special character name (as it appears after 'C' in the
 * output) to the Unicode code point it stands for. */
struct StringPair
{
    const gchar *from;
    gunichar to;
};
/* Lookup table used by parse_C; terminated by a { NULL, 0 } entry. */
static const struct StringPair char_translations[] = {
    { "r!", 161 },
    { "ct", 162 },
    { "Po", 163 },
    { "Cs", 164 },
    { "Ye", 165 },
    { "bb", 166 },
    { "sc", 167 },
    { "ad", 168 },
    { "co", 169 },
    { "Of", 170 },
    { "Fo", 171 },
    { "tno", 172 },
    { "%", 173 },
    { "rg", 174 },
    { "a-", 175 },
    { "de", 176 },
    { "t+-", 177 },
    { "S2", 178 },
    { "S3", 179 },
    { "aa", 180 },
    { "mc", 181 },
    { "ps", 182 },
    { "pc", 183 },
    { "ac", 184 },
    { "S1", 185 },
    { "Om", 186 },
    { "Fc", 187 },
    { "14", 188 },
    { "12", 189 },
    { "34", 190 },
    { "r?", 191 },
    { "`A", 192 },
    { "'A", 193 },
    { "^A", 194 },
    { "~A", 195 },
    { ":A", 196 },
    { "oA", 197 },
    { "AE", 198 },
    { ",C", 199 },
    { "`E", 200 },
    { "'E", 201 },
    { "^E", 202 },
    { ":E", 203 },
    { "`I", 204 },
    { "'I", 205 },
    { "^I", 206 },
    { ":I", 207 },
    { "-D", 208 },
    { "~N", 209 },
    { "`O", 210 },
    { "'O", 211 },
    { "^O", 212 },
    { "~O", 213 },
    { ":O", 214 },
    { "tmu", 215 },
    { "/O", 216 },
    { "`U", 217 },
    { "'U", 218 },
    { "^U", 219 },
    { ":U", 220 },
    { "'Y", 221 },
    { "TP", 222 },
    { "ss", 223 },
    { "`a", 224 },
    { "'a", 225 },
    { "^a", 226 },
    { "~a", 227 },
    { ":a", 228 },
    { "oa", 229 },
    { "ae", 230 },
    { ",c", 231 },
    { "`e", 232 },
    { "'e", 233 },
    { "^e", 234 },
    { ":e", 235 },
    { "`i", 236 },
    { "'i", 237 },
    { "^i", 238 },
    { ":i", 239 },
    { "Sd", 240 },
    { "~n", 241 },
    { "`o", 242 },
    { "'o", 243 },
    { "^o", 244 },
    { "~o", 245 },
    { ":o", 246 },
    { "tdi", 247 },
    { "/o", 248 },
    { "`u", 249 },
    { "'u", 250 },
    { "^u", 251 },
    { ":u", 252 },
    { "'y", 253 },
    { "Tp", 254 },
    { ":y", 255 },
    { "hy", '-' },
    { "oq", '`' },
    { "cq", '\'' },
    { "lq", 8220 }, // left smart quotes
    { "rq", 8221 }, // right smart quotes
    { "en", 8211 }, // en-dash
    { "em", 8212 }, // em-dash
    { "la", 10216 }, // left angle bracket
    { "ra", 10217 }, // right angle bracket
    { "rs", '\\' },
    { "<=", 8804 }, // < or equal to sign
    { ">=", 8805 }, // > or equal to sign
    { "aq", '\'' },
    { "tm", 8482 }, // trademark symbol
    { NULL, 0 }
};
 348
 349 /******************************************************************************/
 350
 351 YelpManParser *
 352 yelp_man_parser_new (void)
 353 {
 354     YelpManParser *parser = g_new0 (YelpManParser, 1);
 355     parser->accumulator = g_string_sized_new (1024);
 356     return parser;
 357 }
 358
 359 /*
 360   This function is responsible for taking a path to a man file and
 361   returning something in the groff intermediate output format for us
 362   to use.
 363
 364   If something goes wrong, we return NULL and set error to be a
 365   YelpError describing the problem.
 366 */
 367 static GInputStream*
 368 get_troff (gchar *path, GError **error)
 369 {
 370     gint ystdout;
 371     GError *err = NULL;
 372     const gchar *argv[] = { "man", "-Z", "-Tutf8", "-EUTF-8", path, NULL };
 373     gchar **my_argv;
 374
 375     /* g_strdupv() should accept a "const gchar **". */
 376     my_argv = g_strdupv ((gchar **) argv);
 377
 378     if (!g_spawn_async_with_pipes (NULL, my_argv, NULL,
 379                                    G_SPAWN_SEARCH_PATH, NULL, NULL,
 380                                    NULL, NULL, &ystdout, NULL, &err)) {
 381         /* We failed to run the man program. Return a "Huh?" error. */
 382         *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN,
 383                               "%s", err->message);
 384         g_error_free (err);
 385         g_strfreev (my_argv);
 386         return NULL;
 387     }
 388
 389     g_strfreev (my_argv);
 390
 391     return (GInputStream*) g_unix_input_stream_new (ystdout, TRUE);
 392 }
 393
 394 xmlDocPtr
 395 yelp_man_parser_parse_file (YelpManParser *parser,
 396                             gchar *path,
 397                             GError **error)
 398 {
 399     GInputStream *troff_stream;
 400     gboolean ret;
 401     xmlNodePtr root;
 402
 403     troff_stream = get_troff (path, error);
 404     if (!troff_stream) return NULL;
 405
 406     parser->stream = g_data_input_stream_new (troff_stream);
 407
 408     parser->doc = xmlNewDoc (BAD_CAST "1.0");
 409     root = xmlNewNode (NULL, BAD_CAST "Man");
 410     xmlDocSetRootElement (parser->doc, root);
 411
 412     parser->header = xmlNewNode (NULL, BAD_CAST "header");
 413     xmlAddChild (root, parser->header);
 414
 415     while (1) {
 416        parser->buffer =
 417        g_data_input_stream_read_line (parser->stream,
 418                                       &(parser->length),
 419                                       NULL, NULL);
 420        if (parser->buffer == NULL) break;
 421
 422        parser->line_no++;
 423        ret = parser_parse_line (parser, error);
 424
 425        g_free (parser->buffer);
 426
 427        if (!ret) {
 428            xmlFreeDoc (parser->doc);
 429            parser->doc = NULL;
 430            break;
 431        }
 432     }
 433
 434     cleanup_parsed_page (parser);
 435
 436     g_object_unref (parser->stream);
 437
 438     return parser->doc;
 439 }
 440
 441 void
 442 yelp_man_parser_free (YelpManParser *parser)
 443 {
 444     guint k;
 445     if (parser) {
 446         for (k=0; k<MAN_FONTS; k++)
 447             g_free (parser->font_registers[k]);
 448     }
 449     g_string_free (parser->accumulator, TRUE);
 450     g_free (parser->title_str);
 451     g_free (parser->section);
 452     g_free (parser);
 453 }
 454
 455 /******************************************************************************/
 456
 457 /* Sets the k'th font register to be name. Copies name, so free it
 458  * afterwards. k should be in [0,MAN_FONTS). It seems that man always
 459  * gives us ones at least 1, but groff_out(5) says non-negative.
 460  */
 461 static void
 462 set_font_register (YelpManParser *parser, guint k, const gchar* name)
 463 {
 464     if (k > MAN_FONTS) {
 465         g_warning ("Tried to set nonexistant font register %u to %s",
 466                    k, name);
 467         return;
 468     }
 469     g_free (parser->font_registers[k]);
 470     parser->font_registers[k] = g_strdup (name);
 471 }
 472
 473 static const gchar*
 474 get_font (const YelpManParser *parser)
 475 {
 476     guint k = parser->current_font;
 477     if (k > MAN_FONTS ||
 478         parser->font_registers[k] == NULL) {
 479
 480         g_warning ("Tried to get nonexistant font register %u", k);
 481
 482         return "";
 483     }
 484
 485     return parser->font_registers[k];
 486 }
 487
 488 /******************************************************************************/
 489
/*
  Convenience macros to scan a string, checking for the correct number
  of things read. All of them expect a variable called parser to be in
  scope; RAISE_PARSE_ERROR also expects a GError** called error and
  dereferences it unconditionally.

  SSCANF scans parser->buffer and evaluates to TRUE when the scan
  FAILED, i.e. when the number of items read is not num.

  PARSE_ERROR builds a new GError. Add an %s to the end of the format
  string, which automatically gets given parser->buffer.

  RAISE_PARSE_ERROR stores such an error in *error and returns FALSE
  from the enclosing function. It expands to a bare { ... } block.
 */
#define SSCANF(fmt,num,...)                                 \
    (sscanf (parser->buffer, (fmt), __VA_ARGS__) != (num))

#define PARSE_ERROR(...)                                    \
    g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING,         \
                 __VA_ARGS__, parser->buffer)
#define RAISE_PARSE_ERROR(...)                              \
    { *error = PARSE_ERROR (__VA_ARGS__); return FALSE; }
 505
 506 static gboolean
 507 parser_parse_line (YelpManParser *parser, GError **error)
 508 {
 509     const struct LineParsePair *p;
 510
 511     if (parser->line_no <= 3)
 512         return parse_prologue_line (parser, error);
 513
 514     p = line_parsers;
 515     while (p->handler != NULL) {
 516         if (g_str_has_prefix (parser->buffer, p->prefix)) {
 517             return p->handler(parser, error);
 518         }
 519         p++;
 520     }
 521     return TRUE;
 522 }
 523
 524 static gboolean
 525 parse_prologue_line (YelpManParser *parser, GError **error)
 526 {
 527     if (parser->line_no != 2) return TRUE;
 528
 529     /* This is the interesting line, which should look like
 530               x res 240 24 40
 531        The interesting bits are the 24 and the 40, which are the
 532        width and height of a character as far as -Tutf8 is
 533        concerned.
 534     */
 535     if (SSCANF ("x %*s %*u %u %u", 2,
 536                 &parser->char_width, &parser->char_height)) {
 537         RAISE_PARSE_ERROR ("Wrong 'x res' line from troff: %s");
 538     }
 539
 540     return TRUE;
 541 }
 542
 543 static gboolean
 544 parse_xf (YelpManParser *parser, GError **error)
 545 {
 546     gchar name[10];
 547     guint k;
 548
 549     if (SSCANF ("x f%*s %u %10s", 2, &k, name)) {
 550         RAISE_PARSE_ERROR ("Invalid 'x f' line from troff: %s");
 551     }
 552     set_font_register (parser, k, name);
 553     return TRUE;
 554 }
 555
 556 static gboolean
 557 parse_f (YelpManParser *parser, GError **error)
 558 {
 559     guint k;
 560     if (SSCANF ("f%u", 1, &k)) {
 561         RAISE_PARSE_ERROR ("Invalid font line from troff: %s");
 562     }
 563     finish_span (parser);
 564
 565     parser->current_font = k;
 566
 567     return TRUE;
 568 }
 569
 570 static gboolean
 571 parse_v (YelpManParser *parser, GError **error)
 572 {
 573     guint dy;
 574     if (SSCANF ("v%u", 1, &dy)) {
 575         RAISE_PARSE_ERROR ("Invalid v line from troff: %s");
 576     }
 577     parser->last_vertical_jump += dy;
 578     parser->vpos += dy;
 579     return TRUE;
 580 }
 581
 582 static gboolean
 583 parse_h (YelpManParser *parser, GError **error)
 584 {
 585     guint dx;
 586     int k;
 587
 588     if (SSCANF ("h%u", 1, &dx)) {
 589         RAISE_PARSE_ERROR ("Invalid h line from troff: %s");
 590     }
 591     parser->hpos += dx;
 592
 593     /* This is a bit hackish to be honest but... if we're in something
 594      * that'll end up in a span, a spacing h command means that a gap
 595      * should appear. It seems that the easiest way to get this is to
 596      * insert nonbreaking spaces (eugh!)
 597      *
 598      * Of course we don't want to do this when chained from wh24 or
 599      * whatever, so use the last_char_was_space flag
 600      * but... unfortunately some documents actually use stuff like
 601      * wh96 for spacing (eg the lists in perl(1)). So (very hackish!),
 602      * ignore double spaces, since that's probably just been put in to
 603      * make the text justified (eugh), but allow bigger jumps.
 604      *
 605      * Incidentally, the perl manual here has bizarre gaps in the
 606      * synopsis section. God knows why, but man displays them too so
 607      * it's not our fault! :-)
 608      */
 609     k = dx_to_em_count (parser, dx);
 610
 611     if ((!parser->last_char_was_space) || (k > 2)) {
 612
 613         k -= parser->N_count;
 614         if (k < 0) k = 0;
 615
 616         append_nbsps (parser, k);
 617     }
 618
 619     parser->N_count = 0;
 620
 621     return TRUE;
 622 }
 623
 624 static gboolean
 625 parse_V (YelpManParser *parser, GError **error)
 626 {
 627     guint y;
 628     if (SSCANF ("V%u", 1, &y)) {
 629         RAISE_PARSE_ERROR ("Invalid V line from troff: %s");
 630     }
 631     parser->last_vertical_jump += y - parser->vpos;
 632     parser->vpos = y;
 633     return TRUE;
 634 }
 635
 636 static gboolean
 637 parse_H (YelpManParser *parser, GError **error)
 638 {
 639     guint x;
 640     if (SSCANF ("H%u", 1, &x)) {
 641         RAISE_PARSE_ERROR ("Invalid H line from troff: %s");
 642     }
 643     parser->hpos = x;
 644     return TRUE;
 645 }
 646
 647 static gboolean
 648 parse_text (YelpManParser *parser, GError **error)
 649 {
 650     gchar *text, *section, *tmp;
 651     const gchar *acc;
 652
 653     /*
 654       Sneakily, this might get called with something other than t
 655       starting the buffer: see parse_C and parse_N.
 656     */
 657     if (parser->buffer[0] == 't') {
 658         parser->N_count = 0;
 659     }
 660
 661     if (parser->state == START) {
 662         /* This should be the 'Title String(1)' line. It might come in
 663          * chunks (for example, it might be more than one line
 664          * long!). So just read bits until we get a (blah) bit: stick
 665          * everything in the accumulator and check for
 666          * parentheses. When we've got some, stick the parsed title in
 667          * the header and switch to HAVE_TITLE.
 668          *
 669          * The parse_n code will error out if we didn't manage to get
 670          * a title before the first newline and otherwise is in charge
 671          * of switching to body-parsing mode.
 672          */
 673         g_string_append (parser->accumulator, parser->buffer+1);
 674
 675         acc = parser->accumulator->str;
 676
 677         section = strchr (acc, '(');
 678
 679         if (section) {
 680             section++;
 681             tmp = strchr (section, ')');
 682         }
 683
 684         if (section && tmp) {
 685             /* We've got 'Blah (3)' or the like in the accumulator */
 686             if (*(tmp+1) != '\0') {
 687                 RAISE_PARSE_ERROR ("Don't understand title line: '%s'");
 688             }
 689             parser->state = HAVE_TITLE;
 690             parser->title_str = g_strdup (acc);
 691
 692             text = g_strndup (acc, (section - 1) - acc);
 693             section = g_strndup (section, tmp - section);
 694
 695             register_title (parser, text, section);
 696
 697             g_string_truncate (parser->accumulator, 0);
 698
 699             g_free (text);
 700             parser->section = section;
 701         }
 702
 703         return TRUE;
 704     }
 705
 706     if (parser->state == BODY)
 707         return parse_body_text (parser, error);
 708
 709     /* In state HAVE_TITLE */
 710     else {
 711         /* We expect (maybe!) to get some lines in between the two
 712          * occurrences of the title itself. So collect up all the text
 713          * we get and then we'll remove the copy of the title at the
 714          * end (hopefully) when we find a newline in parse_n.
 715          */
 716         g_string_append (parser->accumulator, parser->buffer+1);
 717         return TRUE;
 718     }
 719 }
 720
 721 static gboolean
 722 parse_body_text (YelpManParser *parser, GError **error)
 723 {
 724     /*
 725       It's this function which is responsible for trying to get *some*
 726       semantic information back out of the manual page.
 727
 728       The highest-level chopping up is into sections. We use the
 729       heuristic that if either
 730         (1) We haven't got a section yet or
 731         (2) text starts a line (hpos=0)
 732       then it's a section title.
 733
 734       It's possible to have spaces in section titles, so we carry on
 735       accumulating the section title until the next newline.
 736     */
 737     if (parser->section_state == SECTION_BODY &&
 738         (!parser->section_node || (parser->hpos == 0))) {
 739         g_string_truncate (parser->accumulator, 0);
 740         /* End the current sheet & section */
 741         parser->section_state = SECTION_TITLE;
 742         parser->sheet_node = NULL;
 743
 744         parser->section_node =
 745             xmlAddChild (xmlDocGetRootElement (parser->doc),
 746                          xmlNewNode (NULL, BAD_CAST "section"));
 747     }
 748
 749     if (parser->section_state != SECTION_TITLE) {
 750         deal_with_newlines (parser);
 751     }
 752
 753     g_string_append (parser->accumulator, parser->buffer+1);
 754
 755     /* Move hpos forward per char */
 756     parser->hpos += strlen (parser->buffer+1) * parser->char_width;
 757
 758     parser->last_char_was_space = FALSE;
 759
 760     return TRUE;
 761 }
 762
 763 /*
 764   w is a sort of prefix argument. It indicates a space, so we register
 765   that here, then call parser_parse_line again on the rest of the
 766   string to deal with that.
 767  */
 768 static gboolean
 769 parse_w (YelpManParser *parser, GError **error)
 770 {
 771     gboolean ret;
 772
 773     if (parser->state != START) {
 774         g_string_append_c (parser->accumulator, ' ');
 775     }
 776
 777     parser->buffer++;
 778     parser->last_char_was_space = TRUE;
 779
 780     ret = parser_parse_line (parser, error);
 781
 782     parser->buffer--;
 783     return ret;
 784 }
 785
 786 static gboolean
 787 parse_n (YelpManParser *parser, GError **error)
 788 {
 789     xmlNodePtr node;
 790
 791     /* When we're in the header, the parse_n is responsible for
 792      * switching to body text. (See the body of parse_text() for more
 793      * of an explanation).
 794      */
 795     if (parser->state == START) {
 796         /* Oh no! We've not got a proper title yet! Ho hum, let's
 797            stick whatever's going into a 'title title' and have a null
 798            section. Sob.
 799         */
 800         register_title (parser,
 801                         parser->accumulator->str,
 802                         "unknown section");
 803         g_string_truncate (parser->accumulator, 0);
 804         parser->state = BODY;
 805         return TRUE;
 806     }
 807
 808     if (parser->state == HAVE_TITLE) {
 809         /* What we've got so far is the manual's collection, followed
 810            by the title again. So we want to get rid of the latter if
 811            possible...
 812         */
 813         right_truncate_common (parser->accumulator->str,
 814                                parser->title_str);
 815         unicode_strstrip (parser->accumulator->str);
 816
 817         xmlNewTextChild (parser->header,
 818                          NULL, BAD_CAST "collection",
 819                          BAD_CAST parser->accumulator->str);
 820         g_string_truncate (parser->accumulator, 0);
 821         parser->state = BODY;
 822         parser->section_state = SECTION_BODY;
 823         return TRUE;
 824     }
 825
 826     /* parser->state == BODY */
 827     if (parser->section_state == SECTION_TITLE) {
 828
 829         g_strchomp (parser->accumulator->str);
 830         xmlNewTextChild (parser->section_node, NULL,
 831                          BAD_CAST "title",
 832                          BAD_CAST parser->accumulator->str);
 833         g_string_truncate (parser->accumulator, 0);
 834
 835         parser->section_state = SECTION_BODY;
 836     }
 837     else if (parser->sheet_node != NULL) {
 838         /*
 839           In the body of a section, when we get to a newline we should
 840           have an accumulator with text in it and a non-null sheet
 841           (hopefully!).
 842
 843           We know the current font, so add a span for that font
 844           containing the relevant text. Then add a <br/> tag.
 845         */
 846         finish_span (parser);
 847         node = xmlNewNode (NULL, BAD_CAST "br");
 848         xmlAddChild (parser->sheet_node, node);
 849     }
 850
 851     parser->newline = TRUE;
 852     parser->last_char_was_space = FALSE;
 853
 854     return TRUE;
 855 }
 856
 857 static void
 858 finish_span (YelpManParser *parser)
 859 {
 860     xmlNodePtr node;
 861
 862     if (parser->accumulator->str[0] != '\0') {
 863         node = xmlNewTextChild (parser->sheet_node, NULL,
 864                                 BAD_CAST "span",
 865                                 BAD_CAST parser->accumulator->str);
 866         xmlNewProp (node, BAD_CAST "class",
 867                     BAD_CAST get_font (parser));
 868         g_string_truncate (parser->accumulator, 0);
 869     }
 870 }
 871
 872 static guint
 873 dx_to_em_count (YelpManParser *parser, guint dx)
 874 {
 875     return (int)(dx / ((float)parser->char_width));
 876 }
 877
 878 static gboolean
 879 parse_N (YelpManParser *parser, GError **error)
 880 {
 881     gint n;
 882     gchar tmp[2];
 883
 884     if (SSCANF ("N%i", 1, &n)) {
 885         RAISE_PARSE_ERROR ("Strange format for N line: %s");
 886     }
 887     if (n > 127) {
 888         RAISE_PARSE_ERROR ("N line has non-7-bit character: %s");
 889     }
 890     if (n < -200) {
 891         RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s");
 892     }
 893
 894     if (n < 0) {
 895         append_nbsps (parser, -n);
 896         parser->N_count += -n;
 897         return TRUE;
 898     }
 899
 900     parser->N_count++;
 901
 902     tmp[0] = (gchar)n;
 903     tmp[1] = '\0';
 904
 905     return cheeky_call_parse_line (parser, error, 'N', tmp);
 906 }
 907
 908 static void
 909 append_nbsps (YelpManParser *parser, guint k)
 910 {
 911     for (; k > 0; k--) {
 912         /* 0xc2 0xa0 is nonbreaking space in utf8 */
 913         g_string_append_c (parser->accumulator, 0xc2);
 914         g_string_append_c (parser->accumulator, 0xa0);
 915     }
 916 }
 917
 918 static gboolean
 919 parse_C (YelpManParser *parser, GError **error)
 920 {
 921     gchar name[16];
 922     gunichar code = 0;
 923     guint k;
 924     gint len;
 925
 926     if (SSCANF ("C%16s", 1, name)) {
 927         RAISE_PARSE_ERROR ("Can't understand special character: %s");
 928     }
 929
 930     for (k=0; char_translations[k].from; k++) {
 931         if (g_str_equal (char_translations[k].from, name)) {
 932             code = char_translations[k].to;
 933             break;
 934         }
 935     }
 936     if (sscanf (name, "u%x", &k) == 1) {
 937         code = k;
 938     }
 939
 940     if (!code) {
 941         g_warning ("Couldn't parse troff special character: '%s'",
 942                    name);
 943         code = 65533; /* Unicode replacement character */
 944     }
 945
 946     /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */
 947     len = g_unichar_to_utf8 (code, name);
 948     name[len] = '\0';
 949
 950     parser->N_count++;
 951
 952     return cheeky_call_parse_line (parser, error, 'C', name);
 953 }
 954
 955 static void
 956 deal_with_newlines (YelpManParser *parser)
 957 {
 958     /*
 959       If newline is true, this is the first word on a line.
 960
 961       In which case, we check to see whether hpos agrees with the
 962       current sheet's indent. If so (or if there isn't a sheet yet!),
 963       we just add to the accumulator. If not, start a new sheet with
 964       the correct indent.
 965
 966       If we aren't the first word on the line, just add to the
 967       accumulator.
 968     */
 969     gchar tmp[64];
 970     guint jump_lines;
 971     gboolean made_sheet = FALSE, dont_jump = FALSE;
 972
 973     /* This only happens at the start of a section, where there's
 974        already a gap
 975     */
 976     if (!parser->sheet_node) {
 977         dont_jump = TRUE;
 978     }
 979
 980     if ((!parser->sheet_node) ||
 981         (parser->newline && (parser->hpos != parser->sheet_indent))) {
 982         new_sheet (parser);
 983         made_sheet = TRUE;
 984     }
 985
 986     if (parser->newline) {
 987         if ((parser->last_vertical_jump > 0) && (!dont_jump)) {
 988             jump_lines =
 989                 parser->last_vertical_jump/parser->char_height;
 990         } else {
 991             jump_lines = 1;
 992         }
 993
 994         if (jump_lines > 1) {
 995             if (!made_sheet) new_sheet (parser);
 996             made_sheet = TRUE;
 997         }
 998
 999         snprintf (tmp, 64, "%u", dx_to_em_count (parser, parser->hpos));
1000         xmlNewProp (parser->sheet_node,
1001                     BAD_CAST "indent", BAD_CAST tmp);
1002
1003         if (made_sheet) {
1004             snprintf (tmp, 64, "%u", jump_lines-1);
1005             xmlNewProp (parser->sheet_node,
1006                         BAD_CAST "jump", BAD_CAST tmp);
1007         }
1008     }
1009
1010     parser->newline = FALSE;
1011     parser->last_vertical_jump = 0;
1012 }
1013
1014 static gboolean
1015 parse_p (YelpManParser *parser, GError **error)
1016 {
1017     parser->vpos = 0;
1018     parser->hpos = 0;
1019     return TRUE;
1020 }
1021
1022 static void
1023 new_sheet (YelpManParser *parser)
1024 {
1025     /* We don't need to worry about finishing the current sheet,
1026        since the accumulator etc. get cleared on newlines and we
1027        know we're at the start of a line.
1028     */
1029     parser->sheet_node =
1030         xmlAddChild (parser->section_node,
1031                      xmlNewNode (NULL, BAD_CAST "sheet"));
1032     parser->sheet_indent = parser->hpos;
1033 }
1034
1035 static void
1036 register_title (YelpManParser *parser,
1037                 const gchar* name, const gchar* section)
1038 {
1039     xmlNewTextChild (parser->header,
1040                      NULL, BAD_CAST "title", BAD_CAST name);
1041     xmlNewTextChild (parser->header,
1042                      NULL, BAD_CAST "section", BAD_CAST section);
1043 }
1044
/*
 * Strip from the end of dst the characters it has in common with the
 * end of src, comparing byte by byte from the right.
 *
 * At most min(strlen(src), strlen(dst)) - 1 bytes are removed, so
 * the first byte of the shorter string is never compared and dst
 * never becomes empty as a result of this call.
 *
 * If either string is empty there is nothing to compare and dst is
 * left untouched.
 *
 * dst: NUL-terminated string, modified in place.
 * src: NUL-terminated string to compare against; not modified.
 */
static void
right_truncate_common (gchar *dst, const gchar *src)
{
    size_t len_src = strlen (src);
    size_t len_dst = strlen (dst);
    size_t k;

    /* Without this guard "len - 1" would wrap around and the
     * pointers below would be moved outside their strings.
     */
    if (len_src == 0 || len_dst == 0) return;

    k = ((len_src < len_dst) ? len_src : len_dst) - 1;

    dst += len_dst - 1;
    src += len_src - 1;

    while (k > 0 && *dst == *src) {
        *dst = '\0';

        k--;
        dst--;
        src--;
    }
}
1065
1066 static gboolean
1067 cheeky_call_parse_line (YelpManParser *parser, GError **error,
1068                         gchar first_char, const gchar* text)
1069 {
1070     /* Do a cunning trick. There's all sorts of code that parse_text
1071      * does, which we don't want to duplicate in parse_N and
1072      * parse_C. So feed a buffer back to parse_text. Tada! Start it
1073      * with "C" or "N" rather than "t" so clever stuff in parse_text
1074      * can tell the difference.
1075      */
1076     gchar *tmp;
1077     gboolean ret;
1078     guint len = strlen (text);
1079
1080     tmp = parser->buffer;
1081     parser->buffer = g_new (gchar, 2 + len);
1082     parser->buffer[0] = first_char;
1083     strncpy (parser->buffer + 1, text, len + 1);
1084
1085     ret = parse_text (parser, error);
1086
1087     g_free (parser->buffer);
1088     parser->buffer = tmp;
1089
1090     return ret;
1091 }
1092
1093 static void
1094 cleanup_parsed_page (YelpManParser *parser)
1095 {
1096     /* First job: the last line usually has the version, date and
1097      * title (again!). The code above misunderstands and parses this
1098      * as a section, so we need to "undo" this and stick the data in
1099      * the header where it belongs.
1100      *
1101      * parser->section_node should still point to it. We assume this
1102      * has happened if it has exactly one child element (the <title>
1103      * tag)
1104      */
1105     gchar *lastline;
1106     GRegex *regex;
1107     gchar regex_string [1024];
1108
1109     if (xmlChildElementCount (parser->section_node) == 1) {
1110         lastline = (gchar *)xmlNodeGetContent (parser->section_node);
1111
1112         /* If parse_last_line works, it sets the data from it in the
1113            <header> tag, so delete the final section. */
1114         if (parse_last_line (parser, lastline)) {
1115             xmlUnlinkNode (parser->section_node);
1116             xmlFreeNode (parser->section_node);
1117         }
1118         else {
1119             /* Oh dear. This would be unexpected and doesn't seem to
1120                happen with man on my system. But we probably shouldn't
1121                ditch the info, so let's leave the <section> tag and
1122                print a warning message to the console.
1123             */
1124             g_warning ("Unexpected final line in man document (%s)\n",
1125                        lastline);
1126         }
1127
1128         xmlFree (lastline);
1129     }
1130
1131     /* Next job: Go through and stick the links in. Text that looks
1132      * like man(1) should be converted to a link to man:man(1) and
1133      * urls should also be linkified.
1134      *
1135      * Unfortunately, it's not entirely clear what constitutes a valid
1136      * section. All sections must be alphanumeric and the logic we use
1137      * to avoid extra hits (eg "one or more widget(s)") is that either
1138      * the section must start with a digit or (if the current section
1139      * doesn't) must start with the same letter as the current
1140      * section.
1141      */
1142     snprintf (regex_string, 1024,
1143               "([a-zA-Z0-9\\-_.:]+)\\(((%c|[0-9])[a-zA-Z0-9]*)\\)",
1144               parser->section ? parser->section[0] : '0');
1145     regex = g_regex_new (regex_string, 0, 0, NULL);
1146     g_return_if_fail (regex);
1147     fixup_links (parser, regex, man_link_inserter);
1148     g_regex_unref (regex);
1149
1150     /* Now for http:// links.
1151      */
1152     regex = g_regex_new ("https?:\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+"
1153                          "([\\w\\-\\.,@?^=%&:/~\\+#]*"
1154                          "[\\w\\-\\@?^=%&/~\\+#])?",
1155                          0, 0, NULL);
1156     g_return_if_fail (regex);
1157     fixup_links (parser, regex, http_link_inserter);
1158     g_regex_unref (regex);
1159 }
1160
1161 static gchar *
1162 skip_whitespace (gchar *text)
1163 {
1164     while (g_unichar_isspace (g_utf8_get_char (text))) {
1165         text = g_utf8_next_char (text);
1166     }
1167     return text;
1168 }
1169
1170 static gchar *
1171 last_non_whitespace (gchar *text)
1172 {
1173     gchar *end = text + strlen(text);
1174     gchar *prev;
1175
1176     prev = g_utf8_find_prev_char (text, end);
1177     if (!prev) {
1178         /* The string must have been zero-length. */
1179         return NULL;
1180     }
1181
1182     while (g_unichar_isspace (g_utf8_get_char (prev))) {
1183         end = prev;
1184         prev = g_utf8_find_prev_char (text, prev);
1185         if (!prev) return NULL;
1186     }
1187     return end;
1188 }
1189
1190 static gchar *
1191 find_contiguous_whitespace (gchar *text, guint ws_len)
1192 {
1193     guint counter = 0;
1194     gchar *ws_start;
1195     while (*text) {
1196         if (g_unichar_isspace (g_utf8_get_char (text))) {
1197             if (!counter) ws_start = text;
1198             counter++;
1199         }
1200         else counter = 0;
1201
1202         if (counter == ws_len) return ws_start;
1203
1204         text = g_utf8_next_char (text);
1205     }
1206     return NULL;
1207 }
1208
1209 static gboolean
1210 parse_last_line (YelpManParser *parser, gchar* line)
1211 {
1212     /* We expect a line of the form
1213            '1.2.3      blah 2009       libfoo(1)'
1214        where the spaces are all nbsp's.
1215
1216        Look for a gap of at least 3 in a row. If we find that, expand
1217        either side and declare the stuff before to be the version
1218        number and then the stuff afterwards to be the start of the
1219        date. Then do the same thing on the next gap, if there is one.
1220     */
1221     gchar *gap, *date_start;
1222
1223     gchar *version;
1224     gchar *date;
1225
1226     gap = find_contiguous_whitespace (line, 3);
1227     if (!gap) return FALSE;
1228
1229     version = g_strndup (line, gap - line);
1230
1231     date_start = skip_whitespace (gap);
1232
1233     gap = find_contiguous_whitespace (date_start, 3);
1234     if (!gap) return FALSE;
1235
1236     date = g_strndup (date_start, gap - date_start);
1237
1238     xmlNewProp (parser->header, BAD_CAST "version", BAD_CAST version);
1239     xmlNewProp (parser->header, BAD_CAST "date", BAD_CAST date);
1240
1241     g_free (version);
1242     g_free (date);
1243
1244     return TRUE;
1245 }
1246
1247 /* This should work like g_strstrip, but that's an ASCII-only version
1248  * and I want to strip the nbsp's that I so thoughtfully plaster
1249  * stuff with...
1250  */
1251 static void
1252 unicode_strstrip (gchar *str)
1253 {
1254     gchar *start, *end;
1255
1256     if (str == NULL) return;
1257
1258     end = last_non_whitespace (str);
1259
1260     if (!end) {
1261         /* String is zero-length or entirely whitespace */
1262         *str = '\0';
1263         return;
1264     }
1265     start = skip_whitespace (str);
1266
1267     g_memmove (str, start, end - start);
1268     *(str + (end - start)) = '\0';
1269 }
1270
1271 static void
1272 sheet_fixup_links (xmlNodePtr sheet,
1273                    const GRegex *regex, link_inserter inserter)
1274 {
1275     /*
1276       This works as follows: grab (<span>) nodes from a sheet in
1277       order and stick their contents into a string. Since a sheet
1278       won't be ludicrously long, we can just grab everything and then
1279       work over it, but we need to keep track of which node points at
1280       which bit of the string so we can call inserter helpfully. To do
1281       so, use byte offsets, since that seems less likely to go
1282       horribly wrong!
1283     */
1284     GString *accumulator = g_string_new ("");
1285     xmlNodePtr span;
1286     xmlChar *tmp;
1287     gsize offset = 0;
1288     gsize len;
1289     offset_elt_pair pair;
1290     GMatchInfo *match_info;
1291
1292     /* Make pairs zero-terminated so that code can iterate through it
1293      * looking for something with elt = NULL. */
1294     GArray *pairs = g_array_new (TRUE, FALSE,
1295                                  sizeof (offset_elt_pair));
1296
1297     g_return_if_fail (regex);
1298     g_return_if_fail (inserter);
1299     g_return_if_fail (sheet);
1300
1301     for (span = sheet->children; span != NULL; span = span->next) {
1302         if (span->type != XML_ELEMENT_NODE) continue;
1303
1304         if (strcmp ((const char*) span->name, "span") != 0) {
1305
1306             if (strcmp ((const char*) span->name, "a") == 0)
1307                 continue;
1308
1309             if (strcmp ((const char*) span->name, "br") == 0) {
1310                 /* If the last character in the accumulator is a
1311                  * hyphen, we don't want to include that in the link
1312                  * we make. If not, append a newline to the
1313                  * accumulator (so we don't mistakenly make links from
1314                  * "see\nthis(2)" to seethis(2).
1315                  *
1316                  * Either way, we add the <br> to the list of pairs
1317                  * since we might need to do stuff with it if it's in
1318                  * the middle of a link.
1319                  */
1320                 len = strlen (accumulator->str);
1321                 if (len > 0 && accumulator->str [len-1] == '-') {
1322                     g_string_truncate (accumulator, len - 1);
1323                     offset--;
1324                 }
1325                 else {
1326                     g_string_append_c (accumulator, '\n');
1327                     offset++;
1328                 }
1329                 pair.start = offset;
1330                 pair.end = offset;
1331                 pair.elt = span; /* Er, br in fact. */
1332                 g_array_append_val (pairs, pair);
1333
1334                 continue;
1335             }
1336
1337             g_warning ("Expected all child elements to be "
1338                        "<span>, <br> or <a>, but "
1339                        "have found a <%s>.",
1340                        (gchar *) span->name);
1341             continue;
1342         }
1343
1344         tmp = xmlNodeGetContent (span);
1345         g_string_append (accumulator, (gchar *) tmp);
1346         len = strlen ((const char*) tmp);
1347
1348         pair.start = offset;
1349         pair.end = offset + len;
1350         pair.elt = span;
1351
1352         g_array_append_val (pairs, pair);
1353
1354         offset += len;
1355         xmlFree (tmp);
1356     }
1357
1358     /* We've got the data. Now try to match the regex against it as
1359      * many times as possible
1360      */
1361     offset = 0;
1362     g_regex_match_full (regex, accumulator->str,
1363                         -1, offset, 0, &match_info, NULL);
1364     while (g_match_info_matches (match_info)) {
1365         offset = inserter ((offset_elt_pair *)pairs->data,
1366                            match_info);
1367
1368         g_match_info_free (match_info);
1369
1370         g_regex_match_full (regex, accumulator->str,
1371                             -1, offset, 0, &match_info, NULL);
1372     }
1373
1374     g_string_free (accumulator, TRUE);
1375     g_array_unref (pairs);
1376 }
1377
1378 static void
1379 fixup_links (YelpManParser *parser,
1380              const GRegex *regex, link_inserter inserter)
1381 {
1382     /* Iterate over all the <sheet>'s in the xml document */
1383     xmlXPathContextPtr context;
1384     xmlXPathObjectPtr path_obj;
1385     xmlNodeSetPtr nodeset;
1386     gint i;
1387
1388     context = xmlXPathNewContext (parser->doc);
1389     g_return_if_fail (context);
1390
1391     path_obj = xmlXPathEvalExpression (BAD_CAST "//sheet", context);
1392     g_return_if_fail (path_obj);
1393
1394     nodeset = path_obj->nodesetval;
1395     g_return_if_fail (nodeset);
1396
1397     for (i = 0; i < nodeset->nodeNr; ++i) {
1398         sheet_fixup_links (nodeset->nodeTab[i], regex, inserter);
1399     }
1400
1401     xmlXPathFreeObject (path_obj);
1402     xmlXPathFreeContext (context);
1403 }
1404
1405 /*
1406   This inserts new_child under parent. If older_sibling is non-NULL,
1407   we stick it immediately after it. Otherwise, insert as the first
1408   child of the parent.
1409
1410   Returns the inserted child.
1411  */
1412 static xmlNodePtr
1413 insert_child_after (xmlNodePtr parent, xmlNodePtr older_sibling,
1414                     xmlNodePtr new_child)
1415 {
1416     g_return_val_if_fail (parent && new_child, new_child);
1417
1418     if (older_sibling) {
1419         xmlAddNextSibling (older_sibling, new_child);
1420     }
1421     else if (parent->children == NULL) {
1422         xmlAddChild (parent, new_child);
1423     }
1424     else {
1425         xmlAddPrevSibling (parent->children, new_child);
1426     }
1427
1428     return new_child;
1429 }
1430
1431 static void
1432 copy_prop (xmlNodePtr to, xmlNodePtr from, const xmlChar *name)
1433 {
1434     xmlChar *prop = xmlGetProp (from, name);
1435     g_return_if_fail (prop);
1436     xmlSetProp (to, name, prop);
1437     xmlFree (prop);
1438 }
1439
1440 static gsize
1441 do_node_replacement (xmlNodePtr anchor_node,
1442                      offset_elt_pair *offsets,
1443                      gsize startpos, gsize endpos)
1444 {
1445     xmlNodePtr node, sibling_before;
1446     gchar *gtmp;
1447     xmlChar *xtmp, *xshort;
1448     gsize look_from;
1449
1450     /* Find the first element by searching through offsets. I suppose
1451      * a binary search would be cleverer, but I doubt that this will
1452      * take significant amounts of time.
1453      *
1454      * We should never fall off the end, but (just in case) the GArray
1455      * that holds the offsets is zero-terminated and elt should never
1456      * be NULL so we can stop if necessary
1457      */
1458     while ((offsets->end <= startpos) && offsets->elt) {
1459         offsets++;
1460     }
1461     g_return_val_if_fail (offsets->elt, endpos);
1462
1463     /* xtmp is NULL by default, but we do this here so that if we read
1464      * the node in the if block below, we don't have to do it a second
1465      * time.
1466      */
1467     xtmp = NULL;
1468     sibling_before = offsets->elt->prev;
1469     look_from = startpos;
1470
1471     /* Maybe there's text in the relevant span before the start of
1472      * the stuff we want to replace with a link.
1473      */
1474     if (startpos > offsets->start) {
1475         node = xmlNewNode (NULL, BAD_CAST "span");
1476         copy_prop (node, offsets->elt, BAD_CAST "class");
1477
1478         xtmp = xmlNodeGetContent (offsets->elt);
1479         gtmp = g_strndup ((const gchar*)xtmp, startpos - offsets->start);
1480         xmlNodeAddContent (node, BAD_CAST gtmp);
1481         g_free (gtmp);
1482
1483         sibling_before = insert_child_after (offsets->elt->parent,
1484                                              sibling_before, node);
1485     }
1486
1487     insert_child_after (offsets->elt->parent,
1488                         sibling_before, anchor_node);
1489
1490     /* The main loop. Here we work over each span that overlaps with
1491      * the link we're adding. We add a similar span as a child of the
1492      * anchor node and then delete the existing one.  */
1493     while (look_from < endpos) {
1494         if (!xtmp) xtmp = xmlNodeGetContent (offsets->elt);
1495
1496         if (strcmp ((const gchar*)offsets->elt->name, "br") == 0) {
1497             node = xmlNewChild (anchor_node,
1498                                 NULL, BAD_CAST "br", NULL);
1499             xmlUnlinkNode (offsets->elt);
1500             xmlFreeNode (offsets->elt);
1501             xmlFree (xtmp);
1502             xtmp = NULL;
1503             offsets++;
1504         }
1505         else if (endpos < offsets->end) {
1506             xshort = BAD_CAST g_strndup ((const gchar*)xtmp,
1507                                          endpos - offsets->start);
1508
1509             node = xmlNewChild (anchor_node, NULL, BAD_CAST "span",
1510                                 xshort + (look_from-offsets->start));
1511             copy_prop (node, offsets->elt, BAD_CAST "class");
1512
1513             node = xmlNewNode (NULL, BAD_CAST "span");
1514             xmlNodeAddContent (node,
1515                                xtmp + (endpos - offsets->start));
1516             copy_prop (node, offsets->elt, BAD_CAST "class");
1517             xmlAddNextSibling (anchor_node, node);
1518
1519             xmlFree (xshort);
1520
1521             xmlUnlinkNode (offsets->elt);
1522             xmlFreeNode (offsets->elt);
1523             xmlFree (xtmp);
1524             xtmp = NULL;
1525
1526             offsets->start = endpos;
1527             offsets->elt = node;
1528         }
1529         else {
1530             node = xmlNewChild (anchor_node, NULL, BAD_CAST "span",
1531                                 xtmp + (look_from - offsets->start));
1532             copy_prop (node, offsets->elt, BAD_CAST "class");
1533
1534             xmlUnlinkNode (offsets->elt);
1535             xmlFreeNode (offsets->elt);
1536             xmlFree (xtmp);
1537             xtmp = NULL;
1538             offsets++;
1539         }
1540
1541         if (!offsets->elt) {
1542             /* We got to the end of a sheet and of the stuff we're
1543              * doing at the same time
1544              */
1545             return endpos;
1546         }
1547
1548         look_from = offsets->start;
1549     }
1550
1551     return offsets->start;
1552 }
1553
1554 static gsize
1555 do_link_insertion (const gchar *url,
1556                    offset_elt_pair *offsets,
1557                    gsize startpos, gsize endpos)
1558 {
1559     xmlNodePtr anchor_node = xmlNewNode (NULL, BAD_CAST "a");
1560
1561     xmlNewProp (anchor_node, BAD_CAST "href", BAD_CAST url);
1562
1563     return do_node_replacement (anchor_node, offsets,
1564                                 startpos, endpos);
1565 }
1566
1567 static gsize
1568 man_link_inserter (offset_elt_pair *offsets,
1569                    const GMatchInfo *match_info)
1570 {
1571     gchar *name, *section;
1572     gchar url[1024];
1573
1574     gint startpos, endpos;
1575
1576     g_match_info_fetch_pos (match_info, 0, &startpos, &endpos);
1577
1578     name = g_match_info_fetch (match_info, 1);
1579     section = g_match_info_fetch (match_info, 2);
1580
1581     g_return_val_if_fail (name && section, endpos);
1582
1583     snprintf (url, 1024, "man:%s(%s)", name, section);
1584
1585     g_free (name);
1586     g_free (section);
1587
1588     return do_link_insertion (url, offsets, startpos, endpos);
1589 }
1590
1591 static gsize
1592 http_link_inserter (offset_elt_pair *offsets,
1593                     const GMatchInfo *match_info)
1594 {
1595     gchar *url;
1596     gint startpos, endpos;
1597     gsize ret;
1598
1599     url = g_match_info_fetch (match_info, 0);
1600     g_match_info_fetch_pos (match_info, 0, &startpos, &endpos);
1601
1602     ret = do_link_insertion (url, offsets, startpos, endpos);
1603
1604     g_free (url);
1605
1606     return ret;
1607 }