Update Chinese (China) translation
[yelp.git] / libyelp / yelp-man-parser.c
blob46073a2e213b54febfd050485b672152e2a5e4b7
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 4 -*- */
2 /*
3 * Copyright (C) 2003-2010 Shaun McCance <shaunm@gnome.org>
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
18 * Author: Shaun McCance <shaunm@gnome.org>
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
25 #include <glib.h>
26 #include <glib/gi18n.h>
27 #include <libxml/tree.h>
28 #include <libxml/xpath.h>
29 #include <gio/gio.h>
30 #include <gio/gunixinputstream.h>
31 #include <string.h>
32 #include <math.h>
34 #include "yelp-error.h"
35 #include "yelp-man-parser.h"
37 #define MAN_FONTS 8
/* State of the header parser.
 *
 * The page begins with two copies of the title (something like MAN(1)),
 * possibly with a string of text in between naming the collection.
 *
 *   START      - nothing recognised yet.
 *   HAVE_TITLE - the first word with parentheses has been read; words
 *                seen from now on go into the "collection" tag.
 *   BODY       - the second copy of the parenthesised title has been
 *                seen and normal body parsing is in effect.
 */
typedef enum ManParserState
{
    START,
    HAVE_TITLE,
    BODY
} ManParserState;
/* Tells whether incoming body text belongs to a section's title or to
 * its contents. See parse_body_text for how this is used. */
typedef enum ManParserSectionState
{
    SECTION_TITLE,
    SECTION_BODY
} ManParserSectionState;
/* All state carried while converting one page of groff intermediate
 * output into an XML document. */
struct _YelpManParser {
    xmlDocPtr doc;            /* The top-level XML document */
    xmlNodePtr header;        /* The header node */
    xmlNodePtr section_node;  /* The current section */
    xmlNodePtr sheet_node;    /* The current sheet */

    GDataInputStream *stream; /* The GIO input stream to read from */
    gchar *buffer;            /* The buffer, line at a time */
    gsize length;             /* The buffer length */

    gchar *section;           /* The name of the current section */

    /* The width and height of a character according to troff. */
    guint char_width;
    guint char_height;

    /* Count the number of lines we've parsed (needed to get prologue) */
    guint line_no;

    /* The x f k name command sets the k'th register to be name. */
    gchar* font_registers[MAN_FONTS];

    /* The current font. Should be the index of one of the
     * font_registers. Starts at 0 (of course!)
     */
    guint current_font;

    /* See description of ManParserState above */
    ManParserState state;

    /* Vertical and horizontal position as far as the troff output is
     * concerned. (Measured from top-left).
     */
    guint vpos, hpos;

    /* Text accumulator (needed since it comes through in dribs &
     * drabs...) */
    GString *accumulator;

    /* See parse_body_text for how this is used. */
    ManParserSectionState section_state;

    /* The indent of the current sheet */
    guint sheet_indent;

    /* Set to TRUE if there's been a newline since the last text was
     * parsed. */
    gboolean newline;

    /* Count the number of 'N' lines we've seen since the last h
     * command. This is because for some reason N doesn't
     * automatically move the position forward. Thus immediately after
     * one, you see a h24 or the like. Unless there's a space. Then it
     * might be wh48. This is set in parse_N (obviously) and used in
     * parse_h.
     */
    guint N_count;

    /* Keep track of whether the last character was a space. We can't
     * just do this by looking at the last char of accumulator,
     * because if there's a font change, it gets zeroed. This gets set
     * to TRUE by parse_w and is FALSE the rest of the time.
     */
    gboolean last_char_was_space;

    /* Keep track of the size of the last vertical jump - used to tell
     * whether we need to insert extra space above a line.
     */
    gint last_vertical_jump;

    /* The title we read earlier (eg 'Foo(2)') */
    gchar *title_str;
};
/* Entry points for handling a single line held in parser->buffer. */
static gboolean parser_parse_line (YelpManParser *parser, GError **error);
static gboolean parse_prologue_line (YelpManParser *parser, GError **error);

/* Parsers for different types of line */
typedef gboolean (*LineParser)(YelpManParser *, GError **);

/* Forward-declares one line parser. The expansion already ends in a
 * semicolon, so uses of the macro are not followed by one. */
#define DECLARE_LINE_PARSER(name) \
    static gboolean (name) (YelpManParser *parser, GError **error);

DECLARE_LINE_PARSER (parse_xf)
DECLARE_LINE_PARSER (parse_f)
DECLARE_LINE_PARSER (parse_V)
DECLARE_LINE_PARSER (parse_H)
DECLARE_LINE_PARSER (parse_v)
DECLARE_LINE_PARSER (parse_h)
DECLARE_LINE_PARSER (parse_text)
DECLARE_LINE_PARSER (parse_w)
DECLARE_LINE_PARSER (parse_body_text)
DECLARE_LINE_PARSER (parse_n)
DECLARE_LINE_PARSER (parse_N)
DECLARE_LINE_PARSER (parse_C)
DECLARE_LINE_PARSER (parse_p)
/* Declare a sort of alist registry of parsers for different lines. */
struct LineParsePair
{
    const gchar *prefix;   /* Leading text that selects the handler */
    LineParser handler;    /* Function called for a matching line */
};

/* Searched in order by parser_parse_line; the first entry whose prefix
 * matches the start of the line wins. Terminated by a NULL handler. */
static struct LineParsePair line_parsers[] = {
    { "x f", parse_xf }, { "f", parse_f },
    { "V", parse_V }, { "H", parse_H },
    { "v", parse_v }, { "h", parse_h },
    { "t", parse_text },
    { "w", parse_w },
    { "n", parse_n },
    { "N", parse_N },
    { "C", parse_C },
    { "p", parse_p },
    { NULL, NULL }
};
/******************************************************************************/
/* Parser helper functions (managing the state of the various parsing
 * bits) */
static void finish_span (YelpManParser *parser);
static guint dx_to_em_count (YelpManParser *parser, guint dx);
static void append_nbsps (YelpManParser *parser, guint k);
static void deal_with_newlines (YelpManParser *parser);
static void new_sheet (YelpManParser *parser);
static void register_title (YelpManParser *parser,
                            const gchar* name, const gchar* section);
static void right_truncate_common (gchar *dst, const gchar *src);
static gboolean cheeky_call_parse_line (YelpManParser *parser,
                                        GError **error,
                                        gchar first_char,
                                        const gchar *text);
static void cleanup_parsed_page (YelpManParser *parser);
static gboolean parse_last_line (YelpManParser *parser, gchar* line);
static void unicode_strstrip (gchar *str);
/*
  A link_inserter takes
    (1) an array of offsets for the different spans within the string
    (2) the match info from the regex match

  It's then responsible for mangling the XML tree to insert the actual
  link. Finally, it should return the offset into the string of the
  end of what it's just dealt with. If necessary, it should also fix
  up offsets to point correctly at the last node inserted.
*/
typedef struct {
    gsize start, end;   /* Offsets into the concatenated string */
    xmlNodePtr elt;     /* The XML element associated with that range */
} offset_elt_pair;

typedef gsize (*link_inserter)(offset_elt_pair *,
                               const GMatchInfo *);

static void fixup_links (YelpManParser *parser,
                         const GRegex *matcher,
                         link_inserter inserter);

static gsize man_link_inserter (offset_elt_pair *offsets,
                                const GMatchInfo *match_info);
static gsize http_link_inserter (offset_elt_pair *offsets,
                                 const GMatchInfo *match_info);
/******************************************************************************/
/* Translations for the 'C' command. This is indeed hackish, but the
 * -Tutf8 output doesn't seem to give include files so we can do this
 * at runtime :-(
 *
 * On my machine, this data's at /usr/share/groff/current/tmac/ in
 * latin1.tmac, unicode.tmac and I worked out the lq and rq from
 * running man: I'm not sure where that comes from!
 */
struct StringPair
{
    const gchar *from;   /* Name of the troff special character */
    gunichar to;         /* Unicode code point it is translated to */
};

/* Looked up linearly by parse_C; terminated by a NULL 'from'. */
static const struct StringPair char_translations[] = {
    { "r!", 161 },
    { "ct", 162 },
    { "Po", 163 },
    { "Cs", 164 },
    { "Ye", 165 },
    { "bb", 166 },
    { "sc", 167 },
    { "ad", 168 },
    { "co", 169 },
    { "Of", 170 },
    { "Fo", 171 },
    { "tno", 172 },
    { "%", 173 },
    { "rg", 174 },
    { "a-", 175 },
    { "de", 176 },
    { "t+-", 177 },
    { "S2", 178 },
    { "S3", 179 },
    { "aa", 180 },
    { "mc", 181 },
    { "ps", 182 },
    { "pc", 183 },
    { "ac", 184 },
    { "S1", 185 },
    { "Om", 186 },
    { "Fc", 187 },
    { "14", 188 },
    { "12", 189 },
    { "34", 190 },
    { "r?", 191 },
    { "`A", 192 },
    { "'A", 193 },
    { "^A", 194 },
    { "~A", 195 },
    { ":A", 196 },
    { "oA", 197 },
    { "AE", 198 },
    { ",C", 199 },
    { "`E", 200 },
    { "'E", 201 },
    { "^E", 202 },
    { ":E", 203 },
    { "`I", 204 },
    { "'I", 205 },
    { "^I", 206 },
    { ":I", 207 },
    { "-D", 208 },
    { "~N", 209 },
    { "`O", 210 },
    { "'O", 211 },
    { "^O", 212 },
    { "~O", 213 },
    { ":O", 214 },
    { "tmu", 215 },
    { "/O", 216 },
    { "`U", 217 },
    { "'U", 218 },
    { "^U", 219 },
    { ":U", 220 },
    { "'Y", 221 },
    { "TP", 222 },
    { "ss", 223 },
    { "`a", 224 },
    { "'a", 225 },
    { "^a", 226 },
    { "~a", 227 },
    { ":a", 228 },
    { "oa", 229 },
    { "ae", 230 },
    { ",c", 231 },
    { "`e", 232 },
    { "'e", 233 },
    { "^e", 234 },
    { ":e", 235 },
    { "`i", 236 },
    { "'i", 237 },
    { "^i", 238 },
    { ":i", 239 },
    { "Sd", 240 },
    { "~n", 241 },
    { "`o", 242 },
    { "'o", 243 },
    { "^o", 244 },
    { "~o", 245 },
    { ":o", 246 },
    { "tdi", 247 },
    { "/o", 248 },
    { "`u", 249 },
    { "'u", 250 },
    { "^u", 251 },
    { ":u", 252 },
    { "'y", 253 },
    { "Tp", 254 },
    { ":y", 255 },
    { "hy", '-' },
    { "oq", '`' },
    { "cq", '\'' },
    { "lq", 8220 }, // left smart quotes
    { "rq", 8221 }, // right smart quotes
    { "en", 8211 }, // en-dash
    { "em", 8212 }, // em-dash
    { "la", 10216 }, // left angle bracket
    { "ra", 10217 }, // right angle bracket
    { "rs", '\\' },
    { "<=", 8804 }, // < or equal to sign
    { ">=", 8805 }, // > or equal to sign
    { "aq", '\'' },
    { "tm", 8482 }, // trademark symbol
    { NULL, 0 }
};
349 /******************************************************************************/
351 YelpManParser *
352 yelp_man_parser_new (void)
354 YelpManParser *parser = g_new0 (YelpManParser, 1);
355 parser->accumulator = g_string_sized_new (1024);
356 return parser;
360 This function is responsible for taking a path to a man file and
361 returning something in the groff intermediate output format for us
362 to use.
364 If something goes wrong, we return NULL and set error to be a
365 YelpError describing the problem.
367 static GInputStream*
368 get_troff (gchar *path, GError **error)
370 gint ystdout;
371 GError *err = NULL;
372 const gchar *argv[] = { "man", "-Z", "-Tutf8", "-EUTF-8", path, NULL };
373 gchar **my_argv;
375 /* g_strdupv() should accept a "const gchar **". */
376 my_argv = g_strdupv ((gchar **) argv);
378 if (!g_spawn_async_with_pipes (NULL, my_argv, NULL,
379 G_SPAWN_SEARCH_PATH, NULL, NULL,
380 NULL, NULL, &ystdout, NULL, &err)) {
381 /* We failed to run the man program. Return a "Huh?" error. */
382 *error = g_error_new (YELP_ERROR, YELP_ERROR_UNKNOWN,
383 "%s", err->message);
384 g_error_free (err);
385 g_strfreev (my_argv);
386 return NULL;
389 g_strfreev (my_argv);
391 return (GInputStream*) g_unix_input_stream_new (ystdout, TRUE);
394 xmlDocPtr
395 yelp_man_parser_parse_file (YelpManParser *parser,
396 gchar *path,
397 GError **error)
399 GInputStream *troff_stream;
400 gboolean ret;
401 xmlNodePtr root;
403 troff_stream = get_troff (path, error);
404 if (!troff_stream) return NULL;
406 parser->stream = g_data_input_stream_new (troff_stream);
408 parser->doc = xmlNewDoc (BAD_CAST "1.0");
409 root = xmlNewNode (NULL, BAD_CAST "Man");
410 xmlDocSetRootElement (parser->doc, root);
412 parser->header = xmlNewNode (NULL, BAD_CAST "header");
413 xmlAddChild (root, parser->header);
415 while (1) {
416 parser->buffer =
417 g_data_input_stream_read_line (parser->stream,
418 &(parser->length),
419 NULL, NULL);
420 if (parser->buffer == NULL) break;
422 parser->line_no++;
423 ret = parser_parse_line (parser, error);
425 g_free (parser->buffer);
427 if (!ret) {
428 xmlFreeDoc (parser->doc);
429 parser->doc = NULL;
430 break;
434 cleanup_parsed_page (parser);
436 g_object_unref (parser->stream);
438 return parser->doc;
441 void
442 yelp_man_parser_free (YelpManParser *parser)
444 guint k;
445 if (parser) {
446 for (k=0; k<MAN_FONTS; k++)
447 g_free (parser->font_registers[k]);
449 g_string_free (parser->accumulator, TRUE);
450 g_free (parser->title_str);
451 g_free (parser->section);
452 g_free (parser);
455 /******************************************************************************/
457 /* Sets the k'th font register to be name. Copies name, so free it
458 * afterwards. k should be in [0,MAN_FONTS). It seems that man always
459 * gives us ones at least 1, but groff_out(5) says non-negative.
461 static void
462 set_font_register (YelpManParser *parser, guint k, const gchar* name)
464 if (k > MAN_FONTS) {
465 g_warning ("Tried to set nonexistant font register %u to %s",
466 k, name);
467 return;
469 g_free (parser->font_registers[k]);
470 parser->font_registers[k] = g_strdup (name);
473 static const gchar*
474 get_font (const YelpManParser *parser)
476 guint k = parser->current_font;
477 if (k > MAN_FONTS ||
478 parser->font_registers[k] == NULL) {
480 g_warning ("Tried to get nonexistant font register %u", k);
482 return "";
485 return parser->font_registers[k];
488 /******************************************************************************/
/*
  Convenience macros to scan a string, checking for the correct number
  of things read.

  SSCANF scans parser->buffer and evaluates to TRUE when the scan did
  NOT produce the expected number of conversions, i.e. on failure.

  Also to raise an error. Add an %s to the end of the format string,
  which automatically gets given parser->buffer.

  All three expect variables called "parser" (and, for
  RAISE_PARSE_ERROR, "error") to be in scope. RAISE_PARSE_ERROR returns
  FALSE from the enclosing function and is safe when error is NULL.
*/
#define SSCANF(fmt,num,...)                                 \
    (sscanf (parser->buffer, (fmt), __VA_ARGS__) != (num))

#define PARSE_ERROR(...)                                    \
    g_error_new (YELP_ERROR, YELP_ERROR_PROCESSING,         \
                 __VA_ARGS__, parser->buffer)
#define RAISE_PARSE_ERROR(...)                              \
    { g_set_error (error, YELP_ERROR, YELP_ERROR_PROCESSING, \
                   __VA_ARGS__, parser->buffer);            \
      return FALSE; }
506 static gboolean
507 parser_parse_line (YelpManParser *parser, GError **error)
509 const struct LineParsePair *p;
511 if (parser->line_no <= 3)
512 return parse_prologue_line (parser, error);
514 p = line_parsers;
515 while (p->handler != NULL) {
516 if (g_str_has_prefix (parser->buffer, p->prefix)) {
517 return p->handler(parser, error);
519 p++;
521 return TRUE;
524 static gboolean
525 parse_prologue_line (YelpManParser *parser, GError **error)
527 if (parser->line_no != 2) return TRUE;
529 /* This is the interesting line, which should look like
530 x res 240 24 40
531 The interesting bits are the 24 and the 40, which are the
532 width and height of a character as far as -Tutf8 is
533 concerned.
535 if (SSCANF ("x %*s %*u %u %u", 2,
536 &parser->char_width, &parser->char_height)) {
537 RAISE_PARSE_ERROR ("Wrong 'x res' line from troff: %s");
540 return TRUE;
543 static gboolean
544 parse_xf (YelpManParser *parser, GError **error)
546 gchar name[10];
547 guint k;
549 if (SSCANF ("x f%*s %u %10s", 2, &k, name)) {
550 RAISE_PARSE_ERROR ("Invalid 'x f' line from troff: %s");
552 set_font_register (parser, k, name);
553 return TRUE;
556 static gboolean
557 parse_f (YelpManParser *parser, GError **error)
559 guint k;
560 if (SSCANF ("f%u", 1, &k)) {
561 RAISE_PARSE_ERROR ("Invalid font line from troff: %s");
563 finish_span (parser);
565 parser->current_font = k;
567 return TRUE;
570 static gboolean
571 parse_v (YelpManParser *parser, GError **error)
573 guint dy;
574 if (SSCANF ("v%u", 1, &dy)) {
575 RAISE_PARSE_ERROR ("Invalid v line from troff: %s");
577 parser->last_vertical_jump += dy;
578 parser->vpos += dy;
579 return TRUE;
582 static gboolean
583 parse_h (YelpManParser *parser, GError **error)
585 guint dx;
586 int k;
588 if (SSCANF ("h%u", 1, &dx)) {
589 RAISE_PARSE_ERROR ("Invalid h line from troff: %s");
591 parser->hpos += dx;
593 /* This is a bit hackish to be honest but... if we're in something
594 * that'll end up in a span, a spacing h command means that a gap
595 * should appear. It seems that the easiest way to get this is to
596 * insert nonbreaking spaces (eugh!)
598 * Of course we don't want to do this when chained from wh24 or
599 * whatever, so use the last_char_was_space flag
600 * but... unfortunately some documents actually use stuff like
601 * wh96 for spacing (eg the lists in perl(1)). So (very hackish!),
602 * ignore double spaces, since that's probably just been put in to
603 * make the text justified (eugh), but allow bigger jumps.
605 * Incidentally, the perl manual here has bizarre gaps in the
606 * synopsis section. God knows why, but man displays them too so
607 * it's not our fault! :-)
609 k = dx_to_em_count (parser, dx);
611 if ((!parser->last_char_was_space) || (k > 2)) {
613 k -= parser->N_count;
614 if (k < 0) k = 0;
616 append_nbsps (parser, k);
619 parser->N_count = 0;
621 return TRUE;
624 static gboolean
625 parse_V (YelpManParser *parser, GError **error)
627 guint y;
628 if (SSCANF ("V%u", 1, &y)) {
629 RAISE_PARSE_ERROR ("Invalid V line from troff: %s");
631 parser->last_vertical_jump += y - parser->vpos;
632 parser->vpos = y;
633 return TRUE;
636 static gboolean
637 parse_H (YelpManParser *parser, GError **error)
639 guint x;
640 if (SSCANF ("H%u", 1, &x)) {
641 RAISE_PARSE_ERROR ("Invalid H line from troff: %s");
643 parser->hpos = x;
644 return TRUE;
647 static gboolean
648 parse_text (YelpManParser *parser, GError **error)
650 gchar *text, *section, *tmp;
651 const gchar *acc;
654 Sneakily, this might get called with something other than t
655 starting the buffer: see parse_C and parse_N.
657 if (parser->buffer[0] == 't') {
658 parser->N_count = 0;
661 if (parser->state == START) {
662 /* This should be the 'Title String(1)' line. It might come in
663 * chunks (for example, it might be more than one line
664 * long!). So just read bits until we get a (blah) bit: stick
665 * everything in the accumulator and check for
666 * parentheses. When we've got some, stick the parsed title in
667 * the header and switch to HAVE_TITLE.
669 * The parse_n code will error out if we didn't manage to get
670 * a title before the first newline and otherwise is in charge
671 * of switching to body-parsing mode.
673 g_string_append (parser->accumulator, parser->buffer+1);
675 acc = parser->accumulator->str;
677 section = strchr (acc, '(');
679 if (section) {
680 section++;
681 tmp = strchr (section, ')');
684 if (section && tmp) {
685 /* We've got 'Blah (3)' or the like in the accumulator */
686 if (*(tmp+1) != '\0') {
687 RAISE_PARSE_ERROR ("Don't understand title line: '%s'");
689 parser->state = HAVE_TITLE;
690 parser->title_str = g_strdup (acc);
692 text = g_strndup (acc, (section - 1) - acc);
693 section = g_strndup (section, tmp - section);
695 register_title (parser, text, section);
697 g_string_truncate (parser->accumulator, 0);
699 g_free (text);
700 parser->section = section;
703 return TRUE;
706 if (parser->state == BODY)
707 return parse_body_text (parser, error);
709 /* In state HAVE_TITLE */
710 else {
711 /* We expect (maybe!) to get some lines in between the two
712 * occurrences of the title itself. So collect up all the text
713 * we get and then we'll remove the copy of the title at the
714 * end (hopefully) when we find a newline in parse_n.
716 g_string_append (parser->accumulator, parser->buffer+1);
717 return TRUE;
721 static gboolean
722 parse_body_text (YelpManParser *parser, GError **error)
725 It's this function which is responsible for trying to get *some*
726 semantic information back out of the manual page.
728 The highest-level chopping up is into sections. We use the
729 heuristic that if either
730 (1) We haven't got a section yet or
731 (2) text starts a line (hpos=0)
732 then it's a section title.
734 It's possible to have spaces in section titles, so we carry on
735 accumulating the section title until the next newline.
737 if (parser->section_state == SECTION_BODY &&
738 (!parser->section_node || (parser->hpos == 0))) {
739 g_string_truncate (parser->accumulator, 0);
740 /* End the current sheet & section */
741 parser->section_state = SECTION_TITLE;
742 parser->sheet_node = NULL;
744 parser->section_node =
745 xmlAddChild (xmlDocGetRootElement (parser->doc),
746 xmlNewNode (NULL, BAD_CAST "section"));
749 if (parser->section_state != SECTION_TITLE) {
750 deal_with_newlines (parser);
753 g_string_append (parser->accumulator, parser->buffer+1);
755 /* Move hpos forward per char */
756 parser->hpos += strlen (parser->buffer+1) * parser->char_width;
758 parser->last_char_was_space = FALSE;
760 return TRUE;
764 w is a sort of prefix argument. It indicates a space, so we register
765 that here, then call parser_parse_line again on the rest of the
766 string to deal with that.
768 static gboolean
769 parse_w (YelpManParser *parser, GError **error)
771 gboolean ret;
773 if (parser->state != START) {
774 g_string_append_c (parser->accumulator, ' ');
777 parser->buffer++;
778 parser->last_char_was_space = TRUE;
780 ret = parser_parse_line (parser, error);
782 parser->buffer--;
783 return ret;
786 static gboolean
787 parse_n (YelpManParser *parser, GError **error)
789 xmlNodePtr node;
791 /* When we're in the header, the parse_n is responsible for
792 * switching to body text. (See the body of parse_text() for more
793 * of an explanation).
795 if (parser->state == START) {
796 /* Oh no! We've not got a proper title yet! Ho hum, let's
797 stick whatever's going into a 'title title' and have a null
798 section. Sob.
800 register_title (parser,
801 parser->accumulator->str,
802 "unknown section");
803 g_string_truncate (parser->accumulator, 0);
804 parser->state = BODY;
805 return TRUE;
808 if (parser->state == HAVE_TITLE) {
809 /* What we've got so far is the manual's collection, followed
810 by the title again. So we want to get rid of the latter if
811 possible...
813 right_truncate_common (parser->accumulator->str,
814 parser->title_str);
815 unicode_strstrip (parser->accumulator->str);
817 xmlNewTextChild (parser->header,
818 NULL, BAD_CAST "collection",
819 BAD_CAST parser->accumulator->str);
820 g_string_truncate (parser->accumulator, 0);
821 parser->state = BODY;
822 parser->section_state = SECTION_BODY;
823 return TRUE;
826 /* parser->state == BODY */
827 if (parser->section_state == SECTION_TITLE) {
829 g_strchomp (parser->accumulator->str);
830 xmlNewTextChild (parser->section_node, NULL,
831 BAD_CAST "title",
832 BAD_CAST parser->accumulator->str);
833 g_string_truncate (parser->accumulator, 0);
835 parser->section_state = SECTION_BODY;
837 else if (parser->sheet_node != NULL) {
839 In the body of a section, when we get to a newline we should
840 have an accumulator with text in it and a non-null sheet
841 (hopefully!).
843 We know the current font, so add a span for that font
844 containing the relevant text. Then add a <br/> tag.
846 finish_span (parser);
847 node = xmlNewNode (NULL, BAD_CAST "br");
848 xmlAddChild (parser->sheet_node, node);
851 parser->newline = TRUE;
852 parser->last_char_was_space = FALSE;
854 return TRUE;
857 static void
858 finish_span (YelpManParser *parser)
860 xmlNodePtr node;
862 if (parser->accumulator->str[0] != '\0') {
863 node = xmlNewTextChild (parser->sheet_node, NULL,
864 BAD_CAST "span",
865 BAD_CAST parser->accumulator->str);
866 xmlNewProp (node, BAD_CAST "class",
867 BAD_CAST get_font (parser));
868 g_string_truncate (parser->accumulator, 0);
872 static guint
873 dx_to_em_count (YelpManParser *parser, guint dx)
875 return (int)(dx / ((float)parser->char_width));
878 static gboolean
879 parse_N (YelpManParser *parser, GError **error)
881 gint n;
882 gchar tmp[2];
884 if (SSCANF ("N%i", 1, &n)) {
885 RAISE_PARSE_ERROR ("Strange format for N line: %s");
887 if (n > 127) {
888 RAISE_PARSE_ERROR ("N line has non-7-bit character: %s");
890 if (n < -200) {
891 RAISE_PARSE_ERROR ("Bizarrely many nbsps in N line: %s");
894 if (n < 0) {
895 append_nbsps (parser, -n);
896 parser->N_count += -n;
897 return TRUE;
900 parser->N_count++;
902 tmp[0] = (gchar)n;
903 tmp[1] = '\0';
905 return cheeky_call_parse_line (parser, error, 'N', tmp);
908 static void
909 append_nbsps (YelpManParser *parser, guint k)
911 for (; k > 0; k--) {
912 /* 0xc2 0xa0 is nonbreaking space in utf8 */
913 g_string_append_c (parser->accumulator, 0xc2);
914 g_string_append_c (parser->accumulator, 0xa0);
918 static gboolean
919 parse_C (YelpManParser *parser, GError **error)
921 gchar name[16];
922 gunichar code = 0;
923 guint k;
924 gint len;
926 if (SSCANF ("C%16s", 1, name)) {
927 RAISE_PARSE_ERROR ("Can't understand special character: %s");
930 for (k=0; char_translations[k].from; k++) {
931 if (g_str_equal (char_translations[k].from, name)) {
932 code = char_translations[k].to;
933 break;
936 if (sscanf (name, "u%x", &k) == 1) {
937 code = k;
940 if (!code) {
941 g_warning ("Couldn't parse troff special character: '%s'",
942 name);
943 code = 65533; /* Unicode replacement character */
946 /* Output buffer must be length >= 6. 16 >= 6, so we're ok. */
947 len = g_unichar_to_utf8 (code, name);
948 name[len] = '\0';
950 parser->N_count++;
952 return cheeky_call_parse_line (parser, error, 'C', name);
955 static void
956 deal_with_newlines (YelpManParser *parser)
959 If newline is true, this is the first word on a line.
961 In which case, we check to see whether hpos agrees with the
962 current sheet's indent. If so (or if there isn't a sheet yet!),
963 we just add to the accumulator. If not, start a new sheet with
964 the correct indent.
966 If we aren't the first word on the line, just add to the
967 accumulator.
969 gchar tmp[64];
970 guint jump_lines;
971 gboolean made_sheet = FALSE, dont_jump = FALSE;
973 /* This only happens at the start of a section, where there's
974 already a gap
976 if (!parser->sheet_node) {
977 dont_jump = TRUE;
980 if ((!parser->sheet_node) ||
981 (parser->newline && (parser->hpos != parser->sheet_indent))) {
982 new_sheet (parser);
983 made_sheet = TRUE;
986 if (parser->newline) {
987 if ((parser->last_vertical_jump > 0) && (!dont_jump)) {
988 jump_lines =
989 parser->last_vertical_jump/parser->char_height;
990 } else {
991 jump_lines = 1;
994 if (jump_lines > 1) {
995 if (!made_sheet) new_sheet (parser);
996 made_sheet = TRUE;
999 snprintf (tmp, 64, "%u", dx_to_em_count (parser, parser->hpos));
1000 xmlNewProp (parser->sheet_node,
1001 BAD_CAST "indent", BAD_CAST tmp);
1003 if (made_sheet) {
1004 snprintf (tmp, 64, "%u", jump_lines-1);
1005 xmlNewProp (parser->sheet_node,
1006 BAD_CAST "jump", BAD_CAST tmp);
1010 parser->newline = FALSE;
1011 parser->last_vertical_jump = 0;
1014 static gboolean
1015 parse_p (YelpManParser *parser, GError **error)
1017 parser->vpos = 0;
1018 parser->hpos = 0;
1019 return TRUE;
1022 static void
1023 new_sheet (YelpManParser *parser)
1025 /* We don't need to worry about finishing the current sheet,
1026 since the accumulator etc. get cleared on newlines and we
1027 know we're at the start of a line.
1029 parser->sheet_node =
1030 xmlAddChild (parser->section_node,
1031 xmlNewNode (NULL, BAD_CAST "sheet"));
1032 parser->sheet_indent = parser->hpos;
1035 static void
1036 register_title (YelpManParser *parser,
1037 const gchar* name, const gchar* section)
1039 xmlNewTextChild (parser->header,
1040 NULL, BAD_CAST "title", BAD_CAST name);
1041 xmlNewTextChild (parser->header,
1042 NULL, BAD_CAST "section", BAD_CAST section);
/* Removes from the end of dst the longest suffix it shares with src,
 * truncating dst in place. src is not modified.
 *
 * Either string may be empty, in which case dst is left unchanged.
 * If all of dst is a suffix of src, dst becomes the empty string.
 */
static void
right_truncate_common (gchar *dst, const gchar *src)
{
    size_t len_dst, len_src;

    if (dst == NULL || src == NULL)
        return;

    len_dst = strlen (dst);
    len_src = strlen (src);

    /* Walk backwards from both ends while the characters agree. The
     * lengths are checked before indexing so that empty strings never
     * lead to an access in front of the buffer. */
    while (len_dst > 0 && len_src > 0 &&
           dst[len_dst - 1] == src[len_src - 1]) {
        len_dst--;
        len_src--;
    }

    dst[len_dst] = '\0';
}
1066 static gboolean
1067 cheeky_call_parse_line (YelpManParser *parser, GError **error,
1068 gchar first_char, const gchar* text)
1070 /* Do a cunning trick. There's all sorts of code that parse_text
1071 * does, which we don't want to duplicate in parse_N and
1072 * parse_C. So feed a buffer back to parse_text. Tada! Start it
1073 * with "C" or "N" rather than "t" so clever stuff in parse_text
1074 * can tell the difference.
1076 gchar *tmp;
1077 gboolean ret;
1078 guint len = strlen (text);
1080 tmp = parser->buffer;
1081 parser->buffer = g_new (gchar, 2 + len);
1082 parser->buffer[0] = first_char;
1083 strncpy (parser->buffer + 1, text, len + 1);
1085 ret = parse_text (parser, error);
1087 g_free (parser->buffer);
1088 parser->buffer = tmp;
1090 return ret;
1093 static void
1094 cleanup_parsed_page (YelpManParser *parser)
1096 /* First job: the last line usually has the version, date and
1097 * title (again!). The code above misunderstands and parses this
1098 * as a section, so we need to "undo" this and stick the data in
1099 * the header where it belongs.
1101 * parser->section_node should still point to it. We assume this
1102 * has happened if it has exactly one child element (the <title>
1103 * tag)
1105 gchar *lastline;
1106 GRegex *regex;
1107 gchar regex_string [1024];
1109 if (xmlChildElementCount (parser->section_node) == 1) {
1110 lastline = (gchar *)xmlNodeGetContent (parser->section_node);
1112 /* If parse_last_line works, it sets the data from it in the
1113 <header> tag, so delete the final section. */
1114 if (parse_last_line (parser, lastline)) {
1115 xmlUnlinkNode (parser->section_node);
1116 xmlFreeNode (parser->section_node);
1118 else {
1119 /* Oh dear. This would be unexpected and doesn't seem to
1120 happen with man on my system. But we probably shouldn't
1121 ditch the info, so let's leave the <section> tag and
1122 print a warning message to the console.
1124 g_warning ("Unexpected final line in man document (%s)\n",
1125 lastline);
1128 xmlFree (lastline);
1131 /* Next job: Go through and stick the links in. Text that looks
1132 * like man(1) should be converted to a link to man:man(1) and
1133 * urls should also be linkified.
1135 * Unfortunately, it's not entirely clear what constitutes a valid
1136 * section. All sections must be alphanumeric and the logic we use
1137 * to avoid extra hits (eg "one or more widget(s)") is that either
1138 * the section must start with a digit or (if the current section
1139 * doesn't) must start with the same letter as the current
1140 * section.
1142 snprintf (regex_string, 1024,
1143 "([a-zA-Z0-9\\-_.:]+)\\(((%c|[0-9])[a-zA-Z0-9]*)\\)",
1144 parser->section ? parser->section[0] : '0');
1145 regex = g_regex_new (regex_string, 0, 0, NULL);
1146 g_return_if_fail (regex);
1147 fixup_links (parser, regex, man_link_inserter);
1148 g_regex_unref (regex);
1150 /* Now for http:// links.
1152 regex = g_regex_new ("https?:\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+"
1153 "([\\w\\-\\.,@?^=%&:/~\\+#]*"
1154 "[\\w\\-\\@?^=%&/~\\+#])?",
1155 0, 0, NULL);
1156 g_return_if_fail (regex);
1157 fixup_links (parser, regex, http_link_inserter);
1158 g_regex_unref (regex);
1161 static gchar *
1162 skip_whitespace (gchar *text)
1164 while (g_unichar_isspace (g_utf8_get_char (text))) {
1165 text = g_utf8_next_char (text);
1167 return text;
1170 static gchar *
1171 last_non_whitespace (gchar *text)
1173 gchar *end = text + strlen(text);
1174 gchar *prev;
1176 prev = g_utf8_find_prev_char (text, end);
1177 if (!prev) {
1178 /* The string must have been zero-length. */
1179 return NULL;
1182 while (g_unichar_isspace (g_utf8_get_char (prev))) {
1183 end = prev;
1184 prev = g_utf8_find_prev_char (text, prev);
1185 if (!prev) return NULL;
1187 return end;
1190 static gchar *
1191 find_contiguous_whitespace (gchar *text, guint ws_len)
1193 guint counter = 0;
1194 gchar *ws_start;
1195 while (*text) {
1196 if (g_unichar_isspace (g_utf8_get_char (text))) {
1197 if (!counter) ws_start = text;
1198 counter++;
1200 else counter = 0;
1202 if (counter == ws_len) return ws_start;
1204 text = g_utf8_next_char (text);
1206 return NULL;
1209 static gboolean
1210 parse_last_line (YelpManParser *parser, gchar* line)
1212 /* We expect a line of the form
1213 '1.2.3 blah 2009 libfoo(1)'
1214 where the spaces are all nbsp's.
1216 Look for a gap of at least 3 in a row. If we find that, expand
1217 either side and declare the stuff before to be the version
1218 number and then the stuff afterwards to be the start of the
1219 date. Then do the same thing on the next gap, if there is one.
1221 gchar *gap, *date_start;
1223 gchar *version;
1224 gchar *date;
1226 gap = find_contiguous_whitespace (line, 3);
1227 if (!gap) return FALSE;
1229 version = g_strndup (line, gap - line);
1231 date_start = skip_whitespace (gap);
1233 gap = find_contiguous_whitespace (date_start, 3);
1234 if (!gap) return FALSE;
1236 date = g_strndup (date_start, gap - date_start);
1238 xmlNewProp (parser->header, BAD_CAST "version", BAD_CAST version);
1239 xmlNewProp (parser->header, BAD_CAST "date", BAD_CAST date);
1241 g_free (version);
1242 g_free (date);
1244 return TRUE;
1247 /* This should work like g_strstrip, but that's an ASCII-only version
1248 * and I want to strip the nbsp's that I so thoughtfully plaster
1249 * stuff with...
1251 static void
1252 unicode_strstrip (gchar *str)
1254 gchar *start, *end;
1256 if (str == NULL) return;
1258 end = last_non_whitespace (str);
1260 if (!end) {
1261 /* String is zero-length or entirely whitespace */
1262 *str = '\0';
1263 return;
1265 start = skip_whitespace (str);
1267 g_memmove (str, start, end - start);
1268 *(str + (end - start)) = '\0';
1271 static void
1272 sheet_fixup_links (xmlNodePtr sheet,
1273 const GRegex *regex, link_inserter inserter)
1276 This works as follows: grab (<span>) nodes from a sheet in
1277 order and stick their contents into a string. Since a sheet
1278 won't be ludicrously long, we can just grab everything and then
1279 work over it, but we need to keep track of which node points at
1280 which bit of the string so we can call inserter helpfully. To do
1281 so, use byte offsets, since that seems less likely to go
1282 horribly wrong!
1284 GString *accumulator = g_string_new ("");
1285 xmlNodePtr span;
1286 xmlChar *tmp;
1287 gsize offset = 0;
1288 gsize len;
1289 offset_elt_pair pair;
1290 GMatchInfo *match_info;
1292 /* Make pairs zero-terminated so that code can iterate through it
1293 * looking for something with elt = NULL. */
1294 GArray *pairs = g_array_new (TRUE, FALSE,
1295 sizeof (offset_elt_pair));
1297 g_return_if_fail (regex);
1298 g_return_if_fail (inserter);
1299 g_return_if_fail (sheet);
1301 for (span = sheet->children; span != NULL; span = span->next) {
1302 if (span->type != XML_ELEMENT_NODE) continue;
1304 if (strcmp ((const char*) span->name, "span") != 0) {
1306 if (strcmp ((const char*) span->name, "a") == 0)
1307 continue;
1309 if (strcmp ((const char*) span->name, "br") == 0) {
1310 /* If the last character in the accumulator is a
1311 * hyphen, we don't want to include that in the link
1312 * we make. If not, append a newline to the
1313 * accumulator (so we don't mistakenly make links from
1314 * "see\nthis(2)" to seethis(2).
1316 * Either way, we add the <br> to the list of pairs
1317 * since we might need to do stuff with it if it's in
1318 * the middle of a link.
1320 len = strlen (accumulator->str);
1321 if (len > 0 && accumulator->str [len-1] == '-') {
1322 g_string_truncate (accumulator, len - 1);
1323 offset--;
1325 else {
1326 g_string_append_c (accumulator, '\n');
1327 offset++;
1329 pair.start = offset;
1330 pair.end = offset;
1331 pair.elt = span; /* Er, br in fact. */
1332 g_array_append_val (pairs, pair);
1334 continue;
1337 g_warning ("Expected all child elements to be "
1338 "<span>, <br> or <a>, but "
1339 "have found a <%s>.",
1340 (gchar *) span->name);
1341 continue;
1344 tmp = xmlNodeGetContent (span);
1345 g_string_append (accumulator, (gchar *) tmp);
1346 len = strlen ((const char*) tmp);
1348 pair.start = offset;
1349 pair.end = offset + len;
1350 pair.elt = span;
1352 g_array_append_val (pairs, pair);
1354 offset += len;
1355 xmlFree (tmp);
1358 /* We've got the data. Now try to match the regex against it as
1359 * many times as possible
1361 offset = 0;
1362 g_regex_match_full (regex, accumulator->str,
1363 -1, offset, 0, &match_info, NULL);
1364 while (g_match_info_matches (match_info)) {
1365 offset = inserter ((offset_elt_pair *)pairs->data,
1366 match_info);
1368 g_match_info_free (match_info);
1370 g_regex_match_full (regex, accumulator->str,
1371 -1, offset, 0, &match_info, NULL);
1374 g_string_free (accumulator, TRUE);
1375 g_array_unref (pairs);
1378 static void
1379 fixup_links (YelpManParser *parser,
1380 const GRegex *regex, link_inserter inserter)
1382 /* Iterate over all the <sheet>'s in the xml document */
1383 xmlXPathContextPtr context;
1384 xmlXPathObjectPtr path_obj;
1385 xmlNodeSetPtr nodeset;
1386 gint i;
1388 context = xmlXPathNewContext (parser->doc);
1389 g_return_if_fail (context);
1391 path_obj = xmlXPathEvalExpression (BAD_CAST "//sheet", context);
1392 g_return_if_fail (path_obj);
1394 nodeset = path_obj->nodesetval;
1395 g_return_if_fail (nodeset);
1397 for (i = 0; i < nodeset->nodeNr; ++i) {
1398 sheet_fixup_links (nodeset->nodeTab[i], regex, inserter);
1401 xmlXPathFreeObject (path_obj);
1402 xmlXPathFreeContext (context);
1406 This inserts new_child under parent. If older_sibling is non-NULL,
1407 we stick it immediately after it. Otherwise, insert as the first
1408 child of the parent.
1410 Returns the inserted child.
1412 static xmlNodePtr
1413 insert_child_after (xmlNodePtr parent, xmlNodePtr older_sibling,
1414 xmlNodePtr new_child)
1416 g_return_val_if_fail (parent && new_child, new_child);
1418 if (older_sibling) {
1419 xmlAddNextSibling (older_sibling, new_child);
1421 else if (parent->children == NULL) {
1422 xmlAddChild (parent, new_child);
1424 else {
1425 xmlAddPrevSibling (parent->children, new_child);
1428 return new_child;
1431 static void
1432 copy_prop (xmlNodePtr to, xmlNodePtr from, const xmlChar *name)
1434 xmlChar *prop = xmlGetProp (from, name);
1435 g_return_if_fail (prop);
1436 xmlSetProp (to, name, prop);
1437 xmlFree (prop);
1440 static gsize
1441 do_node_replacement (xmlNodePtr anchor_node,
1442 offset_elt_pair *offsets,
1443 gsize startpos, gsize endpos)
1445 xmlNodePtr node, sibling_before;
1446 gchar *gtmp;
1447 xmlChar *xtmp, *xshort;
1448 gsize look_from;
1450 /* Find the first element by searching through offsets. I suppose
1451 * a binary search would be cleverer, but I doubt that this will
1452 * take significant amounts of time.
1454 * We should never fall off the end, but (just in case) the GArray
1455 * that holds the offsets is zero-terminated and elt should never
1456 * be NULL so we can stop if necessary
1458 while ((offsets->end <= startpos) && offsets->elt) {
1459 offsets++;
1461 g_return_val_if_fail (offsets->elt, endpos);
1463 /* xtmp is NULL by default, but we do this here so that if we read
1464 * the node in the if block below, we don't have to do it a second
1465 * time.
1467 xtmp = NULL;
1468 sibling_before = offsets->elt->prev;
1469 look_from = startpos;
1471 /* Maybe there's text in the relevant span before the start of
1472 * the stuff we want to replace with a link.
1474 if (startpos > offsets->start) {
1475 node = xmlNewNode (NULL, BAD_CAST "span");
1476 copy_prop (node, offsets->elt, BAD_CAST "class");
1478 xtmp = xmlNodeGetContent (offsets->elt);
1479 gtmp = g_strndup ((const gchar*)xtmp, startpos - offsets->start);
1480 xmlNodeAddContent (node, BAD_CAST gtmp);
1481 g_free (gtmp);
1483 sibling_before = insert_child_after (offsets->elt->parent,
1484 sibling_before, node);
1487 insert_child_after (offsets->elt->parent,
1488 sibling_before, anchor_node);
1490 /* The main loop. Here we work over each span that overlaps with
1491 * the link we're adding. We add a similar span as a child of the
1492 * anchor node and then delete the existing one. */
1493 while (look_from < endpos) {
1494 if (!xtmp) xtmp = xmlNodeGetContent (offsets->elt);
1496 if (strcmp ((const gchar*)offsets->elt->name, "br") == 0) {
1497 node = xmlNewChild (anchor_node,
1498 NULL, BAD_CAST "br", NULL);
1499 xmlUnlinkNode (offsets->elt);
1500 xmlFreeNode (offsets->elt);
1501 xmlFree (xtmp);
1502 xtmp = NULL;
1503 offsets++;
1505 else if (endpos < offsets->end) {
1506 xshort = BAD_CAST g_strndup ((const gchar*)xtmp,
1507 endpos - offsets->start);
1509 node = xmlNewChild (anchor_node, NULL, BAD_CAST "span",
1510 xshort + (look_from-offsets->start));
1511 copy_prop (node, offsets->elt, BAD_CAST "class");
1513 node = xmlNewNode (NULL, BAD_CAST "span");
1514 xmlNodeAddContent (node,
1515 xtmp + (endpos - offsets->start));
1516 copy_prop (node, offsets->elt, BAD_CAST "class");
1517 xmlAddNextSibling (anchor_node, node);
1519 xmlFree (xshort);
1521 xmlUnlinkNode (offsets->elt);
1522 xmlFreeNode (offsets->elt);
1523 xmlFree (xtmp);
1524 xtmp = NULL;
1526 offsets->start = endpos;
1527 offsets->elt = node;
1529 else {
1530 node = xmlNewChild (anchor_node, NULL, BAD_CAST "span",
1531 xtmp + (look_from - offsets->start));
1532 copy_prop (node, offsets->elt, BAD_CAST "class");
1534 xmlUnlinkNode (offsets->elt);
1535 xmlFreeNode (offsets->elt);
1536 xmlFree (xtmp);
1537 xtmp = NULL;
1538 offsets++;
1541 if (!offsets->elt) {
1542 /* We got to the end of a sheet and of the stuff we're
1543 * doing at the same time
1545 return endpos;
1548 look_from = offsets->start;
1551 return offsets->start;
1554 static gsize
1555 do_link_insertion (const gchar *url,
1556 offset_elt_pair *offsets,
1557 gsize startpos, gsize endpos)
1559 xmlNodePtr anchor_node = xmlNewNode (NULL, BAD_CAST "a");
1561 xmlNewProp (anchor_node, BAD_CAST "href", BAD_CAST url);
1563 return do_node_replacement (anchor_node, offsets,
1564 startpos, endpos);
1567 static gsize
1568 man_link_inserter (offset_elt_pair *offsets,
1569 const GMatchInfo *match_info)
1571 gchar *name, *section;
1572 gchar url[1024];
1574 gint startpos, endpos;
1576 g_match_info_fetch_pos (match_info, 0, &startpos, &endpos);
1578 name = g_match_info_fetch (match_info, 1);
1579 section = g_match_info_fetch (match_info, 2);
1581 g_return_val_if_fail (name && section, endpos);
1583 snprintf (url, 1024, "man:%s(%s)", name, section);
1585 g_free (name);
1586 g_free (section);
1588 return do_link_insertion (url, offsets, startpos, endpos);
1591 static gsize
1592 http_link_inserter (offset_elt_pair *offsets,
1593 const GMatchInfo *match_info)
1595 gchar *url;
1596 gint startpos, endpos;
1597 gsize ret;
1599 url = g_match_info_fetch (match_info, 0);
1600 g_match_info_fetch_pos (match_info, 0, &startpos, &endpos);
1602 ret = do_link_insertion (url, offsets, startpos, endpos);
1604 g_free (url);
1606 return ret;