1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil -*- */
3 * Copyright (C) 2005 Davyd Madeley <davyd@madeley.id.au>
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
20 * Author: Davyd Madeley <davyd@madeley.id.au>
31 #include "yelp-info-parser.h"
32 #include "yelp-magic-decompressor.h"
33 #include "yelp-debug.h"
36 GtkTreeIter
* find_real_top (GtkTreeModel
*model
,
38 GtkTreeIter
* find_real_sibling (GtkTreeModel
*model
,
41 xmlNodePtr
yelp_info_parse_menu (GtkTreeStore
*tree
,
45 gboolean
get_menuoptions (gchar
*line
,
50 gboolean
resolve_frag_id (GtkTreeModel
*model
,
54 void info_process_text_notes (xmlNodePtr
*node
,
60 Used to output the correct <heading level="?" /> tag.
62 static const gchar
* level_headings
[] = { NULL
, "1", "2", "3" };
65 info_image_get_attributes (gchar
const* string
)
67 GMatchInfo
*match_info
;
72 regex
= g_regex_new ("([^\\s][^\\s=]+)=(?:([^\\s \"]+)|(?:\"((?:[^\\\"]|\\\\[\\\\\"])*)\"))", 0, 0, NULL
);
73 g_regex_match (regex
, string
, 0, &match_info
);
74 while (g_match_info_matches (match_info
))
80 h
= g_hash_table_new (g_str_hash
, g_str_equal
);
81 key
= g_match_info_fetch (match_info
, 1);
82 value
= g_match_info_fetch (match_info
, 2);
84 value
= g_match_info_fetch (match_info
, 3);
85 g_hash_table_insert (h
, key
, value
);
86 g_match_info_next (match_info
, NULL
);
88 g_match_info_free (match_info
);
89 g_regex_unref (regex
);
95 info elements look like \0\b[<TAGNAME>\0\b] and take attribute=value
96 pairs, i.e. for image: \0\b[image src="foo.png" \0\b]
/* The two raw marker bytes, as one-character string literals: NUL and backspace. */
98 #define INFO_TAG_0 "\0"
99 #define INFO_TAG_1 "\b"
/* Backspace followed by the bracket that opens or closes an element. */
100 #define INFO_TAG_OPEN_2 INFO_TAG_1 "["
101 #define INFO_TAG_CLOSE_2 INFO_TAG_1 "]"
/* As the two macros before, with the bracket wrapped in a character class so it is literal inside a regex. */
102 #define INFO_TAG_OPEN_2_RE INFO_TAG_1 "[[]"
103 #define INFO_TAG_CLOSE_2_RE INFO_TAG_1 "[]]"
/* NOTE(review): the next four macros concatenate INFO_TAG_0 INFO_TAG_1 with a *_2 macro that itself starts with INFO_TAG_1, so they expand to NUL, backspace, backspace, bracket (two backspaces), while the documented element syntax is \0\b[ with one. Only the first two characters are reliable - confirm before using the rest. */
104 #define INFO_TAG_OPEN INFO_TAG_0 INFO_TAG_1 INFO_TAG_OPEN_2
105 #define INFO_TAG_CLOSE INFO_TAG_0 INFO_TAG_1 INFO_TAG_CLOSE_2
106 #define INFO_TAG_OPEN_RE INFO_TAG_0 INFO_TAG_1 INFO_TAG_OPEN_2_RE
107 #define INFO_TAG_CLOSE_RE INFO_TAG_0 INFO_TAG_1 INFO_TAG_CLOSE_2_RE
108 /* C/glib cannot really handle \0 in strings, so the NUL is converted to '@' */
109 #define INFO_C_TAG_0 "@"
/* '@'-based markers: '@', backspace, bracket (plain and regex-safe forms). */
110 #define INFO_C_TAG_OPEN INFO_C_TAG_0 INFO_TAG_OPEN_2
111 #define INFO_C_TAG_CLOSE INFO_C_TAG_0 INFO_TAG_CLOSE_2
112 #define INFO_C_TAG_OPEN_RE INFO_C_TAG_0 INFO_TAG_OPEN_2_RE
113 #define INFO_C_TAG_CLOSE_RE INFO_C_TAG_0 INFO_TAG_CLOSE_2_RE
/* Opening marker of an image element, as a plain string and as a regex fragment. */
114 #define INFO_C_IMAGE_TAG_OPEN INFO_C_TAG_OPEN "image"
115 #define INFO_C_IMAGE_TAG_OPEN_RE INFO_C_TAG_OPEN_RE "image"
118 info_insert_image (xmlNodePtr parent
, GMatchInfo
*match_info
)
120 GHashTable
*h
= info_image_get_attributes (g_match_info_fetch (match_info
, 1));
123 source
= (gchar
*)g_hash_table_lookup (h
, "src");
125 if (!h
|| !source
|| !*source
)
126 return xmlNewTextChild (parent
, NULL
, BAD_CAST
"para",
127 BAD_CAST
"[broken image]");
129 gchar
*title
= (gchar
*)g_hash_table_lookup (h
, "title");
130 gchar
*text
= (gchar
*)g_hash_table_lookup (h
, "text");
131 gchar
*alt
= (gchar
*)g_hash_table_lookup (h
, "alt");
132 g_hash_table_destroy (h
);
133 xmlNodePtr img
= xmlNewChild (parent
, NULL
, BAD_CAST
"img", NULL
);
134 xmlNewProp (img
, BAD_CAST
"src", BAD_CAST source
);
135 xmlNewProp (img
, BAD_CAST
"title", BAD_CAST (title
? title
: ""));
136 xmlNewProp (img
, BAD_CAST
"text", BAD_CAST (text
? text
: ""));
137 xmlNewProp (img
, BAD_CAST
"alt", BAD_CAST (alt
? alt
: ""));
145 If every element of `str' is `ch' then return TRUE, else FALSE.
148 string_all_char_p (const gchar
* str
, gchar ch
)
150 for (; *str
; str
++) {
151 if (*str
!= ch
) return FALSE
;
157 If `line' is a line of '*', '=' or '-', return 1,2,3 respectively
158 for the heading level. If it's anything else, return 0.
161 header_underline_level (const gchar
* line
)
163 if (*line
!= '*' && *line
!= '=' && *line
!= '-')
166 if (string_all_char_p (line
, '*')) return 1;
167 if (string_all_char_p (line
, '=')) return 2;
168 if (string_all_char_p (line
, '-')) return 3;
174 Use g_strjoinv to join up the strings from `strings', but they might
175 not actually be a null-terminated array. `end' should be strings+n,
176 where I want the first n strings (strings+0, ..., strings+(n-1)). It
177 shouldn't point outside of the array allocated, but it can point at
178 the null string at the end.
181 join_strings_subset (const gchar
*separator
,
182 gchar
** strings
, gchar
** end
)
184 g_assert(end
> strings
);
189 gchar
*glob
= g_strjoinv (separator
, strings
);
195 Create a text node, child of `parent', with the lines strictly
196 between `first' and `last'.
199 lines_subset_text_child (xmlNodePtr parent
, xmlNsPtr ns
,
200 gchar
** first
, gchar
** last
)
202 /* TODO? Currently we're copying the split strings again, which is
203 less efficient than somehow storing lengths and using a sort of
204 window on `content'. But that's much more difficult, so unless
205 there's a problem, let's go with the stupid approach. */
209 glob
= join_strings_subset ("\n", first
, last
);
210 xmlAddChild (parent
, xmlNewText (BAD_CAST glob
));
216 Convert body text CONTENT to xml nodes. This function is responsible
217 for spotting headings etc and splitting them out correctly.
219 paragraph is as described in info_body_text, but cannot be null.
221 If `inline_p' is true, end with a <para1> tag. Otherwise, end with a
224 TODO: It would be nice to add a regex match for *Note: here and call the
225 *Note ==> <a href> logic of info_process_text_notes from here.
228 info_body_parse_text (xmlNodePtr parent
, xmlNodePtr
*paragraph
,
230 gboolean inline_p
, const gchar
*content
)
232 /* The easiest things to spot are headings: they look like a line of
233 * '*','=' or '-', corresponding to heading levels 1,2 or 3. To spot
234 * them, we split content into single lines and work with them. */
235 gchar
**lines
= g_strsplit (content
, "\n", 0);
236 gchar
**first
= lines
, **last
= lines
;
238 xmlNodePtr header_node
;
240 /* Deal with the possibility that `content' is empty */
241 if (*lines
== NULL
) {
243 xmlNewTextChild (parent
, NULL
, BAD_CAST
"para", BAD_CAST
"");
248 /* Use a pair of pointers, first and last, which point to two lines,
249 * the chunk of the body we're displaying (inclusive) */
250 for (; *last
; last
++) {
252 /* Check for a blank line */
253 if (**last
== '\0') {
256 *paragraph
= xmlNewChild (parent
, ns
, BAD_CAST
"para", NULL
);
258 lines_subset_text_child (*paragraph
, ns
, first
, last
);
260 /* On the next iteration, last==first both pointing at the next
268 /* Check for a header */
269 header_level
= header_underline_level (*last
);
271 /* Write out any lines beforehand */
272 lines_subset_text_child (parent
, ns
, first
, last
-1);
273 /* Now write out the actual header line */
274 header_node
= xmlNewTextChild (parent
, ns
, BAD_CAST
"header",
276 xmlNewProp (header_node
, BAD_CAST
"level",
277 BAD_CAST level_headings
[header_level
]);
284 /* Write out any lines left */
286 *paragraph
= xmlNewChild (parent
, ns
, BAD_CAST
"para", NULL
);
288 lines_subset_text_child (*paragraph
, ns
, first
, last
);
294 info_body_text is responsible for taking a hunk of the info page's
295 body and turning it into paragraph tags. It searches out images and
296 marks them up properly if necessary.
298 parent should be the node in which we're currently storing text and
299 paragraph a pointer to a <para> tag or NULL. At blank lines, we
300 finish with the current para tag and switch to a new one.
302 It uses info_body_parse_text to mark up the actual bits of text.
305 info_body_text (xmlNodePtr parent
, xmlNodePtr
*paragraph
, xmlNsPtr ns
,
306 gboolean inline_p
, gchar
const *content
)
308 xmlNodePtr thepara
= NULL
;
309 if (paragraph
== NULL
) paragraph
= &thepara
;
311 if (!strstr (content
, INFO_C_IMAGE_TAG_OPEN
)) {
312 info_body_parse_text (parent
, paragraph
, ns
, inline_p
, content
);
316 gint content_len
= strlen (content
);
318 GRegex
*regex
= g_regex_new ("(" INFO_C_IMAGE_TAG_OPEN_RE
"((?:[^" INFO_TAG_1
"]|[^" INFO_C_TAG_0
"]+" INFO_TAG_1
")*)" INFO_C_TAG_CLOSE_RE
")", 0, 0, NULL
);
319 GMatchInfo
*match_info
;
321 g_regex_match (regex
, content
, 0, &match_info
);
322 while (g_match_info_matches (match_info
))
326 gboolean image_found
= g_match_info_fetch_pos (match_info
, 0,
327 &image_start
, &image_end
);
328 gchar
*before
= g_strndup (&content
[pos
], image_start
- pos
);
330 info_body_parse_text (parent
, paragraph
, NULL
, TRUE
, before
);
333 /* End the paragraph that was before */
337 info_insert_image (parent
, match_info
);
338 g_match_info_next (match_info
, NULL
);
340 gchar
*after
= g_strndup (&content
[pos
], content_len
- pos
);
341 info_body_parse_text (parent
, paragraph
, NULL
, TRUE
, after
);
345 /* Part 1: Parse File Into Tree Store */
356 page_type (char *page
)
358 if (g_ascii_strncasecmp (page
, "Tag Table:\n", 11) == 0)
359 return PAGE_TAG_TABLE
;
360 else if (g_ascii_strncasecmp (page
, "Indirect:\n", 10) == 0)
361 return PAGE_INDIRECT
;
362 else if (g_ascii_strncasecmp (page
, "File:", 5) == 0 ||
363 g_ascii_strncasecmp (page
, "Node:", 5) == 0)
371 *open_info_file (const gchar
*file
)
374 GConverter
*converter
;
375 GFileInputStream
*file_stream
;
376 GInputStream
*stream
;
383 gfile
= g_file_new_for_path (file
);
384 file_stream
= g_file_read (gfile
, NULL
, NULL
);
385 converter
= (GConverter
*) yelp_magic_decompressor_new ();
386 stream
= g_converter_input_stream_new ((GInputStream
*) file_stream
, converter
);
387 string
= g_string_new (NULL
);
389 while ((bytes
= g_input_stream_read (stream
, buf
, 1024, NULL
, NULL
)) > 0)
390 g_string_append_len (string
, buf
, bytes
);
392 g_object_unref (stream
);
396 /* C/glib * cannot really handle \0 in strings, convert. */
397 for (i
= 0; i
< (string
->len
- 1); i
++)
398 if (str
[i
] == INFO_TAG_OPEN
[0] && str
[i
+1] == INFO_TAG_OPEN
[1])
399 str
[i
] = INFO_C_TAG_OPEN
[0];
401 g_string_free (string
, FALSE
);
407 find_info_part (gchar
*part_name
, const gchar
*base
)
409 /* New and improved. We now assume that all parts are
410 * in the same subdirectory as the base file. Makes
411 * life much simpler and is (afaict) always true
415 gchar
*bzfname
, *gzfname
, *lzfd
, *fname
;
417 tmp
= g_strrstr (base
, "/");
418 path
= g_strndup (base
, tmp
-base
);
420 bzfname
= g_strconcat (path
, "/", part_name
, ".bz2", NULL
);
421 gzfname
= g_strconcat (path
, "/", part_name
, ".gz", NULL
);
422 lzfd
= g_strconcat (path
, "/", part_name
, ".lzma", NULL
);
423 fname
= g_strconcat (path
, "/", part_name
, NULL
);
425 if (g_file_test (bzfname
, G_FILE_TEST_EXISTS
))
426 uri
= g_strdup (bzfname
);
427 else if (g_file_test (gzfname
, G_FILE_TEST_EXISTS
))
428 uri
= g_strdup (gzfname
);
429 else if (g_file_test (lzfd
, G_FILE_TEST_EXISTS
))
430 uri
= g_strdup (lzfd
);
431 else if (g_file_test (fname
, G_FILE_TEST_EXISTS
))
432 uri
= g_strdup (fname
);
444 *process_indirect_map (char *page
, const gchar
*file
)
448 char *composite
= NULL
;
449 size_t composite_len
= 0;
451 lines
= g_strsplit (page
, "\n", 0);
454 Go backwards down the list so that we allocate composite
455 big enough the first time around.
457 for (ptr
= lines
+ 1; *ptr
!= NULL
; ptr
++);
458 for (ptr
--; ptr
!= lines
; ptr
--)
467 debug_print (DB_DEBUG
, "Line: %s\n", *ptr
);
468 items
= g_strsplit (*ptr
, ": ", 2);
472 filename
= find_info_part (items
[0], file
);
473 str
= open_info_file (filename
);
478 pages
= g_strsplit (str
, "\x1f", 2);
486 offset
= atoi(items
[1]);
487 plength
= strlen(pages
[1]);
489 debug_print (DB_DEBUG
, "Need to make string %s+%i bytes = %i\n",
493 if (!composite
) /* not yet created, malloc it */
495 composite_len
= offset
+ plength
;
496 composite
= g_malloc (sizeof (char) *
497 (composite_len
+ 1));
498 memset (composite
, '-', composite_len
);
499 composite
[composite_len
] = '\0';
502 /* Because we're going down the list
503 * backwards, plength should always be short
504 * enough to fit in the memory allocated. But
505 * in case something's broken/malicious, we
506 * should check anyway.
508 if (offset
> composite_len
)
510 if (plength
+ offset
+ 1 > composite_len
)
511 plength
= composite_len
- offset
- 1;
513 composite
[offset
] = '\x1f';
514 memcpy (composite
+ offset
+ 1, pages
[1], plength
);
529 Open up the relevant info file and read it all into memory. If there
530 is an indirect table thingy, we resolve that as we go.
532 Returns a NULL-terminated list of pointers to pages on success and
536 expanded_info_file (const gchar
*file
)
538 gchar
*slurp
= open_info_file (file
);
542 if (!slurp
) return NULL
;
544 /* TODO: There's a lot of copying of bits of memory here. With a bit
545 * more effort we could avoid it. Either we should fix this or
546 * measure the time taken and decide it's irrelevant...
548 * Note: \x1f\n is ^_\n
550 page_list
= g_strsplit (slurp
, "\x1f\n", 0);
554 for (page
= page_list
; *page
!= NULL
; page
++) {
555 if (page_type (*page
) == PAGE_INDIRECT
) {
557 slurp
= process_indirect_map (*page
, file
);
558 g_strfreev (page_list
);
563 page_list
= g_strsplit (slurp
, "\x1f\n", 0);
/*
  Extract the value that follows `key' inside `source'. For example,
  with the key "Node: " the string "Node: blah," yields "blah".

  The value runs from just after the first occurrence of `key' up to,
  but not including, the first character that appears in `end'. If no
  such character exists it runs to the end of `source'.

  If a character from `cancel' occurs before that stopping point, or
  `key' does not occur at all, NULL is returned. `cancel' may be NULL,
  which disables the cancel test.

  The result is a newly allocated string owned by the caller.
 */
static char*
get_value_after_ext (const char *source, const char *key,
                     const char *end, const char *cancel)
{
  const char *value;
  size_t value_len;

  value = strstr (source, key);
  if (value == NULL)
    return NULL;
  value += strlen (key);

  value_len = strcspn (value, end);

  /* A cancel character strictly before the stopping point aborts. */
  if (cancel != NULL && strcspn (value, cancel) < value_len)
    return NULL;

  return g_strndup (value, value_len);
}
/*
  Return a copy of the text that follows `key' in `source', up to the
  next comma. Gives NULL when `key' is absent or when a newline or a
  \x7f character comes before the comma.
 */
static char*
get_value_after (const char* source, const char *key)
{
  static const char stop_chars[] = ",";
  static const char cancel_chars[] = "\n\x7f";

  return get_value_after_ext (source, key, stop_chars, cancel_chars);
}
612 node2page (GHashTable
*nodes2pages
, char *node
)
616 if (g_hash_table_lookup_extended (nodes2pages
, node
,
618 return GPOINTER_TO_INT(p
);
620 /* This shouldn't happen: we should only ever have to look up pages
622 g_return_val_if_reached (0);
626 *node2iter (GHashTable
*nodes2iters
, char *node
)
630 iter
= g_hash_table_lookup (nodes2iters
, node
);
631 d (if (!iter
) debug_print (DB_WARN
, "Could not retrieve iter for node !%s!\n", node
));
636 *find_real_top (GtkTreeModel
*model
, GtkTreeIter
*it
)
638 GtkTreeIter
*r
= NULL
;
639 GtkTreeIter
*tmp
= NULL
;
644 r
= gtk_tree_iter_copy (it
);
645 tmp
= g_malloc0 (sizeof (GtkTreeIter
));
646 while (gtk_tree_model_iter_parent (model
, tmp
, r
)) {
647 gtk_tree_iter_free (r
);
648 r
= gtk_tree_iter_copy (tmp
);
655 GtkTreeIter
* find_real_sibling (GtkTreeModel
*model
,
656 GtkTreeIter
*it
, GtkTreeIter
*comp
)
659 GtkTreeIter
*tmp
= NULL
;
660 gboolean result
= FALSE
;
668 r
= gtk_tree_iter_copy (it
);
669 tmp
= gtk_tree_iter_copy (it
);
671 reftitle
= gtk_tree_model_get_string_from_iter (model
, comp
);
673 result
= gtk_tree_model_iter_parent (model
, r
, it
);
677 title
= gtk_tree_model_get_string_from_iter (model
, r
);
679 while (!g_str_equal (title
, reftitle
) && result
) {
680 gtk_tree_iter_free (tmp
);
681 tmp
= gtk_tree_iter_copy (r
);
682 result
= gtk_tree_model_iter_parent (model
, r
, tmp
);
684 title
= gtk_tree_model_get_string_from_iter (model
, r
);
687 if (!g_str_equal (title
, reftitle
))
689 gtk_tree_iter_free (tmp
);
693 gtk_tree_iter_free (r
);
701 process_page (GtkTreeStore
*tree
,
702 GHashTable
*nodes2pages
, GHashTable
*nodes2iters
,
703 int *processed_table
, char **page_list
, char *page_text
)
716 /* split out the header line and the text */
717 parts
= g_strsplit (page_text
, "\n", 3);
719 node
= get_value_after (parts
[0], "Node: ");
720 up
= get_value_after (parts
[0], "Up: ");
721 prev
= get_value_after (parts
[0], "Prev: ");
722 next
= get_value_after (parts
[0], "Next: ");
724 if (next
&& g_str_equal (next
, "Top")) {
728 if (g_str_equal (node
, "Top") && prev
!= NULL
) {
733 /* check to see if this page has been processed already */
734 page
= node2page (nodes2pages
, node
);
735 if (processed_table
[page
]) {
738 processed_table
[page
] = 1;
740 debug_print (DB_DEBUG
, "-- Processing Page %s\n\tParent: %s\n", node
, up
);
742 iter
= g_slice_alloc0 (sizeof (GtkTreeIter
));
743 /* check to see if we need to process our parent and siblings */
744 if (up
&& g_ascii_strncasecmp (up
, "(dir)", 5) && strcmp (up
, "Top"))
746 page
= node2page (nodes2pages
, up
);
747 if (!processed_table
[page
])
749 debug_print (DB_DEBUG
, "%% Processing Node %s\n", up
);
750 process_page (tree
, nodes2pages
,
751 nodes2iters
, processed_table
, page_list
,
755 if (prev
&& g_ascii_strncasecmp (prev
, "(dir)", 5))
757 if (strncmp (node
, "Top", 3)) {
758 /* Special case the Top node to always appear first */
760 page
= node2page (nodes2pages
, prev
);
761 if (!processed_table
[page
])
763 debug_print (DB_DEBUG
, "%% Processing Node %s\n", prev
);
764 process_page (tree
, nodes2pages
,
765 nodes2iters
, processed_table
, page_list
,
771 /* by this point our parent and older sibling should be processed */
772 if (!up
|| !g_ascii_strcasecmp (up
, "(dir)"))
774 debug_print (DB_DEBUG
, "\t> no parent\n");
775 if (!prev
|| !g_ascii_strcasecmp (prev
, "(dir)"))
777 debug_print (DB_DEBUG
, "\t> no previous\n");
778 gtk_tree_store_append (tree
, iter
, NULL
);
782 real
= find_real_top (GTK_TREE_MODEL (tree
),
783 node2iter (nodes2iters
, prev
));
785 gtk_tree_store_insert_after (tree
, iter
, NULL
,
787 gtk_tree_iter_free (real
);
790 gtk_tree_store_append (tree
, iter
, NULL
);
793 else if (!prev
|| !g_ascii_strcasecmp (prev
, "(dir)") || !strcmp (prev
, up
))
795 debug_print (DB_DEBUG
, "\t> no previous\n");
796 gtk_tree_store_append (tree
, iter
,
797 node2iter (nodes2iters
, up
));
801 GtkTreeIter
*upit
= node2iter (nodes2iters
, up
);
802 GtkTreeIter
*previt
= node2iter (nodes2iters
, prev
);
803 GtkTreeIter
*nit
= NULL
;
804 debug_print (DB_DEBUG
, "+++ Parent: %s Previous: %s\n", up
, prev
);
806 d (if (upit
) debug_print (DB_DEBUG
, "++++ Have parent node!\n"));
807 d (if (previt
) debug_print (DB_DEBUG
, "++++ Have previous node!\n"));
808 nit
= find_real_sibling (GTK_TREE_MODEL (tree
), previt
, upit
);
810 gtk_tree_store_insert_after (tree
, iter
,
813 gtk_tree_iter_free (nit
);
816 gtk_tree_store_append (tree
, iter
, upit
);
820 debug_print (DB_DEBUG
, "# node %s was not put in tree\n", node
);
824 d (if (iter
) debug_print (DB_DEBUG
, "Have a valid iter, storing for %s\n", node
));
826 g_hash_table_insert (nodes2iters
, g_strdup (node
), iter
);
827 debug_print (DB_DEBUG
, "size: %i\n", g_hash_table_size (nodes2iters
));
829 /*tmp = g_strdup_printf ("%i",
830 node2page (nodes2pages, node));*/
831 tmp
= g_strdup (node
);
832 tmp
= g_strdelimit (tmp
, " ", '_');
833 gtk_tree_store_set (tree
, iter
,
834 INFO_PARSER_COLUMN_PAGE_NO
, tmp
,
835 INFO_PARSER_COLUMN_PAGE_NAME
, node
,
836 INFO_PARSER_COLUMN_PAGE_CONTENT
, parts
[2],
848 GHashTable
*nodes2pages
; /* Build this... */
849 GHashTable
*pages2nodes
; /* ... using this. */
853 use_offset2page (gpointer o
, gpointer p
, gpointer ud
)
855 struct TagTableFix
* ttf
= (struct TagTableFix
*)ud
;
857 const gchar
* node
= g_hash_table_lookup (ttf
->pages2nodes
, p
);
859 g_hash_table_insert (ttf
->nodes2pages
, g_strdup (node
), p
);
864 We had a nodes2offsets hash table, but sometimes these things
865 lie. How terribly rude. Anyway, use offsets2pages and pages2nodes
866 (and injectivity!) to construct the nodes2pages hash table.
869 make_nodes2pages (GHashTable
* offsets2pages
,
870 GHashTable
* pages2nodes
)
872 struct TagTableFix ttf
;
875 g_hash_table_new_full (g_str_hash
, g_str_equal
, g_free
, NULL
);
876 ttf
.pages2nodes
= pages2nodes
;
878 g_hash_table_foreach (offsets2pages
, use_offset2page
, &ttf
);
880 return ttf
.nodes2pages
;
884 * Parse file into a GtkTreeStore containing useful information that we can
885 * later convert into a nice XML document or something else.
888 *yelp_info_parser_parse_file (char *file
)
894 GHashTable
*offsets2pages
= NULL
;
895 GHashTable
*pages2nodes
= NULL
;
896 GHashTable
*nodes2pages
= NULL
;
897 GHashTable
*nodes2iters
= NULL
;
898 int *processed_table
;
902 page_list
= expanded_info_file (file
);
909 offsets2pages
= g_hash_table_new_full (g_str_hash
, g_str_equal
, g_free
,
911 pages2nodes
= g_hash_table_new_full (g_direct_hash
, g_direct_equal
, NULL
,
914 for (ptr
= page_list
; *ptr
!= NULL
; ptr
++)
918 g_hash_table_insert (offsets2pages
,
919 g_strdup_printf ("%i", offset
),
920 GINT_TO_POINTER (pages
));
922 name
= get_value_after (*ptr
, "Node: ");
924 g_hash_table_insert (pages2nodes
,
925 GINT_TO_POINTER (pages
), name
);
927 offset
+= strlen (*ptr
);
928 if (pages
) offset
+= 2;
931 pt
= page_type (*ptr
);
932 if (pt
== PAGE_INDIRECT
) {
933 g_warning ("Found an indirect page in a file "
934 "we thought we'd expanded.");
938 /* Now consolidate (and correct) the two hash tables */
939 nodes2pages
= make_nodes2pages (offsets2pages
, pages2nodes
);
941 g_hash_table_destroy (offsets2pages
);
942 g_hash_table_destroy (pages2nodes
);
944 processed_table
= g_malloc0 (pages
* sizeof (int));
945 tree
= gtk_tree_store_new (INFO_PARSER_N_COLUMNS
, G_TYPE_STRING
, G_TYPE_STRING
,
947 nodes2iters
= g_hash_table_new_full (g_str_hash
, g_str_equal
, g_free
,
948 (GDestroyNotify
) gtk_tree_iter_free
);
951 for (ptr
= page_list
; *ptr
!= NULL
; ptr
++)
953 if (page_type (*ptr
) != PAGE_NODE
) continue;
954 process_page (tree
, nodes2pages
, nodes2iters
,
955 processed_table
, page_list
, *ptr
);
958 g_strfreev (page_list
);
960 g_hash_table_destroy (nodes2iters
);
961 g_hash_table_destroy (nodes2pages
);
963 g_free (processed_table
);
969 /* Part 2: Parse Tree into XML */
971 parse_tree_level (GtkTreeStore
*tree
, xmlNodePtr
*node
, GtkTreeIter iter
)
973 GtkTreeIter children
, parent
;
976 char *page_no
= NULL
;
977 char *page_name
= NULL
;
978 char *page_content
= NULL
;
979 gboolean notes
= FALSE
;
981 debug_print (DB_DEBUG
, "Decended\n");
984 gtk_tree_model_get (GTK_TREE_MODEL (tree
), &iter
,
985 INFO_PARSER_COLUMN_PAGE_NO
, &page_no
,
986 INFO_PARSER_COLUMN_PAGE_NAME
, &page_name
,
987 INFO_PARSER_COLUMN_PAGE_CONTENT
, &page_content
,
989 debug_print (DB_DEBUG
, "Got Section: %s\n", page_name
);
990 if (strstr (page_content
, "*Note") ||
991 strstr (page_content
, "*note")) {
994 if (strstr (page_content
, "* Menu:")) {
995 newnode
= yelp_info_parse_menu (tree
, node
, page_content
, notes
);
997 newnode
= xmlNewTextChild (*node
, NULL
,
1001 info_body_text (newnode
, NULL
, NULL
, FALSE
, page_content
);
1004 /* Handle notes here */
1005 info_process_text_notes (&newnode
, page_content
, tree
);
1008 /* if we free the page content, now it's in the XML, we can
1009 * save some memory */
1010 g_free (page_content
);
1011 page_content
= NULL
;
1013 if (gtk_tree_model_iter_parent (GTK_TREE_MODEL (tree
), &parent
, &iter
)) {
1015 gtk_tree_model_get (GTK_TREE_MODEL (tree
), &parent
,
1016 INFO_PARSER_COLUMN_PAGE_NO
, &parent_id
,
1018 xmlNewProp (newnode
, BAD_CAST
"up", BAD_CAST parent_id
);
1022 xmlNewProp (newnode
, BAD_CAST
"id",
1024 xmlNewProp (newnode
, BAD_CAST
"name",
1025 BAD_CAST page_name
);
1026 if (gtk_tree_model_iter_children (GTK_TREE_MODEL (tree
),
1029 parse_tree_level (tree
, &newnode
, children
);
1033 while (gtk_tree_model_iter_next (GTK_TREE_MODEL (tree
), &iter
));
1034 debug_print (DB_DEBUG
, "Ascending\n");
1038 yelp_info_parser_parse_tree (GtkTreeStore
*tree
)
1049 doc
= xmlNewDoc (BAD_CAST
"1.0");
1050 node
= xmlNewNode (NULL
, BAD_CAST
"Info");
1051 xmlDocSetRootElement (doc
, node
);
1053 /* functions I will want:
1054 gtk_tree_model_get_iter_first;
1055 gtk_tree_model_iter_next;
1056 gtk_tree_model_iter_children;
1059 if (gtk_tree_model_get_iter_first (GTK_TREE_MODEL (tree
), &iter
))
1060 parse_tree_level (tree
, &node
, iter
);
1061 d (else debug_print (DB_DEBUG
, "Empty tree?\n"));
1064 xmlDocDumpFormatMemory (doc, &xmlbuf, &bufsiz, 1);
1065 g_print ("XML follows:\n%s\n", xmlbuf);
1072 resolve_frag_id (GtkTreeModel
*model
, GtkTreePath
*path
, GtkTreeIter
*iter
,
1075 gchar
*page_no
= NULL
;
1076 gchar
*page_name
= NULL
;
1077 gchar
**xref
= data
;
1079 gtk_tree_model_get (GTK_TREE_MODEL (model
), iter
,
1080 INFO_PARSER_COLUMN_PAGE_NO
, &page_no
,
1081 INFO_PARSER_COLUMN_PAGE_NAME
, &page_name
,
1083 if (g_str_equal (page_name
, *xref
)) {
1085 *xref
= g_strdup (page_name
);
1086 *xref
= g_strdelimit (*xref
, " ", '_');
1099 get_menuoptions (gchar
*line
, gchar
**title
, gchar
**ref
, gchar
**desc
,
1102 /* Since info is actually braindead and allows .s in
1103 * its references, we gotta carefully extract things
1104 * as .s can be in either the title or desc
1107 gchar
*tfind
= NULL
;
1109 if (!g_str_has_prefix (line
, "* "))
1112 tfind
= strchr (tmp
, ':');
1114 if (!tfind
) /* No : on the line, bail out */
1117 (*title
) = g_strndup (tmp
, tfind
-tmp
);
1119 if (tfind
[1] == ':') { /* This happens if the title and ref are the same
1120 * Most menus are of this type
1123 (*ref
) = NULL
; /* There is no second part. The rest is description */
1126 (*xref
) = g_strndup (tmp
, tfind
-tmp
);
1130 (*desc
) = g_strdup (tfind
);
1131 } else { /* The other type of menu option */
1135 td
= strchr (tfind
, '.');
1138 (*ref
) = g_strndup (tfind
, td
-tfind
);
1139 (*xref
) = g_strdup (*ref
);
1143 (*desc
) = g_strdup (td
);
1148 /* Find the first non whitespace character in str or return pointer to the
1149 * '\0' if there isn't one. */
1151 first_non_space (gchar
* str
)
1153 /* As long as str is null terminated, this is ok! */
1154 while (g_ascii_isspace (*str
)) str
++;
1159 yelp_info_parse_menu (GtkTreeStore
*tree
, xmlNodePtr
*node
,
1160 gchar
*page_content
, gboolean notes
)
1165 xmlNodePtr newnode
, menu_node
, mholder
= NULL
;
1168 split
= g_strsplit (page_content
, "* Menu:", 2);
1170 newnode
= xmlNewChild (*node
, NULL
,
1171 BAD_CAST
"Section", NULL
);
1175 info_body_text (newnode
, NULL
, NULL
, FALSE
, split
[0]);
1177 info_process_text_notes (&newnode
, split
[0], tree
);
1180 menuitems
= g_strsplit (split
[1], "\n", -1);
1183 /* The output xml should look something like the following:
1187 <a href="xref:Help-Inv">Help-Inv</a>
1188 <para1>Invisible text in Emacs Info.</para1>
1191 <a href="xref:Help-M">Help-M</a>
1192 <para1>Menus.</para1>
1197 (from the top page of info:info). Note the absence of *'s and
1200 If there's a line with no "* Blah::", it looks like a child of
1201 the previous menu item so (for i > 0) deal with that correctly by
1202 not "closing" the <menuholder> tag until we find the next
1206 if (menuitems
[0] != NULL
) {
1207 /* If there are any menu items, make the <menu> node */
1208 menu_node
= xmlNewChild (newnode
, NULL
, BAD_CAST
"menu", NULL
);
1211 while (menuitems
[i
] != NULL
) {
1212 gboolean menu
= FALSE
;
1213 gchar
*title
= NULL
;
1217 gchar
*link_text
= NULL
;
1220 menu
= get_menuoptions (menuitems
[i
], &title
, &ref
, &desc
, &xref
);
1222 if (menu
&& (*title
== '\0' || *(title
+ 1) == '\0')) {
1223 g_warning ("Info title unexpectedly short for menu item (%s)",
1229 mholder
= xmlNewChild (menu_node
, NULL
, BAD_CAST
"menuholder", NULL
);
1230 gtk_tree_model_foreach (GTK_TREE_MODEL (tree
), resolve_frag_id
, &xref
);
1232 if (ref
== NULL
) { /* A standard type menu */
1233 /* title+2 skips the "* ". We know we haven't jumped over the
1234 end of the string because strlen (title) >= 3 */
1235 link_text
= g_strdup (title
+2);
1237 ref1
= xmlNewTextChild (mholder
, NULL
, BAD_CAST
"a",
1238 BAD_CAST link_text
);
1240 tmp
= g_strconcat ("xref:", xref
, NULL
);
1241 xmlNewProp (ref1
, BAD_CAST
"href", BAD_CAST tmp
);
1243 } else { /* Indexy type menu - we gotta do a little work to fix the
1246 gchar
*spacing
= ref
;
1250 while (*spacing
==' ') {
1254 sp
= g_strndup (ref
, c
);
1256 link_text
= g_strdup (title
);
1258 ref1
= xmlNewTextChild (mholder
, NULL
, BAD_CAST
"a",
1259 BAD_CAST link_text
);
1260 tmp
= g_strconcat ("xref:", xref
, NULL
);
1261 xmlNewProp (ref1
, BAD_CAST
"href", BAD_CAST tmp
);
1263 xmlNewTextChild (mholder
, NULL
, BAD_CAST
"spacing",
1265 tmp
= g_strconcat (g_strstrip(ref
), ".", NULL
);
1266 ref1
= xmlNewTextChild (mholder
, NULL
, BAD_CAST
"a",
1269 tmp
= g_strconcat ("xref:", xref
, NULL
);
1270 xmlNewProp (ref1
, BAD_CAST
"href", BAD_CAST tmp
);
1276 tmp
= g_strconcat ("\n", first_non_space (desc
), NULL
);
1279 Don't print the link text a second time, because that looks
1282 We don't do a straight check for equality because lots of
1283 .info files have something like
1287 Obviously if the longer explanation has more afterwards, we
1288 don't want to omit it, which is why there's the strlen test.
1290 if (strncmp (link_text
, tmp
+ 1, strlen (link_text
)) ||
1291 strlen (link_text
) + 1 < strlen (tmp
+ 1)) {
1292 xmlNewTextChild (mholder
, NULL
,
1293 BAD_CAST
"para1", BAD_CAST tmp
);
1299 else if (*(menuitems
[i
]) != '\0') {
1300 tmp
= g_strconcat ("\n", first_non_space (menuitems
[i
]), NULL
);
1301 xmlNewTextChild (mholder
? mholder
: menu_node
,
1302 NULL
, BAD_CAST
"para1",
1313 g_strfreev (menuitems
);
1319 info_process_text_notes (xmlNodePtr
*node
, gchar
*content
, GtkTreeStore
*tree
)
1324 xmlNodePtr paragraph
= NULL
;
1325 gboolean first
= TRUE
;
1328 Split using the regular expression
1332 which deals with either case and the last bit is a lookahead so
1333 that we don't split on things of the form *Note:_, which aren't
1336 notes
= g_regex_split_simple ("\\*[Nn]ote(?!_)", content
, 0, 0);
1338 for (current
= notes
; *current
!= NULL
; current
++) {
1339 gchar
*url
, **urls
, **ulink
;
1341 gchar
*alt_append
, *alt_append1
;
1344 gchar
*break_point
= NULL
;
1345 gboolean broken
= FALSE
;
1347 /* The first node is special. It doesn't have a note ref at the
1348 * start, so we can just add it and forget about it.
1351 info_body_text (*node
, ¶graph
, NULL
, TRUE
, (*current
));
1355 /* If we got to here, we now gotta parse the note reference */
1356 append
= strchr (*current
, ':');
1358 info_body_text (*node
, ¶graph
, NULL
, TRUE
, *current
);
1362 alt_append
= append
;
1363 alt_append1
= alt_append
;
1364 append
= strchr (append
, ':');
1365 alt_append
= strchr (alt_append
, '.');
1366 if (alt_append
&& g_str_has_prefix (alt_append
, ".info")) {
1369 alt_append
= strchr (alt_append
, '.');
1371 alt_append1
= strchr (alt_append1
, ',');
1372 if (!append
&& !alt_append
&& !alt_append1
) {
1373 info_body_text (*node
, ¶graph
, NULL
, TRUE
, *current
);
1376 if (!append
|| alt_append
|| alt_append1
) {
1378 if (alt_append
) append
= alt_append
;
1379 else append
= alt_append1
;
1381 if ((alt_append
&& alt_append
< append
))
1382 append
= alt_append
;
1383 if (alt_append1
&& alt_append1
< append
)
1384 append
= alt_append1
;
1387 url
= g_strndup (*current
, append
- (*current
));
1389 /* Save a copy of the unadulterated link text for later. */
1390 link_text
= g_strconcat ("*Note", url
, NULL
);
1392 /* By now, we got 2 things. First, is append which is the (hopefully)
1393 * non-link text. Second, we got a url.
1394 * The url can be in several forms:
1396 * 2. linkend:(infofile)Linkend.
1397 * 3. Title: Linkend.
1398 * 4. Title: Linkend, (pretty sure this is just broken)
1399 * 5. Title: (infofile.info)Linkend.
1400 * All possibilities should have been picked up.
1402 * Clean up the split. Should be left with a real url and
1403 * a list of fragments that should be linked
1404 * Also goes through and removes extra spaces, leaving only one
1405 * space in place of many
1407 urls
= g_strsplit (url
, "\n", -1);
1408 break_point
= strchr (url
, '\n');
1409 while (break_point
) {
1411 break_point
= strchr (++break_point
, '\n');
1413 break_point
= strchr (url
, ' ');
1414 while (break_point
) {
1415 if (*(break_point
+1) == ' ') {
1416 /* Massive space. Fix. */
1417 gchar
*next
= break_point
;
1420 while (*next
== ' ')
1423 url_copy
= g_strndup (url
, break_point
-url
);
1424 url
= g_strconcat (url_copy
, next
, NULL
);
1426 break_point
= strchr (url
, ' ');
1430 break_point
= strchr (break_point
, ' ');
1433 if (url
[strlen(url
)-1] == '.') { /* The 2nd or 3rd sort of link */
1437 stop
= strchr (url
, ':');
1438 lurl
= strchr (stop
, '(');
1439 if (!lurl
) { /* 3rd type of link */
1443 link
= g_strdup (stop
);
1444 link
= g_strstrip (link
);
1445 length
= strlen (link
) - 1;
1446 link
[length
] = '\0';
1447 href
= g_strconcat ("xref:", link
, NULL
);
1452 } else { /* 2nd type of link. Easy. Provided .info is neglected ;) */
1458 new_url
= g_strdup (lurl
);
1459 info
= strstr (new_url
, ".info)");
1460 stripped
= g_strndup (new_url
, info
-new_url
);
1462 lurl
= g_strconcat (stripped
, info
, NULL
);
1466 zloc
= &(lurl
[strlen(lurl
)-1]);
1468 href
= g_strconcat ("info:", lurl
, NULL
);
1471 } else { /* First kind of link */
1475 tmp1
= strchr (url
, ':');
1477 frag
= g_strdup (url
);
1479 frag
= g_strndup (url
, tmp1
- url
);
1481 gtk_tree_model_foreach (GTK_TREE_MODEL (tree
), resolve_frag_id
, &frag
);
1482 href
= g_strconcat ("xref:", frag
, NULL
);
1486 /* Check we've got a valid paragraph node */
1488 paragraph
= xmlNewChild (*node
, NULL
, BAD_CAST
"para", NULL
);
1492 Now we're supposed to actually render the link. I have a list of
1493 bits of URL and actually this is really easy - I want to have
1494 the link *text* exactly the same as it appeared in the .info
1495 file, so don't use the list of strings urls, instead use the
1496 whole lot: url (complete with embedded newlines etc.)
1498 ref1
= xmlNewTextChild (paragraph
, NULL
, BAD_CAST
"a",
1499 BAD_CAST link_text
);
1501 xmlNewProp (ref1
, BAD_CAST
"href", BAD_CAST href
);
1505 /* Finally, we can add the following text as required */
1506 info_body_text (*node
, ¶graph
, NULL
, TRUE
, append
);