1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil -*- */
3 * Copyright (C) 2005 Davyd Madeley <davyd@madeley.id.au>
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
18 * Author: Davyd Madeley <davyd@madeley.id.au>
29 #include "yelp-info-parser.h"
30 #include "yelp-magic-decompressor.h"
31 #include "yelp-debug.h"
34 static GtkTreeIter
* find_real_top (GtkTreeModel
*model
,
36 static GtkTreeIter
* find_real_sibling (GtkTreeModel
*model
,
39 static xmlNodePtr
yelp_info_parse_menu (GtkTreeStore
*tree
,
43 static gboolean
get_menuoptions (gchar
*line
,
48 static gboolean
resolve_frag_id (GtkTreeModel
*model
,
52 static void info_process_text_notes (xmlNodePtr
*node
,
58 Used to output the correct <heading level="?" /> tag.
60 static const gchar
* level_headings
[] = { NULL
, "1", "2", "3" };
63 info_image_get_attributes (gchar
const* string
)
65 GMatchInfo
*match_info
;
70 regex
= g_regex_new ("([^\\s][^\\s=]+)=(?:([^\\s \"]+)|(?:\"((?:[^\\\"]|\\\\[\\\\\"])*)\"))", 0, 0, NULL
);
71 g_regex_match (regex
, string
, 0, &match_info
);
72 while (g_match_info_matches (match_info
))
78 h
= g_hash_table_new (g_str_hash
, g_str_equal
);
79 key
= g_match_info_fetch (match_info
, 1);
80 value
= g_match_info_fetch (match_info
, 2);
82 value
= g_match_info_fetch (match_info
, 3);
83 g_hash_table_insert (h
, key
, value
);
84 g_match_info_next (match_info
, NULL
);
86 g_match_info_free (match_info
);
87 g_regex_unref (regex
);
93 info elements look like \0\b[<TAGNAME>\0\b] and take attribute=value
94 pairs, i.e. for image: \0\b[image src="foo.png" \0\b]
96 #define INFO_TAG_0 "\0"
97 #define INFO_TAG_1 "\b"
98 #define INFO_TAG_OPEN_2 INFO_TAG_1 "["
99 #define INFO_TAG_CLOSE_2 INFO_TAG_1 "]"
100 #define INFO_TAG_OPEN_2_RE INFO_TAG_1 "[[]"
101 #define INFO_TAG_CLOSE_2_RE INFO_TAG_1 "[]]"
102 #define INFO_TAG_OPEN INFO_TAG_0 INFO_TAG_1 INFO_TAG_OPEN_2
103 #define INFO_TAG_CLOSE INFO_TAG_0 INFO_TAG_1 INFO_TAG_CLOSE_2
104 #define INFO_TAG_OPEN_RE INFO_TAG_0 INFO_TAG_1 INFO_TAG_OPEN_2_RE
105 #define INFO_TAG_CLOSE_RE INFO_TAG_0 INFO_TAG_1 INFO_TAG_CLOSE_2_RE
106 /* C/GLib string functions cannot handle an embedded \0, so it is converted to '@' */
107 #define INFO_C_TAG_0 "@"
108 #define INFO_C_TAG_OPEN INFO_C_TAG_0 INFO_TAG_OPEN_2
109 #define INFO_C_TAG_CLOSE INFO_C_TAG_0 INFO_TAG_CLOSE_2
110 #define INFO_C_TAG_OPEN_RE INFO_C_TAG_0 INFO_TAG_OPEN_2_RE
111 #define INFO_C_TAG_CLOSE_RE INFO_C_TAG_0 INFO_TAG_CLOSE_2_RE
112 #define INFO_C_IMAGE_TAG_OPEN INFO_C_TAG_OPEN "image"
113 #define INFO_C_IMAGE_TAG_OPEN_RE INFO_C_TAG_OPEN_RE "image"
116 info_insert_image (xmlNodePtr parent
, GMatchInfo
*match_info
)
122 GHashTable
*h
= info_image_get_attributes (g_match_info_fetch (match_info
, 1));
125 source
= (gchar
*)g_hash_table_lookup (h
, "src");
127 if (!h
|| !source
|| !*source
)
128 return xmlNewTextChild (parent
, NULL
, BAD_CAST
"para",
129 BAD_CAST
"[broken image]");
131 title
= (gchar
*)g_hash_table_lookup (h
, "title");
132 text
= (gchar
*)g_hash_table_lookup (h
, "text");
133 alt
= (gchar
*)g_hash_table_lookup (h
, "alt");
134 g_hash_table_destroy (h
);
135 img
= xmlNewChild (parent
, NULL
, BAD_CAST
"img", NULL
);
136 xmlNewProp (img
, BAD_CAST
"src", BAD_CAST source
);
137 xmlNewProp (img
, BAD_CAST
"title", BAD_CAST (title
? title
: ""));
138 xmlNewProp (img
, BAD_CAST
"text", BAD_CAST (text
? text
: ""));
139 xmlNewProp (img
, BAD_CAST
"alt", BAD_CAST (alt
? alt
: ""));
/*
  If every element of `str' is `ch' then return TRUE, else FALSE.

  `str' must be NUL-terminated. The empty string has no element that
  differs from `ch', so it yields TRUE.
 */
static gboolean
string_all_char_p (const gchar *str, gchar ch)
{
  for (; *str; str++) {
    if (*str != ch)
      return FALSE;
  }

  return TRUE;
}
159 If `line' is a line of '*', '=' or '-', return 1,2,3 respectively
160 for the heading level. If it's anything else, return 0.
163 header_underline_level (const gchar
* line
)
165 if (*line
!= '*' && *line
!= '=' && *line
!= '-')
168 if (string_all_char_p (line
, '*')) return 1;
169 if (string_all_char_p (line
, '=')) return 2;
170 if (string_all_char_p (line
, '-')) return 3;
176 Use g_strjoinv to join up the strings from `strings', but they might
177 not actually be a null-terminated array. `end' should be strings+n,
178 where I want the first n strings (strings+0, ..., strings+(n-1)). It
179 shouldn't point outside of the array allocated, but it can point at
180 the null string at the end.
183 join_strings_subset (const gchar
*separator
,
184 gchar
** strings
, gchar
** end
)
189 g_assert(end
> strings
);
194 glob
= g_strjoinv (separator
, strings
);
200 Create a text node, child of `parent', with the lines strictly
201 between `first' and `last'.
204 lines_subset_text_child (xmlNodePtr parent
, xmlNsPtr ns
,
205 gchar
** first
, gchar
** last
)
207 /* TODO? Currently we're copying the split strings again, which is
208 less efficient than somehow storing lengths and using a sort of
209 window on `content'. But that's much more difficult, so unless
210 there's a problem, let's go with the stupid approach. */
214 glob
= join_strings_subset ("\n", first
, last
);
215 xmlAddChild (parent
, xmlNewText (BAD_CAST glob
));
221 Convert body text CONTENT to xml nodes. This function is responsible
222 for spotting headings etc and splitting them out correctly.
224 paragraph is as described in info_body_text, but cannot be null.
226 If `inline_p' is true, end with a <para1> tag. Otherwise, end with a
229 TODO: IWBN add a regex match for *Note: here and call the *Note ==>
230 <a href> logic of info_process_text_notes from here.
233 info_body_parse_text (xmlNodePtr parent
, xmlNodePtr
*paragraph
,
235 gboolean inline_p
, const gchar
*content
)
237 /* The easiest things to spot are headings: they look like a line of
238 * '*','=' or '-', corresponding to heading levels 1,2 or 3. To spot
239 * them, we split content into single lines and work with them. */
240 gchar
**lines
= g_strsplit (content
, "\n", 0);
241 gchar
**first
= lines
, **last
= lines
;
243 xmlNodePtr header_node
;
245 /* Deal with the possibility that `content' is empty */
246 if (*lines
== NULL
) {
248 xmlNewTextChild (parent
, NULL
, BAD_CAST
"para", BAD_CAST
"");
253 /* Use a pair of pointers, first and last, which point to two lines,
254 * the chunk of the body we're displaying (inclusive) */
255 for (; *last
; last
++) {
257 /* Check for a blank line */
258 if (**last
== '\0') {
261 *paragraph
= xmlNewChild (parent
, ns
, BAD_CAST
"para", NULL
);
263 lines_subset_text_child (*paragraph
, ns
, first
, last
);
265 /* On the next iteration, last==first both pointing at the next
273 /* Check for a header */
274 header_level
= header_underline_level (*last
);
276 /* Write out any lines beforehand */
277 lines_subset_text_child (parent
, ns
, first
, last
-1);
278 /* Now write out the actual header line */
279 header_node
= xmlNewTextChild (parent
, ns
, BAD_CAST
"header",
281 xmlNewProp (header_node
, BAD_CAST
"level",
282 BAD_CAST level_headings
[header_level
]);
289 /* Write out any lines left */
291 *paragraph
= xmlNewChild (parent
, ns
, BAD_CAST
"para", NULL
);
293 lines_subset_text_child (*paragraph
, ns
, first
, last
);
299 info_body_text is responsible for taking a hunk of the info page's
300 body and turning it into paragraph tags. It searches out images and
301 marks them up properly if necessary.
303 parent should be the node in which we're currently storing text and
304 paragraph a pointer to a <para> tag or NULL. At blank lines, we
305 finish with the current para tag and switch to a new one.
307 It uses info_body_parse_text to mark up the actual bits of text.
310 info_body_text (xmlNodePtr parent
, xmlNodePtr
*paragraph
, xmlNsPtr ns
,
311 gboolean inline_p
, gchar
const *content
)
313 xmlNodePtr thepara
= NULL
;
317 GMatchInfo
*match_info
;
319 if (paragraph
== NULL
) paragraph
= &thepara
;
321 if (!strstr (content
, INFO_C_IMAGE_TAG_OPEN
)) {
322 info_body_parse_text (parent
, paragraph
, ns
, inline_p
, content
);
326 content_len
= strlen (content
);
328 regex
= g_regex_new ("(" INFO_C_IMAGE_TAG_OPEN_RE
"((?:[^" INFO_TAG_1
"]|[^" INFO_C_TAG_0
"]+" INFO_TAG_1
")*)" INFO_C_TAG_CLOSE_RE
")", 0, 0, NULL
);
330 g_regex_match (regex
, content
, 0, &match_info
);
331 while (g_match_info_matches (match_info
))
335 gboolean image_found
= g_match_info_fetch_pos (match_info
, 0,
336 &image_start
, &image_end
);
337 gchar
*before
= g_strndup (&content
[pos
], image_start
- pos
);
339 info_body_parse_text (parent
, paragraph
, NULL
, TRUE
, before
);
342 /* End the paragraph that was before */
346 info_insert_image (parent
, match_info
);
347 g_match_info_next (match_info
, NULL
);
349 after
= g_strndup (&content
[pos
], content_len
- pos
);
350 info_body_parse_text (parent
, paragraph
, NULL
, TRUE
, after
);
354 /* Part 1: Parse File Into Tree Store */
365 page_type (char *page
)
367 if (g_ascii_strncasecmp (page
, "Tag Table:\n", 11) == 0)
368 return PAGE_TAG_TABLE
;
369 else if (g_ascii_strncasecmp (page
, "Indirect:\n", 10) == 0)
370 return PAGE_INDIRECT
;
371 else if (g_ascii_strncasecmp (page
, "File:", 5) == 0 ||
372 g_ascii_strncasecmp (page
, "Node:", 5) == 0)
380 *open_info_file (const gchar
*file
)
383 GConverter
*converter
;
384 GFileInputStream
*file_stream
;
385 GInputStream
*stream
;
392 gfile
= g_file_new_for_path (file
);
393 file_stream
= g_file_read (gfile
, NULL
, NULL
);
394 converter
= (GConverter
*) yelp_magic_decompressor_new ();
395 stream
= g_converter_input_stream_new ((GInputStream
*) file_stream
, converter
);
396 string
= g_string_new (NULL
);
398 while ((bytes
= g_input_stream_read (stream
, buf
, 1024, NULL
, NULL
)) > 0)
399 g_string_append_len (string
, buf
, bytes
);
401 g_object_unref (stream
);
405 /* C/glib * cannot really handle \0 in strings, convert. */
406 for (i
= 0; i
< (string
->len
- 1); i
++)
407 if (str
[i
] == INFO_TAG_OPEN
[0] && str
[i
+1] == INFO_TAG_OPEN
[1])
408 str
[i
] = INFO_C_TAG_OPEN
[0];
410 g_string_free (string
, FALSE
);
416 find_info_part (gchar
*part_name
, const gchar
*base
)
418 /* New and improved. We now assume that all parts are
419 * in the same subdirectory as the base file. Makes
420 * life much simpler and is (afaict) always true
424 gchar
*bzfname
, *gzfname
, *lzfd
, *fname
;
426 tmp
= g_strrstr (base
, "/");
427 path
= g_strndup (base
, tmp
-base
);
429 bzfname
= g_strconcat (path
, "/", part_name
, ".bz2", NULL
);
430 gzfname
= g_strconcat (path
, "/", part_name
, ".gz", NULL
);
431 lzfd
= g_strconcat (path
, "/", part_name
, ".lzma", NULL
);
432 fname
= g_strconcat (path
, "/", part_name
, NULL
);
434 if (g_file_test (bzfname
, G_FILE_TEST_EXISTS
))
435 uri
= g_strdup (bzfname
);
436 else if (g_file_test (gzfname
, G_FILE_TEST_EXISTS
))
437 uri
= g_strdup (gzfname
);
438 else if (g_file_test (lzfd
, G_FILE_TEST_EXISTS
))
439 uri
= g_strdup (lzfd
);
440 else if (g_file_test (fname
, G_FILE_TEST_EXISTS
))
441 uri
= g_strdup (fname
);
453 *process_indirect_map (char *page
, const gchar
*file
)
457 char *composite
= NULL
;
458 size_t composite_len
= 0;
460 lines
= g_strsplit (page
, "\n", 0);
463 Go backwards down the list so that we allocate composite
464 big enough the first time around.
466 for (ptr
= lines
+ 1; *ptr
!= NULL
; ptr
++);
467 for (ptr
--; ptr
!= lines
; ptr
--)
476 debug_print (DB_DEBUG
, "Line: %s\n", *ptr
);
477 items
= g_strsplit (*ptr
, ": ", 2);
481 filename
= find_info_part (items
[0], file
);
482 str
= open_info_file (filename
);
487 pages
= g_strsplit (str
, "\x1f", 2);
495 offset
= (gsize
) atoi (items
[1]);
496 plength
= strlen(pages
[1]);
498 debug_print (DB_DEBUG
, "Need to make string %s+%i bytes = %i\n",
502 if (!composite
) /* not yet created, malloc it */
504 composite_len
= offset
+ plength
;
505 composite
= g_malloc (sizeof (char) *
506 (composite_len
+ 1));
507 memset (composite
, '-', composite_len
);
508 composite
[composite_len
] = '\0';
511 /* Because we're going down the list
512 * backwards, plength should always be short
513 * enough to fit in the memory allocated. But
514 * in case something's broken/malicious, we
515 * should check anyway.
517 if (offset
> composite_len
)
519 if (plength
+ offset
+ 1 > composite_len
)
520 plength
= composite_len
- offset
- 1;
522 composite
[offset
] = '\x1f';
523 memcpy (composite
+ offset
+ 1, pages
[1], plength
);
538 Open up the relevant info file and read it all into memory. If there
539 is an indirect table thingy, we resolve that as we go.
541 Returns a NULL-terminated list of pointers to pages on success and
545 expanded_info_file (const gchar
*file
)
547 gchar
*slurp
= open_info_file (file
);
551 if (!slurp
) return NULL
;
553 /* TODO: There's a lot of copying of bits of memory here. With a bit
554 * more effort we could avoid it. Either we should fix this or
555 * measure the time taken and decide it's irrelevant...
557 * Note: \x1f\n is ^_\n
559 page_list
= g_strsplit (slurp
, "\x1f\n", 0);
563 for (page
= page_list
; *page
!= NULL
; page
++) {
564 if (page_type (*page
) == PAGE_INDIRECT
) {
566 slurp
= process_indirect_map (*page
, file
);
567 g_strfreev (page_list
);
572 page_list
= g_strsplit (slurp
, "\x1f\n", 0);
/*
  Look for strings in source by key. For example, we extract "blah"
  from "Node: blah," when the key is "Node: ". To know when to stop,
  there are two strings: end and cancel.

  If we find a character from end first, return a copy of the string
  up to (not including) that character. If we find a character of
  cancel first, return NULL. If we find neither, return the rest of
  the string.

  cancel can be NULL, in which case, we don't do its test.

  Also returns NULL when key does not occur in source. A non-NULL
  result is newly allocated and must be released with g_free.
 */
static char *
get_value_after_ext (const char *source, const char *key,
                     const char *end, const char *cancel)
{
  const char *start;
  size_t not_end, not_cancel;

  start = strstr (source, key);
  if (!start) return NULL;

  start += strlen (key);

  not_end = strcspn (start, end);
  /* With no cancel set, pick a value that can never beat not_end. */
  not_cancel = (cancel) ? strcspn (start, cancel) : not_end + 1;

  if (not_cancel < not_end)
    return NULL;

  return g_strndup (start, not_end);
}
/*
  Shorthand for get_value_after_ext with the delimiters used on info
  header lines: the value ends at a ',' and the search is abandoned
  (NULL) if a newline or \x7f is reached first.
 */
static char *
get_value_after (const char *source, const char *key)
{
  return get_value_after_ext (source, key, ",", "\n\x7f");
}
621 node2page (GHashTable
*nodes2pages
, char *node
)
625 if (g_hash_table_lookup_extended (nodes2pages
, node
,
627 return GPOINTER_TO_INT(p
);
629 /* This shouldn't happen: we should only ever have to look up pages
631 g_return_val_if_reached (0);
635 *node2iter (GHashTable
*nodes2iters
, char *node
)
639 iter
= g_hash_table_lookup (nodes2iters
, node
);
640 d (if (!iter
) debug_print (DB_WARN
, "Could not retrieve iter for node !%s!\n", node
));
645 *find_real_top (GtkTreeModel
*model
, GtkTreeIter
*it
)
647 GtkTreeIter
*r
= NULL
;
648 GtkTreeIter
*tmp
= NULL
;
653 r
= gtk_tree_iter_copy (it
);
654 tmp
= g_malloc0 (sizeof (GtkTreeIter
));
655 while (gtk_tree_model_iter_parent (model
, tmp
, r
)) {
656 gtk_tree_iter_free (r
);
657 r
= gtk_tree_iter_copy (tmp
);
664 GtkTreeIter
* find_real_sibling (GtkTreeModel
*model
,
665 GtkTreeIter
*it
, GtkTreeIter
*comp
)
668 GtkTreeIter
*tmp
= NULL
;
669 gboolean result
= FALSE
;
677 r
= gtk_tree_iter_copy (it
);
678 tmp
= gtk_tree_iter_copy (it
);
680 reftitle
= gtk_tree_model_get_string_from_iter (model
, comp
);
682 result
= gtk_tree_model_iter_parent (model
, r
, it
);
686 title
= gtk_tree_model_get_string_from_iter (model
, r
);
688 while (!g_str_equal (title
, reftitle
) && result
) {
689 gtk_tree_iter_free (tmp
);
690 tmp
= gtk_tree_iter_copy (r
);
691 result
= gtk_tree_model_iter_parent (model
, r
, tmp
);
693 title
= gtk_tree_model_get_string_from_iter (model
, r
);
696 if (!g_str_equal (title
, reftitle
))
698 gtk_tree_iter_free (tmp
);
702 gtk_tree_iter_free (r
);
710 process_page (GtkTreeStore
*tree
,
711 GHashTable
*nodes2pages
, GHashTable
*nodes2iters
,
712 int *processed_table
, char **page_list
, char *page_text
)
725 /* split out the header line and the text */
726 parts
= g_strsplit (page_text
, "\n", 3);
728 node
= get_value_after (parts
[0], "Node: ");
729 up
= get_value_after (parts
[0], "Up: ");
730 prev
= get_value_after (parts
[0], "Prev: ");
731 next
= get_value_after (parts
[0], "Next: ");
733 if (next
&& g_str_equal (next
, "Top")) {
737 if (g_str_equal (node
, "Top") && prev
!= NULL
) {
742 /* check to see if this page has been processed already */
743 page
= node2page (nodes2pages
, node
);
744 if (processed_table
[page
]) {
747 processed_table
[page
] = 1;
749 debug_print (DB_DEBUG
, "-- Processing Page %s\n\tParent: %s\n", node
, up
);
751 iter
= g_slice_alloc0 (sizeof (GtkTreeIter
));
752 /* check to see if we need to process our parent and siblings */
753 if (up
&& g_ascii_strncasecmp (up
, "(dir)", 5) && strcmp (up
, "Top"))
755 page
= node2page (nodes2pages
, up
);
756 if (!processed_table
[page
])
758 debug_print (DB_DEBUG
, "%% Processing Node %s\n", up
);
759 process_page (tree
, nodes2pages
,
760 nodes2iters
, processed_table
, page_list
,
764 if (prev
&& g_ascii_strncasecmp (prev
, "(dir)", 5))
766 if (strncmp (node
, "Top", 3)) {
767 /* Special case the Top node to always appear first */
769 page
= node2page (nodes2pages
, prev
);
770 if (!processed_table
[page
])
772 debug_print (DB_DEBUG
, "%% Processing Node %s\n", prev
);
773 process_page (tree
, nodes2pages
,
774 nodes2iters
, processed_table
, page_list
,
780 /* by this point our parent and older sibling should be processed */
781 if (!up
|| !g_ascii_strcasecmp (up
, "(dir)"))
783 debug_print (DB_DEBUG
, "\t> no parent\n");
784 if (!prev
|| !g_ascii_strcasecmp (prev
, "(dir)"))
786 debug_print (DB_DEBUG
, "\t> no previous\n");
787 gtk_tree_store_append (tree
, iter
, NULL
);
791 real
= find_real_top (GTK_TREE_MODEL (tree
),
792 node2iter (nodes2iters
, prev
));
794 gtk_tree_store_insert_after (tree
, iter
, NULL
,
796 gtk_tree_iter_free (real
);
799 gtk_tree_store_append (tree
, iter
, NULL
);
802 else if (!prev
|| !g_ascii_strcasecmp (prev
, "(dir)") || !strcmp (prev
, up
))
804 debug_print (DB_DEBUG
, "\t> no previous\n");
805 gtk_tree_store_append (tree
, iter
,
806 node2iter (nodes2iters
, up
));
810 GtkTreeIter
*upit
= node2iter (nodes2iters
, up
);
811 GtkTreeIter
*previt
= node2iter (nodes2iters
, prev
);
812 GtkTreeIter
*nit
= NULL
;
813 debug_print (DB_DEBUG
, "+++ Parent: %s Previous: %s\n", up
, prev
);
815 d (if (upit
) debug_print (DB_DEBUG
, "++++ Have parent node!\n"));
816 d (if (previt
) debug_print (DB_DEBUG
, "++++ Have previous node!\n"));
817 nit
= find_real_sibling (GTK_TREE_MODEL (tree
), previt
, upit
);
819 gtk_tree_store_insert_after (tree
, iter
,
822 gtk_tree_iter_free (nit
);
825 gtk_tree_store_append (tree
, iter
, upit
);
829 debug_print (DB_DEBUG
, "# node %s was not put in tree\n", node
);
833 d (if (iter
) debug_print (DB_DEBUG
, "Have a valid iter, storing for %s\n", node
));
835 g_hash_table_insert (nodes2iters
, g_strdup (node
), iter
);
836 debug_print (DB_DEBUG
, "size: %i\n", g_hash_table_size (nodes2iters
));
838 /*tmp = g_strdup_printf ("%i",
839 node2page (nodes2pages, node));*/
840 tmp
= g_strdup (node
);
841 tmp
= g_strdelimit (tmp
, " ", '_');
842 gtk_tree_store_set (tree
, iter
,
843 INFO_PARSER_COLUMN_PAGE_NO
, tmp
,
844 INFO_PARSER_COLUMN_PAGE_NAME
, node
,
845 INFO_PARSER_COLUMN_PAGE_CONTENT
, parts
[2],
857 GHashTable
*nodes2pages
; /* Build this... */
858 GHashTable
*pages2nodes
; /* ... using this. */
862 use_offset2page (gpointer o
, gpointer p
, gpointer ud
)
864 struct TagTableFix
* ttf
= (struct TagTableFix
*)ud
;
866 const gchar
* node
= g_hash_table_lookup (ttf
->pages2nodes
, p
);
868 g_hash_table_insert (ttf
->nodes2pages
, g_strdup (node
), p
);
873 We had a nodes2offsets hash table, but sometimes these things
874 lie. How terribly rude. Anyway, use offsets2pages and pages2nodes
875 (and injectivity!) to construct the nodes2pages hash table.
878 make_nodes2pages (GHashTable
* offsets2pages
,
879 GHashTable
* pages2nodes
)
881 struct TagTableFix ttf
;
884 g_hash_table_new_full (g_str_hash
, g_str_equal
, g_free
, NULL
);
885 ttf
.pages2nodes
= pages2nodes
;
887 g_hash_table_foreach (offsets2pages
, use_offset2page
, &ttf
);
889 return ttf
.nodes2pages
;
893 * Parse file into a GtkTreeStore containing useful information that we can
894 * later convert into a nice XML document or something else.
897 *yelp_info_parser_parse_file (char *file
)
903 GHashTable
*offsets2pages
= NULL
;
904 GHashTable
*pages2nodes
= NULL
;
905 GHashTable
*nodes2pages
= NULL
;
906 GHashTable
*nodes2iters
= NULL
;
907 int *processed_table
;
911 page_list
= expanded_info_file (file
);
918 offsets2pages
= g_hash_table_new_full (g_str_hash
, g_str_equal
, g_free
,
920 pages2nodes
= g_hash_table_new_full (g_direct_hash
, g_direct_equal
, NULL
,
923 for (ptr
= page_list
; *ptr
!= NULL
; ptr
++)
927 g_hash_table_insert (offsets2pages
,
928 g_strdup_printf ("%i", offset
),
929 GINT_TO_POINTER (pages
));
931 name
= get_value_after (*ptr
, "Node: ");
933 g_hash_table_insert (pages2nodes
,
934 GINT_TO_POINTER (pages
), name
);
936 offset
+= strlen (*ptr
);
937 if (pages
) offset
+= 2;
940 pt
= page_type (*ptr
);
941 if (pt
== PAGE_INDIRECT
) {
942 g_warning ("Found an indirect page in a file "
943 "we thought we'd expanded.");
947 /* Now consolidate (and correct) the two hash tables */
948 nodes2pages
= make_nodes2pages (offsets2pages
, pages2nodes
);
950 g_hash_table_destroy (offsets2pages
);
951 g_hash_table_destroy (pages2nodes
);
953 processed_table
= g_malloc0 (pages
* sizeof (int));
954 tree
= gtk_tree_store_new (INFO_PARSER_N_COLUMNS
, G_TYPE_STRING
, G_TYPE_STRING
,
956 nodes2iters
= g_hash_table_new_full (g_str_hash
, g_str_equal
, g_free
,
957 (GDestroyNotify
) gtk_tree_iter_free
);
960 for (ptr
= page_list
; *ptr
!= NULL
; ptr
++)
962 if (page_type (*ptr
) != PAGE_NODE
) continue;
963 process_page (tree
, nodes2pages
, nodes2iters
,
964 processed_table
, page_list
, *ptr
);
967 g_strfreev (page_list
);
969 g_hash_table_destroy (nodes2iters
);
970 g_hash_table_destroy (nodes2pages
);
972 g_free (processed_table
);
978 /* Part 2: Parse Tree into XML */
980 parse_tree_level (GtkTreeStore
*tree
, xmlNodePtr
*node
, GtkTreeIter iter
)
982 GtkTreeIter children
, parent
;
985 char *page_no
= NULL
;
986 char *page_name
= NULL
;
987 char *page_content
= NULL
;
988 gboolean notes
= FALSE
;
990 debug_print (DB_DEBUG
, "Decended\n");
993 gtk_tree_model_get (GTK_TREE_MODEL (tree
), &iter
,
994 INFO_PARSER_COLUMN_PAGE_NO
, &page_no
,
995 INFO_PARSER_COLUMN_PAGE_NAME
, &page_name
,
996 INFO_PARSER_COLUMN_PAGE_CONTENT
, &page_content
,
998 debug_print (DB_DEBUG
, "Got Section: %s\n", page_name
);
999 if (strstr (page_content
, "*Note") ||
1000 strstr (page_content
, "*note")) {
1003 if (strstr (page_content
, "* Menu:")) {
1004 newnode
= yelp_info_parse_menu (tree
, node
, page_content
, notes
);
1006 newnode
= xmlNewTextChild (*node
, NULL
,
1010 info_body_text (newnode
, NULL
, NULL
, FALSE
, page_content
);
1013 /* Handle notes here */
1014 info_process_text_notes (&newnode
, page_content
, tree
);
1017 /* if we free the page content, now it's in the XML, we can
1018 * save some memory */
1019 g_free (page_content
);
1020 page_content
= NULL
;
1022 if (gtk_tree_model_iter_parent (GTK_TREE_MODEL (tree
), &parent
, &iter
)) {
1024 gtk_tree_model_get (GTK_TREE_MODEL (tree
), &parent
,
1025 INFO_PARSER_COLUMN_PAGE_NO
, &parent_id
,
1027 xmlNewProp (newnode
, BAD_CAST
"up", BAD_CAST parent_id
);
1031 xmlNewProp (newnode
, BAD_CAST
"id",
1033 xmlNewProp (newnode
, BAD_CAST
"name",
1034 BAD_CAST page_name
);
1035 if (gtk_tree_model_iter_children (GTK_TREE_MODEL (tree
),
1038 parse_tree_level (tree
, &newnode
, children
);
1042 while (gtk_tree_model_iter_next (GTK_TREE_MODEL (tree
), &iter
));
1043 debug_print (DB_DEBUG
, "Ascending\n");
1047 yelp_info_parser_parse_tree (GtkTreeStore
*tree
)
1058 doc
= xmlNewDoc (BAD_CAST
"1.0");
1059 node
= xmlNewNode (NULL
, BAD_CAST
"Info");
1060 xmlDocSetRootElement (doc
, node
);
1062 /* functions I will want:
1063 gtk_tree_model_get_iter_first;
1064 gtk_tree_model_iter_next;
1065 gtk_tree_model_iter_children;
1068 if (gtk_tree_model_get_iter_first (GTK_TREE_MODEL (tree
), &iter
))
1069 parse_tree_level (tree
, &node
, iter
);
1070 d (else debug_print (DB_DEBUG
, "Empty tree?\n"));
1073 xmlDocDumpFormatMemory (doc, &xmlbuf, &bufsiz, 1);
1074 g_print ("XML follows:\n%s\n", xmlbuf);
1081 resolve_frag_id (GtkTreeModel
*model
, GtkTreePath
*path
, GtkTreeIter
*iter
,
1084 gchar
*page_no
= NULL
;
1085 gchar
*page_name
= NULL
;
1086 gchar
**xref
= data
;
1088 gtk_tree_model_get (GTK_TREE_MODEL (model
), iter
,
1089 INFO_PARSER_COLUMN_PAGE_NO
, &page_no
,
1090 INFO_PARSER_COLUMN_PAGE_NAME
, &page_name
,
1092 if (g_str_equal (page_name
, *xref
)) {
1094 *xref
= g_strdup (page_name
);
1095 *xref
= g_strdelimit (*xref
, " ", '_');
1108 get_menuoptions (gchar
*line
, gchar
**title
, gchar
**ref
, gchar
**desc
,
1111 /* Since info is actually braindead and allows .s in
1112 * its references, we gotta carefully extract things
1113 * as .s can be in either the title or desc
1116 gchar
*tfind
= NULL
;
1118 if (!g_str_has_prefix (line
, "* "))
1121 tfind
= strchr (tmp
, ':');
1123 if (!tfind
) /* No : on the line, bail out */
1126 (*title
) = g_strndup (tmp
, tfind
-tmp
);
1128 if (tfind
[1] == ':') { /* This happens if the title and ref are the same
1129 * Most menus are of this type
1132 (*ref
) = NULL
; /* There is no second part. The rest is description */
1135 (*xref
) = g_strndup (tmp
, tfind
-tmp
);
1139 (*desc
) = g_strdup (tfind
);
1140 } else { /* The other type of menu option */
1144 td
= strchr (tfind
, '.');
1147 (*ref
) = g_strndup (tfind
, td
-tfind
);
1148 (*xref
) = g_strdup (*ref
);
1152 (*desc
) = g_strdup (td
);
1157 /* Find the first non whitespace character in str or return pointer to the
1158 * '\0' if there isn't one. */
1160 first_non_space (gchar
* str
)
1162 /* As long as str is null terminated, this is ok! */
1163 while (g_ascii_isspace (*str
)) str
++;
1168 yelp_info_parse_menu (GtkTreeStore
*tree
, xmlNodePtr
*node
,
1169 gchar
*page_content
, gboolean notes
)
1174 xmlNodePtr newnode
, menu_node
, mholder
= NULL
;
1177 split
= g_strsplit (page_content
, "* Menu:", 2);
1179 newnode
= xmlNewChild (*node
, NULL
,
1180 BAD_CAST
"Section", NULL
);
1184 info_body_text (newnode
, NULL
, NULL
, FALSE
, split
[0]);
1186 info_process_text_notes (&newnode
, split
[0], tree
);
1189 menuitems
= g_strsplit (split
[1], "\n", -1);
1192 /* The output xml should look something like the following:
1196 <a href="xref:Help-Inv">Help-Inv</a>
1197 <para1>Invisible text in Emacs Info.</para1>
1200 <a href="xref:Help-M">Help-M</a>
1201 <para1>Menus.</para1>
1206 (from the top page of info:info). Note the absence of *'s and
1209 If there's a line with no "* Blah::", it looks like a child of
1210 the previous menu item so (for i > 0) deal with that correctly by
1211 not "closing" the <menuholder> tag until we find the next
1215 if (menuitems
[0] != NULL
) {
1216 /* If there are any menu items, make the <menu> node */
1217 menu_node
= xmlNewChild (newnode
, NULL
, BAD_CAST
"menu", NULL
);
1220 while (menuitems
[i
] != NULL
) {
1221 gboolean menu
= FALSE
;
1222 gchar
*title
= NULL
;
1226 gchar
*link_text
= NULL
;
1229 menu
= get_menuoptions (menuitems
[i
], &title
, &ref
, &desc
, &xref
);
1231 if (menu
&& (*title
== '\0' || *(title
+ 1) == '\0')) {
1232 g_warning ("Info title unexpectedly short for menu item (%s)",
1238 mholder
= xmlNewChild (menu_node
, NULL
, BAD_CAST
"menuholder", NULL
);
1239 gtk_tree_model_foreach (GTK_TREE_MODEL (tree
), resolve_frag_id
, &xref
);
1241 if (ref
== NULL
) { /* A standard type menu */
1242 /* title+2 skips the "* ". We know we haven't jumped over the
1243 end of the string because strlen (title) >= 3 */
1244 link_text
= g_strdup (title
+2);
1246 ref1
= xmlNewTextChild (mholder
, NULL
, BAD_CAST
"a",
1247 BAD_CAST link_text
);
1249 tmp
= g_strconcat ("xref:", xref
, NULL
);
1250 xmlNewProp (ref1
, BAD_CAST
"href", BAD_CAST tmp
);
1252 } else { /* Indexy type menu - we gotta do a little work to fix the
1255 gchar
*spacing
= ref
;
1259 while (*spacing
==' ') {
1263 sp
= g_strndup (ref
, c
);
1265 link_text
= g_strdup (title
);
1267 ref1
= xmlNewTextChild (mholder
, NULL
, BAD_CAST
"a",
1268 BAD_CAST link_text
);
1269 tmp
= g_strconcat ("xref:", xref
, NULL
);
1270 xmlNewProp (ref1
, BAD_CAST
"href", BAD_CAST tmp
);
1272 xmlNewTextChild (mholder
, NULL
, BAD_CAST
"spacing",
1274 tmp
= g_strconcat (g_strstrip(ref
), ".", NULL
);
1275 ref1
= xmlNewTextChild (mholder
, NULL
, BAD_CAST
"a",
1278 tmp
= g_strconcat ("xref:", xref
, NULL
);
1279 xmlNewProp (ref1
, BAD_CAST
"href", BAD_CAST tmp
);
1285 tmp
= g_strconcat ("\n", first_non_space (desc
), NULL
);
1288 Don't print the link text a second time, because that looks
1291 We don't do a straight check for equality because lots of
1292 .info files have something like
1296 Obviously if the longer explanation has more afterwards, we
1297 don't want to omit it, which is why there's the strlen test.
1299 if (strncmp (link_text
, tmp
+ 1, strlen (link_text
)) ||
1300 strlen (link_text
) + 1 < strlen (tmp
+ 1)) {
1301 xmlNewTextChild (mholder
, NULL
,
1302 BAD_CAST
"para1", BAD_CAST tmp
);
1308 else if (*(menuitems
[i
]) != '\0') {
1309 tmp
= g_strconcat ("\n", first_non_space (menuitems
[i
]), NULL
);
1310 xmlNewTextChild (mholder
? mholder
: menu_node
,
1311 NULL
, BAD_CAST
"para1",
1322 g_strfreev (menuitems
);
1328 info_process_text_notes (xmlNodePtr
*node
, gchar
*content
, GtkTreeStore
*tree
)
1333 xmlNodePtr paragraph
= NULL
;
1334 gboolean first
= TRUE
;
1337 Split using the regular expression
1341 which deals with either case and the last bit is a lookahead so
1342 that we don't split on things of the form *Note:_, which aren't
1345 notes
= g_regex_split_simple ("\\*[Nn]ote(?!_)", content
, 0, 0);
1347 for (current
= notes
; *current
!= NULL
; current
++) {
1350 gchar
*alt_append
, *alt_append1
;
1353 gchar
*break_point
= NULL
;
1354 gboolean broken
= FALSE
;
1356 /* The first node is special. It doesn't have a note ref at the
1357 * start, so we can just add it and forget about it.
1360 info_body_text (*node
, ¶graph
, NULL
, TRUE
, (*current
));
1364 /* If we got to here, we now gotta parse the note reference */
1365 append
= strchr (*current
, ':');
1367 info_body_text (*node
, ¶graph
, NULL
, TRUE
, *current
);
1371 alt_append
= append
;
1372 alt_append1
= alt_append
;
1373 append
= strchr (append
, ':');
1374 alt_append
= strchr (alt_append
, '.');
1375 if (alt_append
&& g_str_has_prefix (alt_append
, ".info")) {
1378 alt_append
= strchr (alt_append
, '.');
1380 alt_append1
= strchr (alt_append1
, ',');
1381 if (!append
&& !alt_append
&& !alt_append1
) {
1382 info_body_text (*node
, ¶graph
, NULL
, TRUE
, *current
);
1385 if (!append
|| alt_append
|| alt_append1
) {
1387 if (alt_append
) append
= alt_append
;
1388 else append
= alt_append1
;
1390 if ((alt_append
&& alt_append
< append
))
1391 append
= alt_append
;
1392 if (alt_append1
&& alt_append1
< append
)
1393 append
= alt_append1
;
1396 url
= g_strndup (*current
, append
- (*current
));
1398 /* Save a copy of the unadulterated link text for later. */
1399 link_text
= g_strconcat ("*Note", url
, NULL
);
1401 /* By now, we got 2 things. First, is append which is the (hopefully)
1402 * non-link text. Second, we got a url.
1403 * The url can be in several forms:
1405 * 2. linkend:(infofile)Linkend.
1406 * 3. Title: Linkend.
1407 * 4. Title: Linkend, (pretty sure this is just broken)
1408 * 5. Title: (infofile.info)Linkend.
1409 * All possibilities should have been picked up.
1411 * Clean up the split. Should be left with a real url and
1412 * a list of fragments that should be linked
1413 * Also goes through and removes extra spaces, leaving only one
1414 * space in place of many
1416 urls
= g_strsplit (url
, "\n", -1);
1417 break_point
= strchr (url
, '\n');
1418 while (break_point
) {
1420 break_point
= strchr (++break_point
, '\n');
1422 break_point
= strchr (url
, ' ');
1423 while (break_point
) {
1424 if (*(break_point
+1) == ' ') {
1425 /* Massive space. Fix. */
1426 gchar
*next
= break_point
;
1429 while (*next
== ' ')
1432 url_copy
= g_strndup (url
, break_point
-url
);
1433 url
= g_strconcat (url_copy
, next
, NULL
);
1435 break_point
= strchr (url
, ' ');
1439 break_point
= strchr (break_point
, ' ');
1442 if (url
[strlen(url
)-1] == '.') { /* The 2nd or 3rd sort of link */
1446 stop
= strchr (url
, ':');
1447 lurl
= strchr (stop
, '(');
1448 if (!lurl
) { /* 3rd type of link */
1452 link
= g_strdup (stop
);
1453 link
= g_strstrip (link
);
1454 length
= strlen (link
) - 1;
1455 link
[length
] = '\0';
1456 href
= g_strconcat ("xref:", link
, NULL
);
1461 } else { /* 2nd type of link. Easy. Provided .info is neglected ;) */
1467 new_url
= g_strdup (lurl
);
1468 info
= strstr (new_url
, ".info)");
1469 stripped
= g_strndup (new_url
, info
-new_url
);
1471 lurl
= g_strconcat (stripped
, info
, NULL
);
1475 zloc
= &(lurl
[strlen(lurl
)-1]);
1477 href
= g_strconcat ("info:", lurl
, NULL
);
1480 } else { /* First kind of link */
1484 tmp1
= strchr (url
, ':');
1486 frag
= g_strdup (url
);
1488 frag
= g_strndup (url
, tmp1
- url
);
1490 gtk_tree_model_foreach (GTK_TREE_MODEL (tree
), resolve_frag_id
, &frag
);
1491 href
= g_strconcat ("xref:", frag
, NULL
);
1495 /* Check we've got a valid paragraph node */
1497 paragraph
= xmlNewChild (*node
, NULL
, BAD_CAST
"para", NULL
);
1501 Now we're supposed to actually render the link. I have a list of
1502 bits of URL and actually this is really easy - I want to have
1503 the link *text* exactly the same as it appeared in the .info
1504 file, so don't use the list of strings urls, instead use the
1505 whole lot: url (complete with embedded newlines etc.)
1507 ref1
= xmlNewTextChild (paragraph
, NULL
, BAD_CAST
"a",
1508 BAD_CAST link_text
);
1510 xmlNewProp (ref1
, BAD_CAST
"href", BAD_CAST href
);
1514 /* Finally, we can add the following text as required */
1515 info_body_text (*node
, ¶graph
, NULL
, TRUE
, append
);