Scan media entities as well, not just url entities. This should expand more
[bitlbee.git] / lib / xmltree.c
blob74292be97c0cce16988b97617bb852d9ceb89099
1 /***************************************************************************\
2 * *
3 * BitlBee - An IRC to IM gateway *
4 * Simple XML (stream) parse tree handling code (Jabber/XMPP, mainly) *
5 * *
6 * Copyright 2006 Wilmer van der Gaast <wilmer@gaast.net> *
7 * *
8 * This library is free software; you can redistribute it and/or *
9 * modify it under the terms of the GNU Lesser General Public *
10 * License as published by the Free Software Foundation, version *
11 * 2.1. *
12 * *
13 * This library is distributed in the hope that it will be useful, *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *
16 * Lesser General Public License for more details. *
17 * *
18 * You should have received a copy of the GNU Lesser General Public License *
19 * along with this library; if not, write to the Free Software Foundation, *
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *
21 * *
22 ****************************************************************************/
24 #include <glib.h>
25 #include <string.h>
26 #include <unistd.h>
27 #include <ctype.h>
28 #include <stdio.h>
30 #include "xmltree.h"
32 #define g_strcasecmp g_ascii_strcasecmp
33 #define g_strncasecmp g_ascii_strncasecmp
35 static void xt_start_element( GMarkupParseContext *ctx, const gchar *element_name, const gchar **attr_names, const gchar **attr_values, gpointer data, GError **error )
37 struct xt_parser *xt = data;
38 struct xt_node *node = g_new0( struct xt_node, 1 ), *nt;
39 int i;
41 node->parent = xt->cur;
42 node->name = g_strdup( element_name );
44 /* First count the number of attributes */
45 for( i = 0; attr_names[i]; i ++ );
47 /* Then allocate a NULL-terminated array. */
48 node->attr = g_new0( struct xt_attr, i + 1 );
50 /* And fill it, saving one variable by starting at the end. */
51 for( i --; i >= 0; i -- )
53 node->attr[i].key = g_strdup( attr_names[i] );
54 node->attr[i].value = g_strdup( attr_values[i] );
57 /* Add it to the linked list of children nodes, if we have a current
58 node yet. */
59 if( xt->cur )
61 if( xt->cur->children )
63 for( nt = xt->cur->children; nt->next; nt = nt->next );
64 nt->next = node;
66 else
68 xt->cur->children = node;
71 else if( xt->root )
73 /* ERROR situation: A second root-element??? */
76 /* Now this node will be the new current node. */
77 xt->cur = node;
78 /* And maybe this is the root? */
79 if( xt->root == NULL )
80 xt->root = node;
83 static void xt_text( GMarkupParseContext *ctx, const gchar *text, gsize text_len, gpointer data, GError **error )
85 struct xt_parser *xt = data;
86 struct xt_node *node = xt->cur;
88 if( node == NULL )
89 return;
91 /* FIXME: Does g_renew also OFFICIALLY accept NULL arguments? */
92 node->text = g_renew( char, node->text, node->text_len + text_len + 1 );
93 memcpy( node->text + node->text_len, text, text_len );
94 node->text_len += text_len;
95 /* Zero termination is always nice to have. */
96 node->text[node->text_len] = 0;
99 static void xt_end_element( GMarkupParseContext *ctx, const gchar *element_name, gpointer data, GError **error )
101 struct xt_parser *xt = data;
103 xt->cur->flags |= XT_COMPLETE;
104 xt->cur = xt->cur->parent;
107 GMarkupParser xt_parser_funcs =
109 xt_start_element,
110 xt_end_element,
111 xt_text,
112 NULL,
113 NULL
116 struct xt_parser *xt_new( const struct xt_handler_entry *handlers, gpointer data )
118 struct xt_parser *xt = g_new0( struct xt_parser, 1 );
120 xt->data = data;
121 xt->handlers = handlers;
122 xt_reset( xt );
124 return xt;
127 /* Reset the parser, flush everything we have so far. For example, we need
128 this for XMPP when doing TLS/SASL to restart the stream. */
129 void xt_reset( struct xt_parser *xt )
131 if( xt->parser )
132 g_markup_parse_context_free( xt->parser );
134 xt->parser = g_markup_parse_context_new( &xt_parser_funcs, 0, xt, NULL );
136 if( xt->root )
138 xt_free_node( xt->root );
139 xt->root = NULL;
140 xt->cur = NULL;
144 /* Feed the parser, don't execute any handler. Returns -1 on errors, 0 on
145 end-of-stream and 1 otherwise. */
146 int xt_feed( struct xt_parser *xt, const char *text, int text_len )
148 if( !g_markup_parse_context_parse( xt->parser, text, text_len, &xt->gerr ) )
150 return -1;
153 return !( xt->root && xt->root->flags & XT_COMPLETE );
156 /* Find completed nodes and see if a handler has to be called. Passing
157 a node isn't necessary if you want to start at the root, just pass
158 NULL. This second argument is needed for recursive calls. */
159 int xt_handle( struct xt_parser *xt, struct xt_node *node, int depth )
161 struct xt_node *c;
162 xt_status st;
163 int i;
165 /* Just in case someone likes infinite loops... */
166 if( xt->root == NULL )
167 return 0;
169 if( node == NULL )
170 return xt_handle( xt, xt->root, depth );
172 if( depth != 0 )
173 for( c = node->children; c; c = c->next )
174 if( !xt_handle( xt, c, depth > 0 ? depth - 1 : depth ) )
175 return 0;
177 if( node->flags & XT_COMPLETE && !( node->flags & XT_SEEN ) )
179 if( xt->handlers ) for( i = 0; xt->handlers[i].func; i ++ )
181 /* This one is fun! \o/ */
183 /* If handler.name == NULL it means it should always match. */
184 if( ( xt->handlers[i].name == NULL ||
185 /* If it's not, compare. There should always be a name. */
186 g_strcasecmp( xt->handlers[i].name, node->name ) == 0 ) &&
187 /* If handler.parent == NULL, it's a match. */
188 ( xt->handlers[i].parent == NULL ||
189 /* If there's a parent node, see if the name matches. */
190 ( node->parent ? g_strcasecmp( xt->handlers[i].parent, node->parent->name ) == 0 :
191 /* If there's no parent, the handler should mention <root> as a parent. */
192 strcmp( xt->handlers[i].parent, "<root>" ) == 0 ) ) )
194 st = xt->handlers[i].func( node, xt->data );
196 if( st == XT_ABORT )
197 return 0;
198 else if( st != XT_NEXT )
199 break;
203 node->flags |= XT_SEEN;
206 return 1;
209 /* Garbage collection: Cleans up all nodes that are handled. Useful for
210 streams because there's no reason to keep a complete packet history
211 in memory. */
212 void xt_cleanup( struct xt_parser *xt, struct xt_node *node, int depth )
214 struct xt_node *c, *prev;
216 if( !xt || !xt->root )
217 return;
219 if( node == NULL )
221 xt_cleanup( xt, xt->root, depth );
222 return;
225 if( node->flags & XT_SEEN && node == xt->root )
227 xt_free_node( xt->root );
228 xt->root = xt->cur = NULL;
229 /* xt->cur should be NULL already, BTW... */
231 return;
234 /* c contains the current node, prev the previous node (or NULL).
235 I admit, this one's pretty horrible. */
236 for( c = node->children, prev = NULL; c; prev = c, c = c ? c->next : node->children )
238 if( c->flags & XT_SEEN )
240 /* Remove the node from the linked list. */
241 if( prev )
242 prev->next = c->next;
243 else
244 node->children = c->next;
246 xt_free_node( c );
248 /* Since the for loop wants to get c->next, make sure
249 c points at something that exists (and that c->next
250 will actually be the next item we should check). c
251 can be NULL now, if we just removed the first item.
252 That explains the ? thing in for(). */
253 c = prev;
255 else
257 /* This node can't be cleaned up yet, but maybe a
258 subnode can. */
259 if( depth != 0 )
260 xt_cleanup( xt, c, depth > 0 ? depth - 1 : depth );
265 struct xt_node *xt_from_string( const char *in )
267 struct xt_parser *parser;
268 struct xt_node *ret;
270 parser = xt_new( NULL, NULL );
271 xt_feed( parser, in, strlen( in ) );
272 ret = parser->root;
273 parser->root = NULL;
274 xt_free( parser );
276 return ret;
279 static void xt_to_string_real( struct xt_node *node, GString *str )
281 char *buf;
282 struct xt_node *c;
283 int i;
285 g_string_append_printf( str, "<%s", node->name );
287 for( i = 0; node->attr[i].key; i ++ )
289 buf = g_markup_printf_escaped( " %s=\"%s\"", node->attr[i].key, node->attr[i].value );
290 g_string_append( str, buf );
291 g_free( buf );
294 if( node->text == NULL && node->children == NULL )
296 g_string_append( str, "/>" );
297 return;
300 g_string_append( str, ">" );
301 if( node->text_len > 0 )
303 buf = g_markup_escape_text( node->text, node->text_len );
304 g_string_append( str, buf );
305 g_free( buf );
308 for( c = node->children; c; c = c->next )
309 xt_to_string_real( c, str );
311 g_string_append_printf( str, "</%s>", node->name );
314 char *xt_to_string( struct xt_node *node )
316 GString *ret;
317 char *real;
319 ret = g_string_new( "" );
320 xt_to_string_real( node, ret );
322 real = ret->str;
323 g_string_free( ret, FALSE );
325 return real;
328 void xt_print( struct xt_node *node )
330 int i;
331 struct xt_node *c;
333 /* Indentation */
334 for( c = node; c->parent; c = c->parent )
335 fprintf( stderr, " " );
337 /* Start the tag */
338 fprintf( stderr, "<%s", node->name );
340 /* Print the attributes */
341 for( i = 0; node->attr[i].key; i ++ )
343 char *v = g_markup_escape_text( node->attr[i].value, -1 );
344 fprintf( stderr, " %s=\"%s\"", node->attr[i].key, v );
345 g_free( v );
348 /* /> in case there's really *nothing* inside this tag, otherwise
349 just >. */
350 /* If this tag doesn't have any content at all... */
351 if( node->text == NULL && node->children == NULL )
353 fprintf( stderr, "/>\n" );
354 return;
355 /* Then we're finished! */
358 /* Otherwise... */
359 fprintf( stderr, ">" );
361 /* Only print the text if it contains more than whitespace (TEST). */
362 if( node->text_len > 0 )
364 for( i = 0; node->text[i] && isspace( node->text[i] ); i ++ );
365 if( node->text[i] )
367 char *v = g_markup_escape_text( node->text, -1 );
368 fprintf( stderr, "%s", v );
369 g_free( v );
373 if( node->children )
374 fprintf( stderr, "\n" );
376 for( c = node->children; c; c = c->next )
377 xt_print( c );
379 if( node->children )
380 for( c = node; c->parent; c = c->parent )
381 fprintf( stderr, " " );
383 /* Non-empty tag is now finished. */
384 fprintf( stderr, "</%s>\n", node->name );
387 struct xt_node *xt_dup( struct xt_node *node )
389 struct xt_node *dup = g_new0( struct xt_node, 1 );
390 struct xt_node *c, *dc = NULL;
391 int i;
393 /* Let's NOT copy the parent element here BTW! Only do it for children. */
395 dup->name = g_strdup( node->name );
396 dup->flags = node->flags;
397 if( node->text )
399 dup->text = g_memdup( node->text, node->text_len + 1 );
400 dup->text_len = node->text_len;
403 /* Count the number of attributes and allocate the new array. */
404 for( i = 0; node->attr[i].key; i ++ );
405 dup->attr = g_new0( struct xt_attr, i + 1 );
407 /* Copy them all! */
408 for( i --; i >= 0; i -- )
410 dup->attr[i].key = g_strdup( node->attr[i].key );
411 dup->attr[i].value = g_strdup( node->attr[i].value );
414 /* This nice mysterious loop takes care of the children. */
415 for( c = node->children; c; c = c->next )
417 if( dc == NULL )
418 dc = dup->children = xt_dup( c );
419 else
420 dc = ( dc->next = xt_dup( c ) );
422 dc->parent = dup;
425 return dup;
428 /* Frees a node. This doesn't clean up references to itself from parents! */
429 void xt_free_node( struct xt_node *node )
431 int i;
433 if( !node )
434 return;
436 g_free( node->name );
437 g_free( node->text );
439 for( i = 0; node->attr[i].key; i ++ )
441 g_free( node->attr[i].key );
442 g_free( node->attr[i].value );
444 g_free( node->attr );
446 while( node->children )
448 struct xt_node *next = node->children->next;
450 xt_free_node( node->children );
451 node->children = next;
454 g_free( node );
457 void xt_free( struct xt_parser *xt )
459 if( !xt )
460 return;
462 if( xt->root )
463 xt_free_node( xt->root );
465 g_markup_parse_context_free( xt->parser );
467 g_free( xt );
470 /* To find a node's child with a specific name, pass the node's children
471 list, not the node itself! The reason you have to do this by hand: So
472 that you can also use this function as a find-next. */
473 struct xt_node *xt_find_node( struct xt_node *node, const char *name )
475 while( node )
477 char *colon;
479 if( g_strcasecmp( node->name, name ) == 0 ||
480 ( ( colon = strchr( node->name, ':' ) ) &&
481 g_strcasecmp( colon + 1, name ) == 0 ) )
482 break;
484 node = node->next;
487 return node;
490 /* More advanced than the one above, understands something like
491 ../foo/bar to find a subnode bar of a node foo which is a child
492 of node's parent. Pass the node directly, not its list of children. */
493 struct xt_node *xt_find_path( struct xt_node *node, const char *name )
495 while( name && *name && node )
497 char *colon, *slash;
498 int n;
500 if( ( slash = strchr( name, '/' ) ) )
501 n = slash - name;
502 else
503 n = strlen( name );
505 if( strncmp( name, "..", n ) == 0 )
507 node = node->parent;
509 else
511 node = node->children;
513 while( node )
515 if( g_strncasecmp( node->name, name, n ) == 0 ||
516 ( ( colon = strchr( node->name, ':' ) ) &&
517 g_strncasecmp( colon + 1, name, n ) == 0 ) )
518 break;
520 node = node->next;
524 name = slash ? slash + 1 : NULL;
527 return node;
530 char *xt_find_attr( struct xt_node *node, const char *key )
532 int i;
533 char *colon;
535 if( !node )
536 return NULL;
538 for( i = 0; node->attr[i].key; i ++ )
539 if( g_strcasecmp( node->attr[i].key, key ) == 0 )
540 break;
542 /* This is an awful hack that only takes care of namespace prefixes
543 inside a tag. Since IMHO excessive namespace usage in XMPP is
544 massive overkill anyway (this code exists for almost four years
545 now and never really missed it): Meh. */
546 if( !node->attr[i].key && strcmp( key, "xmlns" ) == 0 &&
547 ( colon = strchr( node->name, ':' ) ) )
549 *colon = '\0';
550 for( i = 0; node->attr[i].key; i ++ )
551 if( strncmp( node->attr[i].key, "xmlns:", 6 ) == 0 &&
552 strcmp( node->attr[i].key + 6, node->name ) == 0 )
553 break;
554 *colon = ':';
557 return node->attr[i].value;
560 /* Strip a few non-printable characters that aren't allowed in XML streams
561 (and upset some XMPP servers for example). */
562 void xt_strip_text( char *in )
564 char *out = in;
565 static const char nonprint[32] = {
566 0, 0, 0, 0, 0, 0, 0, 0, /* 0..7 */
567 0, 1, 1, 0, 0, 1, 0, 0, /* 9 (tab), 10 (\n), 13 (\r) */
570 if( !in )
571 return;
573 while( *in )
575 if( (unsigned int) *in >= ' ' || nonprint[(unsigned int) *in] )
576 *out ++ = *in;
577 in ++;
579 *out = *in;
582 struct xt_node *xt_new_node( char *name, const char *text, struct xt_node *children )
584 struct xt_node *node, *c;
586 node = g_new0( struct xt_node, 1 );
587 node->name = g_strdup( name );
588 node->children = children;
589 node->attr = g_new0( struct xt_attr, 1 );
591 if( text )
593 node->text = g_strdup( text );
594 xt_strip_text( node->text );
595 node->text_len = strlen( node->text );
598 for( c = children; c; c = c->next )
600 if( c->parent != NULL )
602 /* ERROR CONDITION: They seem to have a parent already??? */
605 c->parent = node;
608 return node;
611 void xt_add_child( struct xt_node *parent, struct xt_node *child )
613 struct xt_node *node;
615 /* This function can actually be used to add more than one child, so
616 do handle this properly. */
617 for( node = child; node; node = node->next )
619 if( node->parent != NULL )
621 /* ERROR CONDITION: They seem to have a parent already??? */
624 node->parent = parent;
627 if( parent->children == NULL )
629 parent->children = child;
631 else
633 for( node = parent->children; node->next; node = node->next );
634 node->next = child;
638 /* Same, but at the beginning. */
639 void xt_insert_child( struct xt_node *parent, struct xt_node *child )
641 struct xt_node *node, *last = NULL;
643 if( child == NULL )
644 return; /* BUG */
646 for( node = child; node; node = node->next )
648 if( node->parent != NULL )
650 /* ERROR CONDITION: They seem to have a parent already??? */
653 node->parent = parent;
654 last = node;
657 last->next = parent->children;
658 parent->children = child;
661 void xt_add_attr( struct xt_node *node, const char *key, const char *value )
663 int i;
665 /* Now actually it'd be nice if we can also change existing attributes
666 (which actually means this function doesn't have the right name).
667 So let's find out if we have this attribute already... */
668 for( i = 0; node->attr[i].key; i ++ )
669 if( strcmp( node->attr[i].key, key ) == 0 )
670 break;
672 if( node->attr[i].key == NULL )
674 /* If not, allocate space for a new attribute. */
675 node->attr = g_renew( struct xt_attr, node->attr, i + 2 );
676 node->attr[i].key = g_strdup( key );
677 node->attr[i+1].key = NULL;
679 else
681 /* Otherwise, free the old value before setting the new one. */
682 g_free( node->attr[i].value );
685 node->attr[i].value = g_strdup( value );
688 int xt_remove_attr( struct xt_node *node, const char *key )
690 int i, last;
692 for( i = 0; node->attr[i].key; i ++ )
693 if( strcmp( node->attr[i].key, key ) == 0 )
694 break;
696 /* If we didn't find the attribute... */
697 if( node->attr[i].key == NULL )
698 return 0;
700 g_free( node->attr[i].key );
701 g_free( node->attr[i].value );
703 /* If it's the last, this is easy: */
704 if( node->attr[i+1].key == NULL )
706 node->attr[i].key = node->attr[i].value = NULL;
708 else /* It's also pretty easy, actually. */
710 /* Find the last item. */
711 for( last = i + 1; node->attr[last+1].key; last ++ );
713 node->attr[i] = node->attr[last];
714 node->attr[last].key = NULL;
715 node->attr[last].value = NULL;
718 /* Let's not bother with reallocating memory here. It takes time and
719 most packets don't stay in memory for long anyway. */
721 return 1;