From 5520a04d8933ce5ee4b1a894ac2127e4bfce7167 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Petr=20P=C3=ADsa=C5=99?= <petr.pisar@atlas.cz>
Date: Wed, 6 Jan 2010 21:14:44 +0100
Subject: [PATCH] Add XML parser preserving physical XML structure

This is necessary for message hash computation
---
 src/Makefile.deps |  23 +++++
 src/Makefile.inc  |   7 +-
 src/physxml.c     | 259 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 287 insertions(+), 2 deletions(-)
 create mode 100644 src/physxml.c

diff --git a/src/Makefile.deps b/src/Makefile.deps
index 23637c5..4d9e080 100644
--- a/src/Makefile.deps
+++ b/src/Makefile.deps
@@ -87,4 +87,27 @@ crypto.o: crypto.c isds_priv.h isds.h \
   /usr/include/libxml2/libxml/threads.h \
   /usr/include/libxml2/libxml/xpath.h \
   /usr/include/libxml2/libxml/xpathInternals.h \
+  /usr/include/libxml2/libxml/xmlsave.h utils.h \
+  /usr/include/gpgme/gpgme.h
+physxml.o: physxml.c isds_priv.h isds.h \
+  /usr/include/libxml2/libxml/parser.h \
+  /usr/include/libxml2/libxml/xmlversion.h \
+  /usr/include/libxml2/libxml/xmlexports.h \
+  /usr/include/libxml2/libxml/tree.h \
+  /usr/include/libxml2/libxml/xmlstring.h \
+  /usr/include/libxml2/libxml/xmlregexp.h \
+  /usr/include/libxml2/libxml/dict.h /usr/include/libxml2/libxml/hash.h \
+  /usr/include/libxml2/libxml/valid.h \
+  /usr/include/libxml2/libxml/xmlerror.h \
+  /usr/include/libxml2/libxml/list.h \
+  /usr/include/libxml2/libxml/xmlautomata.h \
+  /usr/include/libxml2/libxml/entities.h \
+  /usr/include/libxml2/libxml/encoding.h \
+  /usr/include/libxml2/libxml/xmlIO.h \
+  /usr/include/libxml2/libxml/globals.h /usr/include/libxml2/libxml/SAX.h \
+  /usr/include/libxml2/libxml/xlink.h /usr/include/libxml2/libxml/SAX2.h \
+  /usr/include/libxml2/libxml/xmlmemory.h \
+  /usr/include/libxml2/libxml/threads.h \
+  /usr/include/libxml2/libxml/xpath.h \
+  /usr/include/libxml2/libxml/xpathInternals.h \
   /usr/include/libxml2/libxml/xmlsave.h utils.h
diff --git a/src/Makefile.inc b/src/Makefile.inc
index 088a5eb..5e5a6b2 100644
--- a/src/Makefile.inc
+++ b/src/Makefile.inc
@@ -2,7 +2,8 @@ CFLAGS = -std=c99 -Wall -Werror -fPIC -DPIC -g
 # Large files needed by GPGME
 CFLAGS += -D_FILE_OFFSET_BITS=64 -DLARGEFILE_SOURCE=1
 SONAME = libisds.so.0
-SOURCES = isds.c utils.c soap.c validator.c cencode.c cdecode.c crypto.c
+SOURCES = isds.c utils.c soap.c validator.c cencode.c cdecode.c crypto.c \
+		  physxml.c
 PREFIX=/usr/local
 LIBDIR=$(PREFIX)/lib
 INCLUDEDIR=$(PREFIX)/include
@@ -23,9 +24,11 @@ LIBCURL_LDFLAGS := $(shell pkg-config --libs libcurl)
 LIBGCRYPT_LDFLAGS := $(shell libgcrypt-config --libs)
 LIBKSBA_LDFLAGS := $(shell ksba-config --libs)
 LIBGPGME_LDFLAGS := $(shell gpgme-config --libs)
+LIBEXPAT_LDFLAGS := -lexpat
 LDFLAGS += $(LIBCURL_LDFLAGS) $(LIBXML_LDFLAGS) $(LIBGCRYPT_LDFLAGS) \
-		   $(LIBGPGME_LDFLAGS)
+		   $(LIBGPGME_LDFLAGS) $(LIBEXPAT_LDFLAGS)
 ifdef ISDS_USE_KSBA
 	CFLAGS += $(LIBKSBA_LDFLAGS)
 endif
 
+# vim:ft=make
diff --git a/src/physxml.c b/src/physxml.c
new file mode 100644
index 0000000..ff4c00a
--- /dev/null
+++ b/src/physxml.c
@@ -0,0 +1,259 @@
+#define _POSIX_SOURCE   /* For strtok_r */
+#include "isds_priv.h"
+#include "utils.h"
+
+/*#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>*/
+#include <string.h>
+#include <expat.h>
+#include <inttypes.h>
+
+#define PHYSXML_ELEMENT_SEPARATOR "|"
+#define PHYSXML_NS_SEPARATOR ">"
+#define NS_CHAR_SEPARATOR '>'
+
+struct expat_data {
+    XML_Parser parser;
+    const XML_Char **elements;  /* NULL terminated array of elements */
+    _Bool found;
+    size_t *start;
+    size_t *end;
+    int depth;              /* Current parser depth, root element is 0 */
+    int element_depth;      /* elements[element_depth] we are in,
+                                -1 if we are not in any (root mismatch)*/
+};
+
+
+/* Check for expat compile-time configuration */
+_hidden isds_error expat_init(void) {
+    XML_Expat_Version current;
+    const int min_major = 1;
+    const int min_minor = 95;
+    const int min_micro = 8;
+    const XML_Feature *features; /* Static array stored in expat BSS */
+    _Bool ns_supported = 0;
+
+ /*
+ * Max(XML_Size) <= Max(size_t)
+ * XML_Char is char, not a wchar_t
+ * XML_UNICODE is undefined (i.e. strings in UTF-8)
+ * */
+
+    /* Check minimal expat version */
+    current = XML_ExpatVersionInfo();
+    if ( (current.major < min_major) ||
+            (current.major == min_major && current.minor < min_minor) ||
+            (current.major == min_major && current.minor == min_minor &&
+                current.micro < min_micro) ) {
+        isds_log(ILF_ISDS, ILL_CRIT,
+                _("Minimal %d.%d.%d Expat version required. "
+                "Current version is %d.%d.%d\n"),
+                min_major, min_minor, min_micro,
+                current.major, current.minor, current.micro);
+        return IE_ERROR;
+    }
+
+    /* XML_Char must be char, not a wchar_t */
+    features = XML_GetFeatureList();
+    while (features->feature != XML_FEATURE_END) {
+        switch (features->feature) {
+            case XML_FEATURE_UNICODE_WCHAR_T:
+            case XML_FEATURE_UNICODE:
+                isds_log(ILF_ISDS, ILL_CRIT,
+                        _("Expat compiled with UTF-16 (wide) characters\n"));
+                return IE_ERROR;
+                break;
+            case XML_FEATURE_SIZEOF_XML_CHAR:
+                if (features->value != sizeof(char)) {
+                    isds_log(ILF_ISDS, ILL_CRIT,
+                            "Expat compiled with XML_Chars incompatible "
+                            "with chars\n");
+                    return IE_ERROR;
+                }
+                break;
+            case XML_FEATURE_NS:
+                ns_supported = 1;
+            default:
+                break;
+        }
+        features++;
+    }
+
+    if (!ns_supported) {
+        isds_log(ILF_ISDS, ILL_CRIT,
+                _("Expat not compiled with name space support\n"));
+        return IE_ERROR;
+    }
+
+    return IE_SUCCESS;
+}
+
+
+/* Breaks element path address into NULL terminated array of elements in
+ * preserved order. Zeroth array element will be first path element.
+ * @path  element address, content will be damaged
+ * @return array of elements, NULL in case of error */
+static const XML_Char **path2elements(XML_Char *path) {
+    const XML_Char **elements = NULL;
+    XML_Char *tmp_path;
+    char *saveptr = NULL;
+    XML_Char *element;
+    unsigned int depth = 0;
+
+    if (!path) return NULL;
+
+    elements = malloc(sizeof(elements[0]) * (strlen(path) + 1));
+    if (!elements) return NULL;
+
+    elements[0] = NULL;
+
+    tmp_path = path;
+    while ((element = (XML_Char *) strtok_r(tmp_path,
+                    PHYSXML_ELEMENT_SEPARATOR, &saveptr))) {
+        tmp_path = NULL;
+        elements[depth++] = element;
+    }
+
+    elements[depth] = NULL;
+    return elements;
+}
+
+
+/* Examine start and empty element tag.
+ * @name is expanded name */
+static void XMLCALL element_start(void *userData, const XML_Char *name,
+        const XML_Char **atts) {
+    struct expat_data *data = (struct expat_data *) userData;
+    data->depth++;
+
+    const XML_Index index = XML_GetCurrentByteIndex(data->parser);
+    /*const int count = XML_GetCurrentByteCount(data->parser);*/
+    /* XXX: Because document length is stored as size_t, index always fits
+     * size_t. */
+    const size_t boundary = index; 
+
+    /*printf("Start: name=%s, depth=%zd, offset=%#jx "
+            "count=%u => boundary=%#zx\n",
+            name, data->depth, (uintmax_t)index, count, boundary); */
+
+    if ((!data->found) &&
+            (data->depth == data->element_depth + 1) &&
+            (!strcmp(data->elements[data->element_depth + 1], name))) {
+        data->element_depth++;
+
+        /*printf("! Start tag for element `%s' found\n",
+                data->elements[data->element_depth]);*/
+        
+        if (!data->elements[data->element_depth + 1]) {
+            data->found = 1;
+            *data->start = boundary;
+        }
+    }
+}
+
+
+/* Examine end and empty element tag.
+ * @name is expanded name */
+static void XMLCALL element_end(void *userData, const XML_Char *name) {
+
+    struct expat_data *data = (struct expat_data *) userData;
+    enum XML_Status xerr;
+
+    const XML_Index index = (uintmax_t) XML_GetCurrentByteIndex(data->parser);
+    const int count = XML_GetCurrentByteCount(data->parser);
+    /* XXX: Because document length is stored as size_t, index + count always
+     * fits size_t. */
+    const size_t boundary = index + count - 1; 
+
+    /*printf("End:   name=%s, depth=%zd, offset=%#jx "
+            "count=%u => boundary=%#zx\n",
+            name, data->depth, (uintmax_t)index, count, boundary);*/
+
+    if (data->element_depth == data->depth) {
+        if (data->found) {
+            /*printf("! End tag for element `%s' found\n",
+                    data->elements[data->element_depth]);*/
+            *data->end = boundary;
+
+            /* Here we can stop parser
+             * XXX: requires Expat 1.95.8 */
+            xerr = XML_StopParser(data->parser, XML_FALSE);
+            if (xerr != XML_STATUS_OK) {
+                PANIC(_("Error while stopping parser\n"));
+            }
+
+        }
+        data->element_depth--;
+    }
+
+    data->depth--;
+}
+
+
+/* Locate element specified by element path in XML stream.
+ * TODO: Support other encodings than UTF-8
+ * @document is XML documuent as bitstream
+ * @length is size of @docuement in bytes. Zero length is forbidden.
+ * @path is special path (e.g. "|html|head|title",
+ * quallified element names are specified as
+ * NSURI '>' LOCALNAME, ommit NSURI and '>' separator if no namespace
+ * should be addressed (i.e. use only locale name)
+ * You can use PHYSXML_ELEMENT_SEPARATOR and PHYSXML_NS_SEPARATOR string
+ * macros.
+ * @start outputs start of the element location in @document (inclusive,
+ * counts from 0)
+ * @end outputs end of element (inclusive, counts from 0)
+ * @return 0 if element found */
+_hidden isds_error find_element_boundary(void *document, size_t length,
+        char *path, size_t *start, size_t *end) {
+    
+    XML_Parser parser;
+    enum XML_Status xerr;
+    struct expat_data user_data;
+
+    if (!document || !path || !start || !end || length <= 0)
+        return IE_INVAL;
+
+    /* Parse XPath */
+    user_data.elements = path2elements(path);
+    if (!user_data.elements) return IE_NOMEM;
+
+    /* No element means whole document */
+    if (!user_data.elements[0]) {
+        free(user_data.elements);
+        *start = 0;
+        *end = length - 1;
+        return IE_SUCCESS;
+    }
+
+    /* Create parser*/
+    parser = XML_ParserCreateNS(NULL, NS_CHAR_SEPARATOR);
+
+    XML_SetStartElementHandler(parser, element_start);
+    XML_SetEndElementHandler(parser, element_end);
+    
+    user_data.parser = parser;
+    user_data.found = 0;
+    user_data.start = start;
+    user_data.end = end;
+    user_data.depth = -1;
+    user_data.element_depth = -1;
+    XML_SetUserData(parser, &user_data);
+
+    /* Parse it */
+    xerr = XML_Parse(parser, (const char *) document, length, 1);
+    if (xerr != XML_STATUS_OK &&
+            !( (xerr == XML_STATUS_ERROR &&
+                    XML_GetErrorCode(parser) == XML_ERROR_ABORTED))) {
+        free(user_data.elements);
+        isds_log(ILF_ISDS, ILL_CRIT, _("XML_Parse failed\n"));
+        return IE_ERROR;
+    }
+    free(user_data.elements);
+
+    XML_ParserFree(parser);
+    if (user_data.found) return IE_SUCCESS;
+    else return IE_NOEXIST;
+}
+
-- 
2.11.4.GIT