4 * Parser interface for DOM-based parser (libxml) rather than
5 * stream-based SAX-type parser
9 #include "access/htup_details.h"
10 #include "executor/spi.h"
13 #include "lib/stringinfo.h"
14 #include "miscadmin.h"
15 #include "utils/builtins.h"
16 #include "utils/xml.h"
20 #include <libxml/xpath.h>
21 #include <libxml/tree.h>
22 #include <libxml/xmlmemory.h>
23 #include <libxml/xmlerror.h>
24 #include <libxml/parserInternals.h>
28 /* exported for use by xslt_proc.c */
30 PgXmlErrorContext
*pgxml_parser_init(PgXmlStrictness strictness
);
32 /* workspace for pgxml_xpath() */
37 xmlXPathContextPtr ctxt
;
38 xmlXPathObjectPtr res
;
41 /* local declarations */
43 static xmlChar
*pgxmlNodeSetToText(xmlNodeSetPtr nodeset
,
44 xmlChar
*toptagname
, xmlChar
*septagname
,
47 static text
*pgxml_result_to_text(xmlXPathObjectPtr res
, xmlChar
*toptag
,
48 xmlChar
*septag
, xmlChar
*plainsep
);
50 static xmlChar
*pgxml_texttoxmlchar(text
*textstring
);
52 static xmlXPathObjectPtr
pgxml_xpath(text
*document
, xmlChar
*xpath
,
53 xpath_workspace
*workspace
);
55 static void cleanup_workspace(xpath_workspace
*workspace
);
59 * Initialize for xml parsing.
61 * As with the underlying pg_xml_init function, calls to this MUST be followed
62 * by a PG_TRY block that guarantees that pg_xml_done is called.
65 pgxml_parser_init(PgXmlStrictness strictness
)
67 PgXmlErrorContext
*xmlerrcxt
;
69 /* Set up error handling (we share the core's error handler) */
70 xmlerrcxt
= pg_xml_init(strictness
);
72 /* Note: we're assuming an elog cannot be thrown by the following calls */
74 /* Initialize libxml */
81 /* Encodes special characters (<, >, &, " and \r) as XML entities */
83 PG_FUNCTION_INFO_V1(xml_encode_special_chars
);
86 xml_encode_special_chars(PG_FUNCTION_ARGS
)
88 text
*tin
= PG_GETARG_TEXT_PP(0);
93 ts
= pgxml_texttoxmlchar(tin
);
95 tt
= xmlEncodeSpecialChars(NULL
, ts
);
99 tout
= cstring_to_text((char *) tt
);
103 PG_RETURN_TEXT_P(tout
);
107 * Function translates a nodeset into a text representation
109 * iterates over each node in the set and calls xmlNodeDump to write it to
110 * an xmlBuffer, from which an xmlChar * string is returned.
112 * each representation is surrounded by <tagname> ... </tagname>
114 * plainsep is an ordinary (non-tag) separator; if it is used, nodes are
115 * cast to string as the output method
118 pgxmlNodeSetToText(xmlNodeSetPtr nodeset
,
127 buf
= xmlBufferCreate();
129 if ((toptagname
!= NULL
) && (xmlStrlen(toptagname
) > 0))
131 xmlBufferWriteChar(buf
, "<");
132 xmlBufferWriteCHAR(buf
, toptagname
);
133 xmlBufferWriteChar(buf
, ">");
137 for (i
= 0; i
< nodeset
->nodeNr
; i
++)
139 if (plainsep
!= NULL
)
141 xmlBufferWriteCHAR(buf
,
142 xmlXPathCastNodeToString(nodeset
->nodeTab
[i
]));
144 /* If this isn't the last entry, write the plain sep. */
145 if (i
< (nodeset
->nodeNr
) - 1)
146 xmlBufferWriteChar(buf
, (char *) plainsep
);
150 if ((septagname
!= NULL
) && (xmlStrlen(septagname
) > 0))
152 xmlBufferWriteChar(buf
, "<");
153 xmlBufferWriteCHAR(buf
, septagname
);
154 xmlBufferWriteChar(buf
, ">");
157 nodeset
->nodeTab
[i
]->doc
,
161 if ((septagname
!= NULL
) && (xmlStrlen(septagname
) > 0))
163 xmlBufferWriteChar(buf
, "</");
164 xmlBufferWriteCHAR(buf
, septagname
);
165 xmlBufferWriteChar(buf
, ">");
171 if ((toptagname
!= NULL
) && (xmlStrlen(toptagname
) > 0))
173 xmlBufferWriteChar(buf
, "</");
174 xmlBufferWriteCHAR(buf
, toptagname
);
175 xmlBufferWriteChar(buf
, ">");
177 result
= xmlStrdup(buf
->content
);
183 /* Translate a PostgreSQL "varlena" -i.e. a variable length parameter
184 * into the libxml2 representation
187 pgxml_texttoxmlchar(text
*textstring
)
189 return (xmlChar
*) text_to_cstring(textstring
);
192 /* Publicly visible XPath functions */
195 * This is a "raw" xpath function. Check that it returns child elements
198 PG_FUNCTION_INFO_V1(xpath_nodeset
);
201 xpath_nodeset(PG_FUNCTION_ARGS
)
203 text
*document
= PG_GETARG_TEXT_PP(0);
204 text
*xpathsupp
= PG_GETARG_TEXT_PP(1); /* XPath expression */
205 xmlChar
*toptag
= pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(2));
206 xmlChar
*septag
= pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(3));
209 xmlXPathObjectPtr res
;
210 xpath_workspace workspace
;
212 xpath
= pgxml_texttoxmlchar(xpathsupp
);
214 res
= pgxml_xpath(document
, xpath
, &workspace
);
216 xpres
= pgxml_result_to_text(res
, toptag
, septag
, NULL
);
218 cleanup_workspace(&workspace
);
224 PG_RETURN_TEXT_P(xpres
);
228 * The following function is almost identical, but returns the elements in
231 PG_FUNCTION_INFO_V1(xpath_list
);
234 xpath_list(PG_FUNCTION_ARGS
)
236 text
*document
= PG_GETARG_TEXT_PP(0);
237 text
*xpathsupp
= PG_GETARG_TEXT_PP(1); /* XPath expression */
238 xmlChar
*plainsep
= pgxml_texttoxmlchar(PG_GETARG_TEXT_PP(2));
241 xmlXPathObjectPtr res
;
242 xpath_workspace workspace
;
244 xpath
= pgxml_texttoxmlchar(xpathsupp
);
246 res
= pgxml_xpath(document
, xpath
, &workspace
);
248 xpres
= pgxml_result_to_text(res
, NULL
, NULL
, plainsep
);
250 cleanup_workspace(&workspace
);
256 PG_RETURN_TEXT_P(xpres
);
260 PG_FUNCTION_INFO_V1(xpath_string
);
263 xpath_string(PG_FUNCTION_ARGS
)
265 text
*document
= PG_GETARG_TEXT_PP(0);
266 text
*xpathsupp
= PG_GETARG_TEXT_PP(1); /* XPath expression */
270 xmlXPathObjectPtr res
;
271 xpath_workspace workspace
;
273 pathsize
= VARSIZE_ANY_EXHDR(xpathsupp
);
276 * We encapsulate the supplied path with "string()" = 8 chars + 1 for NUL
279 /* We could try casting to string using the libxml function? */
281 xpath
= (xmlChar
*) palloc(pathsize
+ 9);
282 memcpy((char *) xpath
, "string(", 7);
283 memcpy((char *) (xpath
+ 7), VARDATA_ANY(xpathsupp
), pathsize
);
284 xpath
[pathsize
+ 7] = ')';
285 xpath
[pathsize
+ 8] = '\0';
287 res
= pgxml_xpath(document
, xpath
, &workspace
);
289 xpres
= pgxml_result_to_text(res
, NULL
, NULL
, NULL
);
291 cleanup_workspace(&workspace
);
297 PG_RETURN_TEXT_P(xpres
);
301 PG_FUNCTION_INFO_V1(xpath_number
);
304 xpath_number(PG_FUNCTION_ARGS
)
306 text
*document
= PG_GETARG_TEXT_PP(0);
307 text
*xpathsupp
= PG_GETARG_TEXT_PP(1); /* XPath expression */
310 xmlXPathObjectPtr res
;
311 xpath_workspace workspace
;
313 xpath
= pgxml_texttoxmlchar(xpathsupp
);
315 res
= pgxml_xpath(document
, xpath
, &workspace
);
322 fRes
= xmlXPathCastToNumber(res
);
324 cleanup_workspace(&workspace
);
326 if (xmlXPathIsNaN(fRes
))
329 PG_RETURN_FLOAT4(fRes
);
333 PG_FUNCTION_INFO_V1(xpath_bool
);
336 xpath_bool(PG_FUNCTION_ARGS
)
338 text
*document
= PG_GETARG_TEXT_PP(0);
339 text
*xpathsupp
= PG_GETARG_TEXT_PP(1); /* XPath expression */
342 xmlXPathObjectPtr res
;
343 xpath_workspace workspace
;
345 xpath
= pgxml_texttoxmlchar(xpathsupp
);
347 res
= pgxml_xpath(document
, xpath
, &workspace
);
352 PG_RETURN_BOOL(false);
354 bRes
= xmlXPathCastToBoolean(res
);
356 cleanup_workspace(&workspace
);
358 PG_RETURN_BOOL(bRes
);
363 /* Core function to evaluate XPath query */
365 static xmlXPathObjectPtr
366 pgxml_xpath(text
*document
, xmlChar
*xpath
, xpath_workspace
*workspace
)
368 int32 docsize
= VARSIZE_ANY_EXHDR(document
);
369 PgXmlErrorContext
*xmlerrcxt
;
370 xmlXPathCompExprPtr comppath
;
372 workspace
->doctree
= NULL
;
373 workspace
->ctxt
= NULL
;
374 workspace
->res
= NULL
;
376 xmlerrcxt
= pgxml_parser_init(PG_XML_STRICTNESS_LEGACY
);
380 workspace
->doctree
= xmlReadMemory((char *) VARDATA_ANY(document
),
383 if (workspace
->doctree
!= NULL
)
385 workspace
->ctxt
= xmlXPathNewContext(workspace
->doctree
);
386 workspace
->ctxt
->node
= xmlDocGetRootElement(workspace
->doctree
);
388 /* compile the path */
389 comppath
= xmlXPathCompile(xpath
);
390 if (comppath
== NULL
)
391 xml_ereport(xmlerrcxt
, ERROR
, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION
,
392 "XPath Syntax Error");
394 /* Now evaluate the path expression. */
395 workspace
->res
= xmlXPathCompiledEval(comppath
, workspace
->ctxt
);
397 xmlXPathFreeCompExpr(comppath
);
402 cleanup_workspace(workspace
);
404 pg_xml_done(xmlerrcxt
, true);
410 if (workspace
->res
== NULL
)
411 cleanup_workspace(workspace
);
413 pg_xml_done(xmlerrcxt
, false);
415 return workspace
->res
;
418 /* Clean up after processing the result of pgxml_xpath() */
420 cleanup_workspace(xpath_workspace
*workspace
)
423 xmlXPathFreeObject(workspace
->res
);
424 workspace
->res
= NULL
;
426 xmlXPathFreeContext(workspace
->ctxt
);
427 workspace
->ctxt
= NULL
;
428 if (workspace
->doctree
)
429 xmlFreeDoc(workspace
->doctree
);
430 workspace
->doctree
= NULL
;
434 pgxml_result_to_text(xmlXPathObjectPtr res
,
448 xpresstr
= pgxmlNodeSetToText(res
->nodesetval
,
454 xpresstr
= xmlStrdup(res
->stringval
);
458 elog(NOTICE
, "unsupported XQuery result: %d", res
->type
);
459 xpresstr
= xmlStrdup((const xmlChar
*) "<unsupported/>");
462 /* Now convert this result back to text */
463 xpres
= cstring_to_text((char *) xpresstr
);
465 /* Free various storage */
472 * xpath_table is a table function. It needs some tidying (as do the
473 * other functions here!)
475 PG_FUNCTION_INFO_V1(xpath_table
);
478 xpath_table(PG_FUNCTION_ARGS
)
480 /* Function parameters */
481 char *pkeyfield
= text_to_cstring(PG_GETARG_TEXT_PP(0));
482 char *xmlfield
= text_to_cstring(PG_GETARG_TEXT_PP(1));
483 char *relname
= text_to_cstring(PG_GETARG_TEXT_PP(2));
484 char *xpathset
= text_to_cstring(PG_GETARG_TEXT_PP(3));
485 char *condition
= text_to_cstring(PG_GETARG_TEXT_PP(4));
487 /* SPI (input tuple) support */
488 SPITupleTable
*tuptable
;
490 TupleDesc spi_tupdesc
;
493 ReturnSetInfo
*rsinfo
= (ReturnSetInfo
*) fcinfo
->resultinfo
;
494 AttInMetadata
*attinmeta
;
499 const char *pathsep
= "|";
505 int rownr
; /* For issuing multiple rows from one original
507 bool had_values
; /* To determine end of nodeset results */
508 StringInfoData query_buf
;
509 PgXmlErrorContext
*xmlerrcxt
;
510 volatile xmlDocPtr doctree
= NULL
;
512 InitMaterializedSRF(fcinfo
, MAT_SRF_USE_EXPECTED_DESC
);
514 /* must have at least one output column (for the pkey) */
515 if (rsinfo
->setDesc
->natts
< 1)
517 (errcode(ERRCODE_SYNTAX_ERROR
),
518 errmsg("xpath_table must have at least one output column")));
521 * At the moment we assume that the returned attributes make sense for the
522 * XPath specified (i.e. we trust the caller). It's not fatal if they get
523 * it wrong - the input function for the column type will raise an error
524 * if the path result can't be converted into the correct binary
528 attinmeta
= TupleDescGetAttInMetadata(rsinfo
->setDesc
);
530 values
= (char **) palloc(rsinfo
->setDesc
->natts
* sizeof(char *));
531 xpaths
= (xmlChar
**) palloc(rsinfo
->setDesc
->natts
* sizeof(xmlChar
*));
534 * Split XPaths. xpathset is a writable CString.
536 * Note that we stop splitting once we've done all needed for tupdesc
540 while (numpaths
< (rsinfo
->setDesc
->natts
- 1))
542 xpaths
[numpaths
++] = (xmlChar
*) pos
;
543 pos
= strstr(pos
, pathsep
);
553 /* Now build query */
554 initStringInfo(&query_buf
);
556 /* Build initial sql statement */
557 appendStringInfo(&query_buf
, "SELECT %s, %s FROM %s WHERE %s",
563 if ((ret
= SPI_connect()) < 0)
564 elog(ERROR
, "xpath_table: SPI_connect returned %d", ret
);
566 if ((ret
= SPI_exec(query_buf
.data
, 0)) != SPI_OK_SELECT
)
567 elog(ERROR
, "xpath_table: SPI execution failed for query %s",
570 proc
= SPI_processed
;
571 tuptable
= SPI_tuptable
;
572 spi_tupdesc
= tuptable
->tupdesc
;
575 * Check that SPI returned correct result. If you put a comma into one of
576 * the function parameters, this will catch it when the SPI query returns
579 if (spi_tupdesc
->natts
!= 2)
581 ereport(ERROR
, (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
582 errmsg("expression returning multiple columns is not valid in parameter list"),
583 errdetail("Expected two columns in SPI result, got %d.", spi_tupdesc
->natts
)));
587 * Setup the parser. This should happen after we are done evaluating the
588 * query, in case it calls functions that set up libxml differently.
590 xmlerrcxt
= pgxml_parser_init(PG_XML_STRICTNESS_LEGACY
);
594 /* For each row i.e. document returned from SPI */
597 for (i
= 0; i
< proc
; i
++)
601 xmlXPathContextPtr ctxt
;
602 xmlXPathObjectPtr res
;
604 xmlXPathCompExprPtr comppath
;
607 /* Extract the row data as C Strings */
608 spi_tuple
= tuptable
->vals
[i
];
609 pkey
= SPI_getvalue(spi_tuple
, spi_tupdesc
, 1);
610 xmldoc
= SPI_getvalue(spi_tuple
, spi_tupdesc
, 2);
613 * Clear the values array, so that not-well-formed documents
614 * return NULL in all columns. Note that this also means that
615 * spare columns will be NULL.
617 for (j
= 0; j
< rsinfo
->setDesc
->natts
; j
++)
620 /* Insert primary key */
623 /* Parse the document */
625 doctree
= xmlReadMemory(xmldoc
, strlen(xmldoc
),
628 else /* treat NULL as not well-formed */
633 /* not well-formed, so output all-NULL tuple */
634 ret_tuple
= BuildTupleFromCStrings(attinmeta
, values
);
635 tuplestore_puttuple(rsinfo
->setResult
, ret_tuple
);
636 heap_freetuple(ret_tuple
);
640 /* New loop here - we have to deal with nodeset results */
645 /* Now evaluate the set of xpaths. */
647 for (j
= 0; j
< numpaths
; j
++)
649 ctxt
= xmlXPathNewContext(doctree
);
650 ctxt
->node
= xmlDocGetRootElement(doctree
);
652 /* compile the path */
653 comppath
= xmlXPathCompile(xpaths
[j
]);
654 if (comppath
== NULL
)
655 xml_ereport(xmlerrcxt
, ERROR
,
656 ERRCODE_EXTERNAL_ROUTINE_EXCEPTION
,
657 "XPath Syntax Error");
659 /* Now evaluate the path expression. */
660 res
= xmlXPathCompiledEval(comppath
, ctxt
);
661 xmlXPathFreeCompExpr(comppath
);
668 /* We see if this nodeset has enough nodes */
669 if (res
->nodesetval
!= NULL
&&
670 rownr
< res
->nodesetval
->nodeNr
)
672 resstr
= xmlXPathCastNodeToString(res
->nodesetval
->nodeTab
[rownr
]);
681 resstr
= xmlStrdup(res
->stringval
);
685 elog(NOTICE
, "unsupported XQuery result: %d", res
->type
);
686 resstr
= xmlStrdup((const xmlChar
*) "<unsupported/>");
690 * Insert this into the appropriate column in the
693 values
[j
+ 1] = (char *) resstr
;
695 xmlXPathFreeContext(ctxt
);
698 /* Now add the tuple to the output, if there is one. */
701 ret_tuple
= BuildTupleFromCStrings(attinmeta
, values
);
702 tuplestore_puttuple(rsinfo
->setResult
, ret_tuple
);
703 heap_freetuple(ret_tuple
);
707 } while (had_values
);
725 pg_xml_done(xmlerrcxt
, true);
734 pg_xml_done(xmlerrcxt
, false);
739 * SFRM_Materialize mode expects us to return a NULL Datum. The actual
740 * tuples are in our tuplestore and passed back through rsinfo->setResult.
741 * rsinfo->setDesc is set to the tuple description that we actually used
742 * to build our tuples with, so the caller can verify we did what it was