/*
 * contrib/xml2/xpath.c
 *
 * Parser interface for DOM-based parser (libxml) rather than
 * stream-based SAX-type parser
 */
#include "postgres.h"
#include "knl/knl_variable.h"

#include "executor/spi.h"
#include "fmgr.h"
#include "funcapi.h"
#include "lib/stringinfo.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/xml.h"

/*
 * libxml includes
 *
 * These provide the xmlXPath*, xmlDoc*, xmlBuffer*, xmlFree and parser
 * entry points used throughout this file.
 */
#include <libxml/xpath.h>
#include <libxml/tree.h>
#include <libxml/xmlmemory.h>
#include <libxml/xmlerror.h>
#include <libxml/parserInternals.h>

PG_MODULE_MAGIC;

/* externally accessible functions */
Datum xml_is_well_formed(PG_FUNCTION_ARGS);
Datum xml_encode_special_chars(PG_FUNCTION_ARGS);
Datum xpath_nodeset(PG_FUNCTION_ARGS);
Datum xpath_string(PG_FUNCTION_ARGS);
Datum xpath_number(PG_FUNCTION_ARGS);
Datum xpath_bool(PG_FUNCTION_ARGS);
Datum xpath_list(PG_FUNCTION_ARGS);
Datum xpath_table(PG_FUNCTION_ARGS);

/* exported for use by xslt_proc.c */
PgXmlErrorContext* pgxml_parser_init(PgXmlStrictness strictness);

/*
 * workspace for pgxml_xpath()
 *
 * Holds the libxml objects created while evaluating one XPath expression so
 * that cleanup_workspace() can release all of them in one place.
 */
typedef struct {
    xmlDocPtr doctree;       /* parsed document, or NULL if parsing failed */
    xmlXPathContextPtr ctxt; /* XPath context created for doctree */
    xmlXPathObjectPtr res;   /* result of evaluating the expression */
} xpath_workspace;

/* local declarations */
static xmlChar* pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlChar* toptagname, xmlChar* septagname, xmlChar* plainsep);
static text* pgxml_result_to_text(xmlXPathObjectPtr res, xmlChar* toptag, xmlChar* septag, xmlChar* plainsep);
static xmlChar* pgxml_texttoxmlchar(text* textstring);
static xmlXPathObjectPtr pgxml_xpath(text* document, xmlChar* xpath, xpath_workspace* workspace);
static void cleanup_workspace(xpath_workspace* workspace);

/*
 * Initialize for xml parsing.
 *
 * As with the underlying pg_xml_init function, calls to this MUST be followed
 * by a PG_TRY block that guarantees that pg_xml_done is called.
*/ PgXmlErrorContext* pgxml_parser_init(PgXmlStrictness strictness) { PgXmlErrorContext* xmlerrcxt = NULL; /* Set up error handling (we share the core's error handler) */ xmlerrcxt = pg_xml_init(strictness); /* Note: we're assuming an elog cannot be thrown by the following calls */ /* Initialize libxml */ xmlInitParser(); xmlSubstituteEntitiesDefault(1); xmlLoadExtDtdDefaultValue = 1; return xmlerrcxt; } /* * Returns true if document is well-formed * * Note: this has been superseded by a core function. We still have to * have it in the contrib module so that existing SQL-level references * to the function won't fail; but in normal usage with up-to-date SQL * definitions for the contrib module, this won't be called. */ PG_FUNCTION_INFO_V1(xml_is_well_formed); Datum xml_is_well_formed(PG_FUNCTION_ARGS) { text* t = PG_GETARG_TEXT_P(0); /* document buffer */ bool result = false; int32 docsize = VARSIZE(t) - VARHDRSZ; xmlDocPtr doctree; PgXmlErrorContext* xmlerrcxt = NULL; xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); PG_TRY(); { doctree = xmlParseMemory((char*)VARDATA(t), docsize); result = (doctree != NULL); if (doctree != NULL) xmlFreeDoc(doctree); } PG_CATCH(); { pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } PG_END_TRY(); pg_xml_done(xmlerrcxt, false); PG_RETURN_BOOL(result); } /* Encodes special characters (<, >, &, " and \r) as XML entities */ PG_FUNCTION_INFO_V1(xml_encode_special_chars); Datum xml_encode_special_chars(PG_FUNCTION_ARGS) { text* tin = PG_GETARG_TEXT_P(0); text* tout = NULL; xmlChar *ts, *tt; ts = pgxml_texttoxmlchar(tin); tt = xmlEncodeSpecialChars(NULL, ts); pfree(ts); tout = cstring_to_text((char*)tt); xmlFree(tt); PG_RETURN_TEXT_P(tout); } /* * Function translates a nodeset into a text representation * * iterates over each node in the set and calls xmlNodeDump to write it to * an xmlBuffer -from which an xmlChar * string is returned. * * each representation is surrounded by ... 
* * plainsep is an ordinary (not tag) separator - if used, then nodes are * cast to string as output method */ static xmlChar* pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlChar* toptagname, xmlChar* septagname, xmlChar* plainsep) { xmlBufferPtr buf; xmlChar* result = NULL; int i; buf = xmlBufferCreate(); if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) { xmlBufferWriteChar(buf, "<"); xmlBufferWriteCHAR(buf, toptagname); xmlBufferWriteChar(buf, ">"); } if (nodeset != NULL) { for (i = 0; i < nodeset->nodeNr; i++) { if (plainsep != NULL) { xmlBufferWriteCHAR(buf, xmlXPathCastNodeToString(nodeset->nodeTab[i])); /* If this isn't the last entry, write the plain sep. */ if (i < (nodeset->nodeNr) - 1) xmlBufferWriteChar(buf, (char*)plainsep); } else { if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) { xmlBufferWriteChar(buf, "<"); xmlBufferWriteCHAR(buf, septagname); xmlBufferWriteChar(buf, ">"); } xmlNodeDump(buf, nodeset->nodeTab[i]->doc, nodeset->nodeTab[i], 1, 0); if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) { xmlBufferWriteChar(buf, ""); } } } } if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) { xmlBufferWriteChar(buf, ""); } result = xmlStrdup(buf->content); xmlBufferFree(buf); return result; } /* Translate a PostgreSQL "varlena" -i.e. a variable length parameter * into the libxml2 representation */ static xmlChar* pgxml_texttoxmlchar(text* textstring) { return (xmlChar*)text_to_cstring(textstring); } /* Publicly visible XPath functions */ /* * This is a "raw" xpath function. 
Check that it returns child elements * properly */ PG_FUNCTION_INFO_V1(xpath_nodeset); Datum xpath_nodeset(PG_FUNCTION_ARGS) { text* document = PG_GETARG_TEXT_P(0); text* xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ xmlChar* toptag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2)); xmlChar* septag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(3)); xmlChar* xpath = NULL; text* xpres = NULL; xmlXPathObjectPtr res; xpath_workspace workspace; xpath = pgxml_texttoxmlchar(xpathsupp); res = pgxml_xpath(document, xpath, &workspace); xpres = pgxml_result_to_text(res, toptag, septag, NULL); cleanup_workspace(&workspace); pfree(xpath); if (xpres == NULL) PG_RETURN_NULL(); PG_RETURN_TEXT_P(xpres); } /* * The following function is almost identical, but returns the elements in * a list. */ PG_FUNCTION_INFO_V1(xpath_list); Datum xpath_list(PG_FUNCTION_ARGS) { text* document = PG_GETARG_TEXT_P(0); text* xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ xmlChar* plainsep = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2)); xmlChar* xpath = NULL; text* xpres = NULL; xmlXPathObjectPtr res; xpath_workspace workspace; xpath = pgxml_texttoxmlchar(xpathsupp); res = pgxml_xpath(document, xpath, &workspace); xpres = pgxml_result_to_text(res, NULL, NULL, plainsep); cleanup_workspace(&workspace); pfree(xpath); if (xpres == NULL) PG_RETURN_NULL(); PG_RETURN_TEXT_P(xpres); } PG_FUNCTION_INFO_V1(xpath_string); Datum xpath_string(PG_FUNCTION_ARGS) { text* document = PG_GETARG_TEXT_P(0); text* xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ xmlChar* xpath = NULL; int32 pathsize; text* xpres = NULL; xmlXPathObjectPtr res; xpath_workspace workspace; pathsize = VARSIZE(xpathsupp) - VARHDRSZ; /* * We encapsulate the supplied path with "string()" = 8 chars + 1 for NUL * at end */ /* We could try casting to string using the libxml function? 
*/ xpath = (xmlChar*)palloc(pathsize + 9); errno_t rc = strncpy_s((char*)xpath, pathsize + 9, "string(", 7); securec_check_c(rc, "", ""); rc = memcpy_s((char*)(xpath + 7), pathsize + 2, VARDATA(xpathsupp), pathsize); securec_check_c(rc, "", ""); xpath[pathsize + 7] = ')'; xpath[pathsize + 8] = '\0'; res = pgxml_xpath(document, xpath, &workspace); xpres = pgxml_result_to_text(res, NULL, NULL, NULL); cleanup_workspace(&workspace); pfree(xpath); if (xpres == NULL) PG_RETURN_NULL(); PG_RETURN_TEXT_P(xpres); } PG_FUNCTION_INFO_V1(xpath_number); Datum xpath_number(PG_FUNCTION_ARGS) { text* document = PG_GETARG_TEXT_P(0); text* xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ xmlChar* xpath = NULL; float4 fRes; xmlXPathObjectPtr res; xpath_workspace workspace; xpath = pgxml_texttoxmlchar(xpathsupp); res = pgxml_xpath(document, xpath, &workspace); pfree(xpath); if (res == NULL) PG_RETURN_NULL(); fRes = xmlXPathCastToNumber(res); cleanup_workspace(&workspace); if (xmlXPathIsNaN(fRes)) PG_RETURN_NULL(); PG_RETURN_FLOAT4(fRes); } PG_FUNCTION_INFO_V1(xpath_bool); Datum xpath_bool(PG_FUNCTION_ARGS) { text* document = PG_GETARG_TEXT_P(0); text* xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */ xmlChar* xpath = NULL; int bRes; xmlXPathObjectPtr res; xpath_workspace workspace; xpath = pgxml_texttoxmlchar(xpathsupp); res = pgxml_xpath(document, xpath, &workspace); pfree(xpath); if (res == NULL) PG_RETURN_BOOL(false); bRes = xmlXPathCastToBoolean(res); cleanup_workspace(&workspace); PG_RETURN_BOOL(bRes); } /* Core function to evaluate XPath query */ static xmlXPathObjectPtr pgxml_xpath(text* document, xmlChar* xpath, xpath_workspace* workspace) { int32 docsize = VARSIZE(document) - VARHDRSZ; PgXmlErrorContext* xmlerrcxt = NULL; xmlXPathCompExprPtr comppath; workspace->doctree = NULL; workspace->ctxt = NULL; workspace->res = NULL; xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); PG_TRY(); { workspace->doctree = xmlParseMemory((char*)VARDATA(document), docsize); 
if (workspace->doctree != NULL) { workspace->ctxt = xmlXPathNewContext(workspace->doctree); workspace->ctxt->node = xmlDocGetRootElement(workspace->doctree); /* compile the path */ comppath = xmlXPathCompile(xpath); if (comppath == NULL) xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION, "XPath Syntax Error"); /* Now evaluate the path expression. */ workspace->res = xmlXPathCompiledEval(comppath, workspace->ctxt); xmlXPathFreeCompExpr(comppath); } } PG_CATCH(); { cleanup_workspace(workspace); pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } PG_END_TRY(); if (workspace->res == NULL) cleanup_workspace(workspace); pg_xml_done(xmlerrcxt, false); return workspace->res; } /* Clean up after processing the result of pgxml_xpath() */ static void cleanup_workspace(xpath_workspace* workspace) { if (workspace->res) xmlXPathFreeObject(workspace->res); workspace->res = NULL; if (workspace->ctxt) xmlXPathFreeContext(workspace->ctxt); workspace->ctxt = NULL; if (workspace->doctree) xmlFreeDoc(workspace->doctree); workspace->doctree = NULL; } static text* pgxml_result_to_text(xmlXPathObjectPtr res, xmlChar* toptag, xmlChar* septag, xmlChar* plainsep) { xmlChar* xpresstr = NULL; text* xpres = NULL; if (res == NULL) return NULL; switch (res->type) { case XPATH_NODESET: xpresstr = pgxmlNodeSetToText(res->nodesetval, toptag, septag, plainsep); break; case XPATH_STRING: xpresstr = xmlStrdup(res->stringval); break; default: elog(NOTICE, "unsupported XQuery result: %d", res->type); xpresstr = xmlStrdup((const xmlChar*)""); } /* Now convert this result back to text */ xpres = cstring_to_text((char*)xpresstr); /* Free various storage */ xmlFree(xpresstr); return xpres; } /* * xpath_table is a table function. It needs some tidying (as do the * other functions here! 
*/ PG_FUNCTION_INFO_V1(xpath_table); Datum xpath_table(PG_FUNCTION_ARGS) { /* Function parameters */ char* pkeyfield = text_to_cstring(PG_GETARG_TEXT_PP(0)); char* xmlfield = text_to_cstring(PG_GETARG_TEXT_PP(1)); char* relname = text_to_cstring(PG_GETARG_TEXT_PP(2)); char* xpathset = text_to_cstring(PG_GETARG_TEXT_PP(3)); char* condition = text_to_cstring(PG_GETARG_TEXT_PP(4)); /* SPI (input tuple) support */ SPITupleTable* tuptable = NULL; HeapTuple spi_tuple; TupleDesc spi_tupdesc; /* Output tuple (tuplestore) support */ Tuplestorestate* tupstore = NULL; TupleDesc ret_tupdesc; HeapTuple ret_tuple; ReturnSetInfo* rsinfo = (ReturnSetInfo*)fcinfo->resultinfo; AttInMetadata* attinmeta = NULL; MemoryContext per_query_ctx; MemoryContext oldcontext; char** values; xmlChar** xpaths; char* pos = NULL; const char* pathsep = "|"; int numpaths; int ret; int proc; int i; int j; int rownr; /* For issuing multiple rows from one original * document */ bool had_values = false; /* To determine end of nodeset results */ StringInfoData query_buf; PgXmlErrorContext* xmlerrcxt = NULL; volatile xmlDocPtr doctree = NULL; /* We only have a valid tuple description in table function mode */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (rsinfo->expectedDesc == NULL) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("xpath_table must be called as a table function"))); /* * We want to materialise because it means that we don't have to carry * libxml2 parser state between invocations of this function */ if (!(rsinfo->allowedModes & SFRM_Materialize)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("xpath_table requires Materialize mode, but it is not " "allowed in this context"))); /* * The tuplestore must exist in a higher context than this function call * (per_query_ctx is used) */ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; 
oldcontext = MemoryContextSwitchTo(per_query_ctx); /* * Create the tuplestore - work_mem is the max in-memory size before a * file is created on disk to hold it. */ tupstore = tuplestore_begin_heap(rsinfo->allowedModes & SFRM_Materialize_Random, false, u_sess->attr.attr_memory.work_mem); MemoryContextSwitchTo(oldcontext); /* get the requested return tuple description */ ret_tupdesc = CreateTupleDescCopy(rsinfo->expectedDesc); /* must have at least one output column (for the pkey) */ if (ret_tupdesc->natts < 1) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("xpath_table must have at least one output column"))); /* * At the moment we assume that the returned attributes make sense for the * XPath specififed (i.e. we trust the caller). It's not fatal if they get * it wrong - the input function for the column type will raise an error * if the path result can't be converted into the correct binary * representation. */ attinmeta = TupleDescGetAttInMetadata(ret_tupdesc); /* Set return mode and allocate value space. */ rsinfo->returnMode = SFRM_Materialize; rsinfo->setDesc = ret_tupdesc; values = (char**)palloc(ret_tupdesc->natts * sizeof(char*)); xpaths = (xmlChar**)palloc(ret_tupdesc->natts * sizeof(xmlChar*)); /* * Split XPaths. xpathset is a writable CString. 
* * Note that we stop splitting once we've done all needed for tupdesc */ numpaths = 0; pos = xpathset; while (numpaths < (ret_tupdesc->natts - 1)) { xpaths[numpaths++] = (xmlChar*)pos; pos = strstr(pos, pathsep); if (pos != NULL) { *pos = '\0'; pos++; } else break; } /* Now build query */ initStringInfo(&query_buf); /* Build initial sql statement */ appendStringInfo(&query_buf, "SELECT %s, %s FROM %s WHERE %s", pkeyfield, xmlfield, relname, condition); if ((ret = SPI_connect()) < 0) elog(ERROR, "xpath_table: SPI_connect returned %d", ret); if ((ret = SPI_exec(query_buf.data, 0)) != SPI_OK_SELECT) elog(ERROR, "xpath_table: SPI execution failed for query %s", query_buf.data); proc = SPI_processed; tuptable = SPI_tuptable; spi_tupdesc = tuptable->tupdesc; /* Switch out of SPI context */ MemoryContextSwitchTo(oldcontext); /* * Check that SPI returned correct result. If you put a comma into one of * the function parameters, this will catch it when the SPI query returns * e.g. 3 columns. */ if (spi_tupdesc->natts != 2) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("expression returning multiple columns is not valid in parameter list"), errdetail("Expected two columns in SPI result, got %d.", spi_tupdesc->natts))); } /* * Setup the parser. This should happen after we are done evaluating the * query, in case it calls functions that set up libxml differently. */ xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); PG_TRY(); { /* For each row i.e. document returned from SPI */ for (i = 0; i < proc; i++) { char* pkey = NULL; char* xmldoc = NULL; xmlXPathContextPtr ctxt; xmlXPathObjectPtr res; xmlChar* resstr = NULL; xmlXPathCompExprPtr comppath; /* Extract the row data as C Strings */ spi_tuple = tuptable->vals[i]; pkey = SPI_getvalue(spi_tuple, spi_tupdesc, 1); xmldoc = SPI_getvalue(spi_tuple, spi_tupdesc, 2); /* * Clear the values array, so that not-well-formed documents * return NULL in all columns. 
Note that this also means that * spare columns will be NULL. */ for (j = 0; j < ret_tupdesc->natts; j++) values[j] = NULL; /* Insert primary key */ values[0] = pkey; /* Parse the document */ if (xmldoc) doctree = xmlParseMemory(xmldoc, strlen(xmldoc)); else /* treat NULL as not well-formed */ doctree = NULL; if (doctree == NULL) { /* not well-formed, so output all-NULL tuple */ ret_tuple = BuildTupleFromCStrings(attinmeta, values); tuplestore_puttuple(tupstore, ret_tuple); heap_freetuple(ret_tuple); } else { /* New loop here - we have to deal with nodeset results */ rownr = 0; do { /* Now evaluate the set of xpaths. */ had_values = false; for (j = 0; j < numpaths; j++) { ctxt = xmlXPathNewContext(doctree); ctxt->node = xmlDocGetRootElement(doctree); /* compile the path */ comppath = xmlXPathCompile(xpaths[j]); if (comppath == NULL) xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION, "XPath Syntax Error"); /* Now evaluate the path expression. */ res = xmlXPathCompiledEval(comppath, ctxt); xmlXPathFreeCompExpr(comppath); if (res != NULL) { switch (res->type) { case XPATH_NODESET: /* We see if this nodeset has enough nodes */ if (res->nodesetval != NULL && rownr < res->nodesetval->nodeNr) { resstr = xmlXPathCastNodeToString(res->nodesetval->nodeTab[rownr]); had_values = true; } else resstr = NULL; break; case XPATH_STRING: resstr = xmlStrdup(res->stringval); break; default: elog(NOTICE, "unsupported XQuery result: %d", res->type); resstr = xmlStrdup((const xmlChar*)""); } /* * Insert this into the appropriate column in the * result tuple. */ values[j + 1] = (char*)resstr; } xmlXPathFreeContext(ctxt); } /* Now add the tuple to the output, if there is one. 
*/ if (had_values) { ret_tuple = BuildTupleFromCStrings(attinmeta, values); tuplestore_puttuple(tupstore, ret_tuple); heap_freetuple(ret_tuple); } rownr++; } while (had_values); } if (doctree != NULL) xmlFreeDoc(doctree); doctree = NULL; if (pkey) pfree(pkey); if (xmldoc) pfree(xmldoc); } } PG_CATCH(); { if (doctree != NULL) xmlFreeDoc(doctree); pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } PG_END_TRY(); if (doctree != NULL) xmlFreeDoc(doctree); pg_xml_done(xmlerrcxt, false); tuplestore_donestoring(tupstore); SPI_finish(); rsinfo->setResult = tupstore; /* * SFRM_Materialize mode expects us to return a NULL Datum. The actual * tuples are in our tuplestore and passed back through rsinfo->setResult. * rsinfo->setDesc is set to the tuple description that we actually used * to build our tuples with, so the caller can verify we did what it was * expecting. */ return (Datum)0; }