From a94caa2370e6a2cf9dc111ab59b09ca758bbce56 Mon Sep 17 00:00:00 2001 From: Andrew Potter Date: Fri, 10 Jun 2022 20:31:06 -0700 Subject: [PATCH 1/2] parsers: Avoid std::getline for istream inputs For streambuf implementations that can't set a public get area, getline falls back to a character-by-character implementation. Instead we can pass the stream to xmlCreateIOParserCtxt(). This simplifies the parse stream methods and brings them nearly identical to the other parse methods. --- libxml++/parsers/domparser.cc | 80 +++++------------ libxml++/parsers/saxparser.cc | 78 +++++------------ tests/istream_ioparser/main.cc | 152 +++++++++++++++++++++++++++++++++ tests/meson.build | 1 + 4 files changed, 196 insertions(+), 115 deletions(-) create mode 100644 tests/istream_ioparser/main.cc diff --git a/libxml++/parsers/domparser.cc b/libxml++/parsers/domparser.cc index ef856362..ad4c9608 100644 --- a/libxml++/parsers/domparser.cc +++ b/libxml++/parsers/domparser.cc @@ -153,8 +153,8 @@ void DomParser::check_xinclude_and_finish_parsing() int options = xinclude_options_; // Turn on/off any xinclude options. options |= set_options; - options &= ~clear_options; - + options &= ~clear_options; + if (options & XML_PARSE_XINCLUDE) { const int n_substitutions = xmlXIncludeProcessFlags(context_->myDoc, options); @@ -174,6 +174,19 @@ void DomParser::check_xinclude_and_finish_parsing() Parser::release_underlying(); } +namespace { + extern "C" { + static int _io_read_callback(void * context, + char * buffer, + int len) + { + std::istream *in = static_cast(context); + in->read(buffer, len); + return in->gcount(); + } + } +} + void DomParser::parse_stream(std::istream& in) { release_underlying(); //Free any existing document. @@ -181,67 +194,20 @@ void DomParser::parse_stream(std::istream& in) KeepBlanks k(KeepBlanks::Default); xmlResetLastError(); - context_ = xmlCreatePushParserCtxt( - nullptr, // Setting those two parameters to nullptr force the parser - nullptr, // to create a document while parsing. - nullptr, // chunk - 0, // size - nullptr); // no filename for fetching external entities + context_ = xmlCreateIOParserCtxt( + nullptr, // Setting those two parameters to nullptr force the parser + nullptr, // to create a document while parsing. + _io_read_callback, + nullptr, // inputCloseCallback + &in, + XML_CHAR_ENCODING_NONE); if(!context_) { throw internal_error("Could not create parser context\n" + format_xml_error()); } - initialize_context(); - - // std::string or ustring? - // Output from the XML parser is UTF-8 encoded. - // But the istream "in" is input, i.e. an XML file. It can use any encoding. - // If it's not UTF-8, the file itself must contain information about which - // encoding it uses. See the XML specification. Thus use std::string. - int firstParseError = XML_ERR_OK; - std::string line; - while(std::getline(in, line)) - { - // since getline does not get the line separator, we have to add it since the parser cares - // about layout in certain cases. - line += '\n'; - - const int parseError = xmlParseChunk(context_, line.c_str(), - line.size() /* This is a std::string, not a ustring, so this is the number of bytes. */, 0); - - // Save the first error code if any, but read on. - // More errors might be reported and then thrown by check_for_exception(). - if (parseError != XML_ERR_OK && firstParseError == XML_ERR_OK) - firstParseError = parseError; - } - - const int parseError = xmlParseChunk(context_, nullptr, 0, 1 /* last chunk */); - if (parseError != XML_ERR_OK && firstParseError == XML_ERR_OK) - firstParseError = parseError; - - try - { - check_for_exception(); - } - catch (...) - { - release_underlying(); //Free doc_ and context_ - throw; // re-throw exception - } - - auto error_str = format_xml_parser_error(context_); - if (error_str.empty() && firstParseError != XML_ERR_OK) - error_str = "Error code from xmlParseChunk(): " + std::to_string(firstParseError); - - if(!error_str.empty()) - { - release_underlying(); //Free doc_ and context_ - throw parse_error(error_str); - } - - check_xinclude_and_finish_parsing(); + parse_context(); } void DomParser::release_underlying() diff --git a/libxml++/parsers/saxparser.cc b/libxml++/parsers/saxparser.cc index 71674877..4fbf2d10 100644 --- a/libxml++/parsers/saxparser.cc +++ b/libxml++/parsers/saxparser.cc @@ -209,6 +209,19 @@ void SaxParser::parse_memory(const ustring& contents) parse_memory_raw((const unsigned char*)contents.c_str(), contents.size()); } +namespace { + extern "C" { + static int _io_read_callback(void * context, + char * buffer, + int len) + { + std::istream *in = static_cast(context); + in->read(buffer, len); + return in->gcount(); + } + } +} + void SaxParser::parse_stream(std::istream& in) { if(context_) @@ -217,66 +230,15 @@ void SaxParser::parse_stream(std::istream& in) } KeepBlanks k(KeepBlanks::Default); - xmlResetLastError(); - context_ = xmlCreatePushParserCtxt( + context_ = xmlCreateIOParserCtxt( sax_handler_.get(), - nullptr, // user_data - nullptr, // chunk - 0, // size - nullptr); // no filename for fetching external entities - - if(!context_) - { - throw internal_error("Could not create parser context\n" + format_xml_error()); - } - - initialize_context(); - - // std::string or ustring? - // Output from the XML parser is UTF-8 encoded. - // But the istream "in" is input, i.e. an XML file. It can use any encoding. - // If it's not UTF-8, the file itself must contain information about which - // encoding it uses. See the XML specification. Thus use std::string. - int firstParseError = XML_ERR_OK; - std::string line; - while (!exception_ && std::getline(in, line)) - { - // since getline does not get the line separator, we have to add it since the parser care - // about layout in certain cases. - line += '\n'; - - const int parseError = xmlParseChunk(context_, line.c_str(), - line.size() /* This is a std::string, not a ustring, so this is the number of bytes. */, - 0 /* don't terminate */); - - // Save the first error code if any, but read on. - // More errors might be reported and then thrown by check_for_exception(). - if (parseError != XML_ERR_OK && firstParseError == XML_ERR_OK) - firstParseError = parseError; - } - - if (!exception_) - { - //This is called just to terminate parsing. - const int parseError = xmlParseChunk(context_, nullptr /* chunk */, 0 /* size */, 1 /* terminate (1 or 0) */); - - if (parseError != XML_ERR_OK && firstParseError == XML_ERR_OK) - firstParseError = parseError; - } - - auto error_str = format_xml_parser_error(context_); - if (error_str.empty() && firstParseError != XML_ERR_OK) - error_str = "Error code from xmlParseChunk(): " + std::to_string(firstParseError); - - release_underlying(); // Free context_ - - check_for_exception(); - - if(!error_str.empty()) - { - throw parse_error(error_str); - } + nullptr, // user_data + _io_read_callback, + nullptr, // inputCloseCallback + &in, + XML_CHAR_ENCODING_NONE); + parse(); } void SaxParser::parse_chunk(const ustring& chunk) diff --git a/tests/istream_ioparser/main.cc b/tests/istream_ioparser/main.cc new file mode 100644 index 00000000..89821adb --- /dev/null +++ b/tests/istream_ioparser/main.cc @@ -0,0 +1,152 @@ +/* Copyright (C) 2022 The libxml++ development team + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +#include +#include +#include +#include + +class test_streambuf : public std::streambuf +{ +public: + test_streambuf() : + uflow_calls(0), + underflow_calls(0), + ofs(0), + buf("\n") + { + } + +protected: + /* Simulate some kind of streambuf impl that doesn't setg() */ + virtual int_type underflow() override final + { + ++underflow_calls; + if (ofs >= (sizeof(buf)-1)) + return traits_type::eof(); + return traits_type::to_int_type(buf[ofs]); + } + + virtual int_type uflow() override final + { + ++uflow_calls; + if (ofs >= (sizeof(buf)-1)) + return traits_type::eof(); + return traits_type::to_int_type(buf[ofs++]); + } + + virtual std::streamsize showmanyc() override final + { + if (ofs >= (sizeof(buf)-1)) + return traits_type::eof(); + return sizeof(buf)-1-ofs; + } + + virtual std::streamsize xsgetn(char_type* s, std::streamsize count) override final + { + auto n = std::min(count, static_cast(sizeof(buf)-1-ofs)); + memcpy(s, buf + ofs, n); + ofs += n; + return n; + } + +public: + int uflow_calls; + int underflow_calls; + +private: + size_t ofs; + char buf[15]; +}; + +class MySaxParser : public xmlpp::SaxParser { +public: + bool saw_root = false; +protected: + virtual void on_start_document() override final + { + saw_root = false; + } + virtual void on_end_element(const xmlpp::ustring &name) override final + { + if (name == "root") + saw_root = true; + } +}; + +int main() +{ + { // Check DomParser works well with normal and custom istreams + xmlpp::DomParser parser; + try + { + std::stringstream ss(""); + parser.parse_stream(ss); + } + catch(...) + { + assert(false); + } + + { + auto doc = parser.get_document(); + assert(doc->get_root_node()->get_name() == "root"); + } + + { + test_streambuf buf; + try { + std::istream is(&buf); + parser.parse_stream(is); + } catch (...) { + assert(false); + } + assert(buf.underflow_calls + buf.uflow_calls < 3); + auto doc = parser.get_document(); + assert(doc->get_root_node()->get_name() == "root"); + } + } + { // Check SaxParser works well with normal and custom istreams. + MySaxParser parser; + try + { + std::stringstream ss(""); + parser.parse_stream(ss); + } + catch(...) + { + assert(false); + } + assert(parser.saw_root); + + { + test_streambuf buf; + try { + std::istream is(&buf); + parser.parse_stream(is); + } catch (...) { + assert(false); + } + assert(buf.underflow_calls + buf.uflow_calls < 3); + assert(parser.saw_root); + } + } + assert(true); + return EXIT_SUCCESS; +} diff --git a/tests/meson.build b/tests/meson.build index f62e73be..9e7948d1 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -8,6 +8,7 @@ test_programs = [ [['saxparser_chunk_parsing_inconsistent_state'], 'test', ['main.cc']], [['saxparser_parse_double_free'], 'test', ['main.cc']], [['saxparser_parse_stream_inconsistent_state'], 'test', ['main.cc']], + [['istream_ioparser'], 'test', ['main.cc']], ] foreach ex : test_programs From 64d11b4b060dd7215e8fb841dcfba51af3d6b9fc Mon Sep 17 00:00:00 2001 From: Andrew Potter Date: Fri, 10 Jun 2022 22:45:47 -0700 Subject: [PATCH 2/2] Add test to autotools build --- tests/Makefile.am | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/Makefile.am b/tests/Makefile.am index 96bfe237..ad58ba03 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -21,10 +21,12 @@ LDADD = $(top_builddir)/libxml++/libxml++-$(LIBXMLXX_API_VERSION).la $(LIBXMLXX_ check_PROGRAMS = \ saxparser_chunk_parsing_inconsistent_state/test \ saxparser_parse_double_free/test \ - saxparser_parse_stream_inconsistent_state/test + saxparser_parse_stream_inconsistent_state/test \ + istream_ioparser/test TESTS = $(check_PROGRAMS) saxparser_chunk_parsing_inconsistent_state_test_SOURCES = saxparser_chunk_parsing_inconsistent_state/main.cc saxparser_parse_double_free_test_SOURCES = saxparser_parse_double_free/main.cc saxparser_parse_stream_inconsistent_state_test_SOURCES = saxparser_parse_stream_inconsistent_state/main.cc +istream_ioparser_test_SOURCES = istream_ioparser/main.cc