From b9f7074430594b95824059eef931dfbb27a7645e Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Tue, 31 May 2022 22:49:19 +0200 Subject: [PATCH 1/6] Remove debug print from test. --- src/lxml/tests/test_htmlparser.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/lxml/tests/test_htmlparser.py b/src/lxml/tests/test_htmlparser.py index acbde4212..2f3186ff1 100644 --- a/src/lxml/tests/test_htmlparser.py +++ b/src/lxml/tests/test_htmlparser.py @@ -661,7 +661,6 @@ def test_xhtml_as_html_as_xml(self): b'' ) root = html.fromstring(xhtml) - print(root.attrib) result = etree.tostring(root) self.assertEqual(result, b'') @@ -673,7 +672,6 @@ def test_xhtml_as_html_as_xml(self): b'' ) root = html.fromstring(xhtml) - print(root.attrib) result = etree.tostring(root) self.assertEqual(result, b'') """ From 8f0bf2d158f2dd3f98d410c8a38fcd536fd11b53 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Tue, 31 May 2022 23:18:38 +0200 Subject: [PATCH 2/6] Try to speed up the musllinux AArch64 build by splitting the different CPython versions into separate GHA jobs. 
--- .github/workflows/wheels.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index e96753ad8..09dc7c9d7 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -70,6 +70,8 @@ jobs: exclude: - image: manylinux_2_24_aarch64 pyversion: "*" + - image: musllinux_1_1_aarch64 + pyversion: "*" include: - image: manylinux2014_aarch64 pyversion: "cp36*" @@ -82,6 +84,17 @@ jobs: - image: manylinux_2_24_aarch64 pyversion: "cp310*" + - image: musllinux_1_1_aarch64 + pyversion: "cp36*" + - image: musllinux_1_1_aarch64 + pyversion: "cp37*" + - image: musllinux_1_1_aarch64 + pyversion: "cp38*" + - image: musllinux_1_1_aarch64 + pyversion: "cp39*" + - image: musllinux_1_1_aarch64 + pyversion: "cp310*" + steps: - uses: actions/checkout@v2 From 50c276412880c1a3dde8a6d6c909e3ed8ef47e43 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 22 Jun 2022 09:10:10 +0200 Subject: [PATCH 3/6] Delete unused Travis CI config and reference in docs (GH-345) --- .travis.yml | 86 ----------------------------------------------------- README.rst | 2 +- 2 files changed, 1 insertion(+), 87 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9d8a9f424..000000000 --- a/.travis.yml +++ /dev/null @@ -1,86 +0,0 @@ -os: linux -language: python - -cache: - pip: true - directories: - - $HOME/.ccache - - libs - -python: - - nightly - - 3.10 - - 2.7 - - 3.9 - - 3.8 - - 3.7 - - 3.6 - - 3.5 - -env: - global: - - USE_CCACHE=1 - - CCACHE_SLOPPINESS=pch_defines,time_macros - - CCACHE_COMPRESS=1 - - CCACHE_MAXSIZE=70M - - PATH="/usr/lib/ccache:$PATH" - - LIBXML2_VERSION=2.9.10 - - LIBXSLT_VERSION=1.1.34 - matrix: - - STATIC_DEPS=false - - STATIC_DEPS=true - -matrix: - include: - - python: 3.8 - env: - - STATIC_DEPS=false - - EXTRA_DEPS="docutils pygments sphinx sphinx-rtd-theme" - script: make html - - python: 3.8 - env: - - STATIC_DEPS=false - - 
EXTRA_DEPS="coverage<5" - - python: 3.8 - env: - - STATIC_DEPS=true - - LIBXML2_VERSION=2.9.2 # minimum version requirements - - LIBXSLT_VERSION=1.1.27 - - python: pypy - env: STATIC_DEPS=false - - python: pypy3 - env: STATIC_DEPS=false - - python: 3.8 - env: STATIC_DEPS=false - arch: arm64 - - python: 3.8 - env: STATIC_DEPS=true - arch: arm64 - - python: 3.8 - env: STATIC_DEPS=false - arch: ppc64le - - python: 3.8 - env: STATIC_DEPS=true - arch: ppc64le - allow_failures: - - python: nightly - - python: pypy - - python: pypy3 - -install: - - pip install -U pip wheel - - if [ -z "${TRAVIS_PYTHON_VERSION##*-dev}" ]; - then pip install --install-option=--no-cython-compile https://github.com/cython/cython/archive/master.zip; - else pip install -r requirements.txt; - fi - - pip install -U beautifulsoup4 cssselect html5lib rnc2rng==2.6.5 ${EXTRA_DEPS} - -script: - - CFLAGS="-O0 -g -fPIC" python -u setup.py build_ext --inplace - $(if [ -n "${TRAVIS_PYTHON_VERSION##2.*}" -a -n "${TRAVIS_PYTHON_VERSION##3.[34]*}" ]; then echo -n " -j7 "; fi ) - $(if [ -n "$EXTRA_DEPS" -a -z "${EXTRA_DEPS##*coverage*}" ]; then echo -n "--with-coverage"; fi ) - - ccache -s || true - - CFLAGS="-O0 -g -fPIC" PYTHONUNBUFFERED=x make test - - ccache -s || true - - python setup.py install - - python -c "from lxml import etree" diff --git a/README.rst b/README.rst index e8705ab92..a0434b379 100644 --- a/README.rst +++ b/README.rst @@ -63,7 +63,7 @@ Crypto currencies do not fit into that ambition. .. _`doc/main.txt`: https://github.com/lxml/lxml/blob/master/doc/main.txt .. _`INSTALL.txt`: http://lxml.de/installation.html -`Travis-CI `_ and `AppVeyor `_ +`AppVeyor `_ and `GitHub Actions `_ support the lxml project with their build and CI servers. Jetbrains supports the lxml project by donating free licenses of their `PyCharm IDE `_. 
From 86368e9cf70a0ad23cccd5ee32de847149af0c6f Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 1 Jul 2022 21:06:10 +0200 Subject: [PATCH 4/6] Fix a crash when incorrect parser input occurs together with usages of iterwalk() on trees generated by the same parser. --- src/lxml/apihelpers.pxi | 7 ++++--- src/lxml/iterparse.pxi | 11 ++++++----- src/lxml/tests/test_etree.py | 20 ++++++++++++++++++++ 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi index c16627629..9fae9fb12 100644 --- a/src/lxml/apihelpers.pxi +++ b/src/lxml/apihelpers.pxi @@ -246,9 +246,10 @@ cdef dict _build_nsmap(xmlNode* c_node): while c_node is not NULL and c_node.type == tree.XML_ELEMENT_NODE: c_ns = c_node.nsDef while c_ns is not NULL: - prefix = funicodeOrNone(c_ns.prefix) - if prefix not in nsmap: - nsmap[prefix] = funicodeOrNone(c_ns.href) + if c_ns.prefix or c_ns.href: + prefix = funicodeOrNone(c_ns.prefix) + if prefix not in nsmap: + nsmap[prefix] = funicodeOrNone(c_ns.href) c_ns = c_ns.next c_node = c_node.parent return nsmap diff --git a/src/lxml/iterparse.pxi b/src/lxml/iterparse.pxi index 138c23a6a..a7299da6d 100644 --- a/src/lxml/iterparse.pxi +++ b/src/lxml/iterparse.pxi @@ -420,7 +420,7 @@ cdef int _countNsDefs(xmlNode* c_node): count = 0 c_ns = c_node.nsDef while c_ns is not NULL: - count += 1 + count += (c_ns.href is not NULL) c_ns = c_ns.next return count @@ -431,9 +431,10 @@ cdef int _appendStartNsEvents(xmlNode* c_node, list event_list) except -1: count = 0 c_ns = c_node.nsDef while c_ns is not NULL: - ns_tuple = (funicode(c_ns.prefix) if c_ns.prefix is not NULL else '', - funicode(c_ns.href)) - event_list.append( (u"start-ns", ns_tuple) ) - count += 1 + if c_ns.href: + ns_tuple = (funicodeOrEmpty(c_ns.prefix), + funicode(c_ns.href)) + event_list.append( (u"start-ns", ns_tuple) ) + count += 1 c_ns = c_ns.next return count diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py index 
e5f084692..285313f6e 100644 --- a/src/lxml/tests/test_etree.py +++ b/src/lxml/tests/test_etree.py @@ -1460,6 +1460,26 @@ def test_iterwalk_getiterator(self): [1,2,1,4], counts) + def test_walk_after_parse_failure(self): + # This used to be an issue because libxml2 can leak empty namespaces + # between failed parser runs. iterwalk() failed to handle such a tree. + try: + etree.XML('''''') + except etree.XMLSyntaxError: + pass + else: + assert False, "invalid input did not fail to parse" + + et = etree.XML(''' ''') + try: + ns = next(etree.iterwalk(et, events=('start-ns',))) + except StopIteration: + # This would be the expected result, because there was no namespace + pass + else: + # This is a bug in libxml2 + assert not ns, repr(ns) + def test_itertext_comment_pi(self): # https://bugs.launchpad.net/lxml/+bug/1844674 XML = self.etree.XML From d65e63229e8958bc08344a85cd3f09ceeef933c3 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 1 Jul 2022 21:09:05 +0200 Subject: [PATCH 5/6] Prepare release of lxml 4.9.1. --- CHANGES.txt | 12 ++++++++++++ doc/main.txt | 10 +++++++--- src/lxml/__init__.py | 2 +- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index b2e0c8f03..64bba1c22 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,18 @@ lxml changelog ============== +4.9.1 (2022-07-01) +================== + +Bugs fixed +---------- + +* A crash was resolved when using ``iterwalk()`` (or ``canonicalize()``) + after parsing certain incorrect input. Note that ``iterwalk()`` could crash + on *valid* input parsed with the same parser *after* failing to parse the + incorrect input. + + 4.9.0 (2022-06-01) ================== diff --git a/doc/main.txt b/doc/main.txt index e9a0a4637..578f92dcf 100644 --- a/doc/main.txt +++ b/doc/main.txt @@ -160,8 +160,8 @@ Index `_ (PyPI). It has the source that compiles on various platforms. The source distribution is signed with `this key `_. 
-The latest version is `lxml 4.9.0`_, released 2022-06-01 -(`changes for 4.9.0`_). `Older versions <#old-versions>`_ +The latest version is `lxml 4.9.1`_, released 2022-07-01 +(`changes for 4.9.1`_). `Older versions <#old-versions>`_ are listed below. Please take a look at the @@ -256,7 +256,9 @@ See the websites of lxml .. and the `latest in-development version `_. -.. _`PDF documentation`: lxmldoc-4.9.0.pdf +.. _`PDF documentation`: lxmldoc-4.9.1.pdf + +* `lxml 4.9.1`_, released 2022-07-01 (`changes for 4.9.1`_) * `lxml 4.9.0`_, released 2022-06-01 (`changes for 4.9.0`_) @@ -280,6 +282,7 @@ See the websites of lxml * `older releases `_ +.. _`lxml 4.9.1`: /files/lxml-4.9.1.tgz .. _`lxml 4.9.0`: /files/lxml-4.9.0.tgz .. _`lxml 4.8.0`: /files/lxml-4.8.0.tgz .. _`lxml 4.7.1`: /files/lxml-4.7.1.tgz @@ -291,6 +294,7 @@ See the websites of lxml .. _`lxml 4.6.1`: /files/lxml-4.6.1.tgz .. _`lxml 4.6.0`: /files/lxml-4.6.0.tgz +.. _`changes for 4.9.1`: /changes-4.9.1.html .. _`changes for 4.9.0`: /changes-4.9.0.html .. _`changes for 4.8.0`: /changes-4.8.0.html .. _`changes for 4.7.1`: /changes-4.7.1.html diff --git a/src/lxml/__init__.py b/src/lxml/__init__.py index 0e0083413..f8be68f71 100644 --- a/src/lxml/__init__.py +++ b/src/lxml/__init__.py @@ -1,6 +1,6 @@ # this is a package -__version__ = "4.9.0" +__version__ = "4.9.1" def get_include(): From d01872ccdf7e1e5e825b6c6292b43e7d27ae5fc4 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 1 Jul 2022 21:19:44 +0200 Subject: [PATCH 6/6] Prevent parse failure in new test from leaking into later test runs. 
--- src/lxml/tests/test_etree.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py index 285313f6e..3e52258ed 100644 --- a/src/lxml/tests/test_etree.py +++ b/src/lxml/tests/test_etree.py @@ -1463,14 +1463,16 @@ def test_iterwalk_getiterator(self): def test_walk_after_parse_failure(self): # This used to be an issue because libxml2 can leak empty namespaces # between failed parser runs. iterwalk() failed to handle such a tree. + parser = etree.XMLParser() + try: - etree.XML('''''') + etree.XML('''''', parser=parser) except etree.XMLSyntaxError: pass else: assert False, "invalid input did not fail to parse" - et = etree.XML(''' ''') + et = etree.XML(''' ''', parser=parser) try: ns = next(etree.iterwalk(et, events=('start-ns',))) except StopIteration: