Skip to content

Commit 0fc5a3c

Browse files
committed
Merge remote-tracking branch 'origin/master' into serializer_roundtrip_test
2 parents c44fbd7 + f5fd711 commit 0fc5a3c

19 files changed

+376
-159
lines changed

.travis.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ python:
44
- "2.7"
55
- "3.2"
66
- "3.3"
7+
- "3.4"
78
- "pypy"
89

910
env:
@@ -14,12 +15,12 @@ matrix:
1415
exclude:
1516
- python: "2.7"
1617
env: USE_OPTIONAL=false
17-
- python: "3.3"
18+
- python: "3.4"
1819
env: USE_OPTIONAL=false
1920
include:
2021
- python: "2.7"
2122
env: USE_OPTIONAL=false FLAKE=true
22-
- python: "3.3"
23+
- python: "3.4"
2324
env: USE_OPTIONAL=false FLAKE=true
2425

2526
before_install:

AUTHORS.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Credits
1010

1111
Patches and suggestions
1212
-----------------------
13+
(In chronological order, by first commit:)
1314

1415
- Anne van Kesteren
1516
- Lachlan Hunt
@@ -21,10 +22,13 @@ Patches and suggestions
2122
- Philip Taylor
2223
- Ryan King
2324
- Edward Z. Yang
25+
- fantasai
2426
- Philip Jägenstedt
2527
- Ms2ger
2628
- Andy Wingo
2729
- Andreas Madsack
2830
- Karim Valiev
2931
- Mohammad Taha Jahangir
3032
- Juan Carlos Garcia Segovia
33+
- Mike West
34+
- Marc DM

CHANGES.rst

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,70 @@
11
Change Log
22
----------
33

4+
0.9999
5+
~~~~~~
6+
7+
Released on XXX, 2014
8+
9+
* XXX
10+
11+
12+
0.999
13+
~~~~~
14+
15+
Released on December 23, 2013
16+
17+
* Fix #127: add work-around for CPython issue #20007: .read(0) on
18+
http.client.HTTPResponse drops the rest of the content.
19+
20+
* Fix #115: lxml treewalker can now deal with fragments containing, at
21+
their root level, text nodes with non-ASCII characters on Python 2.
22+
23+
24+
0.99
25+
~~~~
26+
27+
Released on September 10, 2013
28+
29+
* No library changes from 1.0b3; released as 0.99 as pip has changed
30+
behaviour from 1.4 to avoid installing pre-release versions per
31+
PEP 440.
32+
33+
34+
1.0b3
35+
~~~~~
36+
37+
Released on July 24, 2013
38+
39+
* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
40+
implementation using it should be moved to
41+
``NonRecursiveTreeWalker``, as everything bundled with html5lib has
42+
for years.
43+
44+
* Fix #67 so that ``BufferedStream`` correctly returns a bytes
45+
object, thereby fixing any case where html5lib is passed a
46+
non-seekable RawIOBase-like object.
47+
48+
449
1.0b2
550
~~~~~
651

7-
Released on XXX, 2013
52+
Released on June 27, 2013
53+
54+
* Removed reordering of attributes within the serializer. There is now
55+
an ``alphabetical_attributes`` option which preserves the previous
56+
behaviour through a new filter. This allows attribute order to be
57+
preserved through html5lib if the tree builder preserves order.
858

959
* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
1060
``treeadapters.sax.to_sax`` which is generic and supports any
1161
treewalker; it also resolves all known bugs with ``dom2sax``.
1262

63+
* Fix treewalker assertions on hitting bytes strings on
64+
Python 2. Previous to 1.0b1, treewalkers coped with mixed
65+
bytes/unicode data on Python 2; this reintroduces this prior
66+
behaviour on Python 2. Behaviour is unchanged on Python 3.
67+
1368

1469
1.0b1
1570
~~~~~

README.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,29 @@ a treebuilder:
4141
with open("mydocument.html", "rb") as f:
4242
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
4343
44+
When using with ``urllib2`` (Python 2), the charset from HTTP should be
45+
passed into html5lib as follows:
46+
47+
.. code-block:: python
48+
49+
from contextlib import closing
50+
from urllib2 import urlopen
51+
import html5lib
52+
53+
with closing(urlopen("http://example.com/")) as f:
54+
document = html5lib.parse(f, encoding=f.info().getparam("charset"))
55+
56+
When using with ``urllib.request`` (Python 3), the charset from HTTP
57+
should be passed into html5lib as follows:
58+
59+
.. code-block:: python
60+
61+
from urllib.request import urlopen
62+
import html5lib
63+
64+
with urlopen("http://example.com/") as f:
65+
document = html5lib.parse(f, encoding=f.info().get_content_charset())
66+
4467
To have more control over the parser, create a parser object explicitly.
4568
For instance, to make the parser raise exceptions on parse errors, use:
4669

html5lib/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@
2020

2121
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
2222
"getTreeWalker", "serialize"]
23-
__version__ = "1.0b2"
23+
__version__ = "0.9999-dev"

html5lib/html5parser.py

Lines changed: 68 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,17 @@ def reset(self):
129129

130130
self.framesetOK = True
131131

132+
@property
133+
def documentEncoding(self):
134+
"""The name of the character encoding
135+
that was used to decode the input stream,
136+
or :obj:`None` if that is not determined yet.
137+
138+
"""
139+
if not hasattr(self, 'tokenizer'):
140+
return None
141+
return self.tokenizer.stream.charEncoding[0]
142+
132143
def isHTMLIntegrationPoint(self, element):
133144
if (element.name == "annotation-xml" and
134145
element.namespace == namespaces["mathml"]):
@@ -169,7 +180,7 @@ def mainLoop(self):
169180
(self.isMathMLTextIntegrationPoint(currentNode) and
170181
((type == StartTagToken and
171182
token["name"] not in frozenset(["mglyph", "malignmark"])) or
172-
type in (CharactersToken, SpaceCharactersToken))) or
183+
type in (CharactersToken, SpaceCharactersToken))) or
173184
(currentNodeNamespace == namespaces["mathml"] and
174185
currentNodeName == "annotation-xml" and
175186
token["name"] == "svg") or
@@ -507,61 +518,61 @@ def processDoctype(self, token):
507518

508519
if (not correct or token["name"] != "html"
509520
or publicId.startswith(
510-
("+//silmaril//dtd html pro v0r11 19970101//",
511-
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
512-
"-//as//dtd html 3.0 aswedit + extensions//",
513-
"-//ietf//dtd html 2.0 level 1//",
514-
"-//ietf//dtd html 2.0 level 2//",
515-
"-//ietf//dtd html 2.0 strict level 1//",
516-
"-//ietf//dtd html 2.0 strict level 2//",
517-
"-//ietf//dtd html 2.0 strict//",
518-
"-//ietf//dtd html 2.0//",
519-
"-//ietf//dtd html 2.1e//",
520-
"-//ietf//dtd html 3.0//",
521-
"-//ietf//dtd html 3.2 final//",
522-
"-//ietf//dtd html 3.2//",
523-
"-//ietf//dtd html 3//",
524-
"-//ietf//dtd html level 0//",
525-
"-//ietf//dtd html level 1//",
526-
"-//ietf//dtd html level 2//",
527-
"-//ietf//dtd html level 3//",
528-
"-//ietf//dtd html strict level 0//",
529-
"-//ietf//dtd html strict level 1//",
530-
"-//ietf//dtd html strict level 2//",
531-
"-//ietf//dtd html strict level 3//",
532-
"-//ietf//dtd html strict//",
533-
"-//ietf//dtd html//",
534-
"-//metrius//dtd metrius presentational//",
535-
"-//microsoft//dtd internet explorer 2.0 html strict//",
536-
"-//microsoft//dtd internet explorer 2.0 html//",
537-
"-//microsoft//dtd internet explorer 2.0 tables//",
538-
"-//microsoft//dtd internet explorer 3.0 html strict//",
539-
"-//microsoft//dtd internet explorer 3.0 html//",
540-
"-//microsoft//dtd internet explorer 3.0 tables//",
541-
"-//netscape comm. corp.//dtd html//",
542-
"-//netscape comm. corp.//dtd strict html//",
543-
"-//o'reilly and associates//dtd html 2.0//",
544-
"-//o'reilly and associates//dtd html extended 1.0//",
545-
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
546-
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
547-
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
548-
"-//spyglass//dtd html 2.0 extended//",
549-
"-//sq//dtd html 2.0 hotmetal + extensions//",
550-
"-//sun microsystems corp.//dtd hotjava html//",
551-
"-//sun microsystems corp.//dtd hotjava strict html//",
552-
"-//w3c//dtd html 3 1995-03-24//",
553-
"-//w3c//dtd html 3.2 draft//",
554-
"-//w3c//dtd html 3.2 final//",
555-
"-//w3c//dtd html 3.2//",
556-
"-//w3c//dtd html 3.2s draft//",
557-
"-//w3c//dtd html 4.0 frameset//",
558-
"-//w3c//dtd html 4.0 transitional//",
559-
"-//w3c//dtd html experimental 19960712//",
560-
"-//w3c//dtd html experimental 970421//",
561-
"-//w3c//dtd w3 html//",
562-
"-//w3o//dtd w3 html 3.0//",
563-
"-//webtechs//dtd mozilla html 2.0//",
564-
"-//webtechs//dtd mozilla html//"))
521+
("+//silmaril//dtd html pro v0r11 19970101//",
522+
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
523+
"-//as//dtd html 3.0 aswedit + extensions//",
524+
"-//ietf//dtd html 2.0 level 1//",
525+
"-//ietf//dtd html 2.0 level 2//",
526+
"-//ietf//dtd html 2.0 strict level 1//",
527+
"-//ietf//dtd html 2.0 strict level 2//",
528+
"-//ietf//dtd html 2.0 strict//",
529+
"-//ietf//dtd html 2.0//",
530+
"-//ietf//dtd html 2.1e//",
531+
"-//ietf//dtd html 3.0//",
532+
"-//ietf//dtd html 3.2 final//",
533+
"-//ietf//dtd html 3.2//",
534+
"-//ietf//dtd html 3//",
535+
"-//ietf//dtd html level 0//",
536+
"-//ietf//dtd html level 1//",
537+
"-//ietf//dtd html level 2//",
538+
"-//ietf//dtd html level 3//",
539+
"-//ietf//dtd html strict level 0//",
540+
"-//ietf//dtd html strict level 1//",
541+
"-//ietf//dtd html strict level 2//",
542+
"-//ietf//dtd html strict level 3//",
543+
"-//ietf//dtd html strict//",
544+
"-//ietf//dtd html//",
545+
"-//metrius//dtd metrius presentational//",
546+
"-//microsoft//dtd internet explorer 2.0 html strict//",
547+
"-//microsoft//dtd internet explorer 2.0 html//",
548+
"-//microsoft//dtd internet explorer 2.0 tables//",
549+
"-//microsoft//dtd internet explorer 3.0 html strict//",
550+
"-//microsoft//dtd internet explorer 3.0 html//",
551+
"-//microsoft//dtd internet explorer 3.0 tables//",
552+
"-//netscape comm. corp.//dtd html//",
553+
"-//netscape comm. corp.//dtd strict html//",
554+
"-//o'reilly and associates//dtd html 2.0//",
555+
"-//o'reilly and associates//dtd html extended 1.0//",
556+
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
557+
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
558+
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
559+
"-//spyglass//dtd html 2.0 extended//",
560+
"-//sq//dtd html 2.0 hotmetal + extensions//",
561+
"-//sun microsystems corp.//dtd hotjava html//",
562+
"-//sun microsystems corp.//dtd hotjava strict html//",
563+
"-//w3c//dtd html 3 1995-03-24//",
564+
"-//w3c//dtd html 3.2 draft//",
565+
"-//w3c//dtd html 3.2 final//",
566+
"-//w3c//dtd html 3.2//",
567+
"-//w3c//dtd html 3.2s draft//",
568+
"-//w3c//dtd html 4.0 frameset//",
569+
"-//w3c//dtd html 4.0 transitional//",
570+
"-//w3c//dtd html experimental 19960712//",
571+
"-//w3c//dtd html experimental 970421//",
572+
"-//w3c//dtd w3 html//",
573+
"-//w3o//dtd w3 html 3.0//",
574+
"-//webtechs//dtd mozilla html 2.0//",
575+
"-//webtechs//dtd mozilla html//"))
565576
or publicId in
566577
("-//w3o//dtd w3 html strict 3.0//en//",
567578
"-/w3c/dtd html 4.0 transitional/en",
@@ -1205,8 +1216,7 @@ def startTagIsIndex(self, token):
12051216
attributes["name"] = "isindex"
12061217
self.processStartTag(impliedTagToken("input", "StartTag",
12071218
attributes=attributes,
1208-
selfClosing=
1209-
token["selfClosing"]))
1219+
selfClosing=token["selfClosing"]))
12101220
self.processEndTag(impliedTagToken("label"))
12111221
self.processStartTag(impliedTagToken("hr", "StartTag"))
12121222
self.processEndTag(impliedTagToken("form"))

html5lib/inputstream.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import absolute_import, division, unicode_literals
22
from six import text_type
3+
from six.moves import http_client
34

45
import codecs
56
import re
@@ -63,11 +64,11 @@ def tell(self):
6364
return pos
6465

6566
def seek(self, pos):
66-
assert pos < self._bufferedBytes()
67+
assert pos <= self._bufferedBytes()
6768
offset = pos
6869
i = 0
6970
while len(self.buffer[i]) < offset:
70-
offset -= pos
71+
offset -= len(self.buffer[i])
7172
i += 1
7273
self.position = [i, offset]
7374

@@ -114,11 +115,15 @@ def _readFromBuffer(self, bytes):
114115
if remainingBytes:
115116
rv.append(self._readStream(remainingBytes))
116117

117-
return "".join(rv)
118+
return b"".join(rv)
118119

119120

120121
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
121-
if hasattr(source, "read"):
122+
if isinstance(source, http_client.HTTPResponse):
123+
# Work around Python bug #20007: read(0) closes the connection.
124+
# http://bugs.python.org/issue20007
125+
isUnicode = False
126+
elif hasattr(source, "read"):
122127
isUnicode = isinstance(source.read(0), text_type)
123128
else:
124129
isUnicode = isinstance(source, text_type)

html5lib/sanitizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ def sanitize_css(self, style):
245245
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
246246
'padding']:
247247
for keyword in value.split():
248-
if not keyword in self.acceptable_css_keywords and \
248+
if keyword not in self.acceptable_css_keywords and \
249249
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
250250
break
251251
else:

html5lib/serializer/htmlserializer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
v = utils.surrogatePairToCodepoint(v)
3737
else:
3838
v = ord(v)
39-
if not v in encode_entity_map or k.islower():
39+
if v not in encode_entity_map or k.islower():
4040
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
4141
encode_entity_map[v] = k
4242

@@ -301,7 +301,7 @@ def serialize(self, treewalker, encoding=None):
301301
elif type == "Entity":
302302
name = token["name"]
303303
key = name + ";"
304-
if not key in entities:
304+
if key not in entities:
305305
self.serializeError(_("Entity %s not recognized" % name))
306306
if self.resolve_entities and key not in xmlEntities:
307307
data = entities[key]

html5lib/tests/test_encoding.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,11 @@ def test_codec_name_d(self):
2828

2929
def runParserEncodingTest(data, encoding):
3030
p = HTMLParser()
31+
assert p.documentEncoding is None
3132
p.parse(data, useChardet=False)
3233
encoding = encoding.lower().decode("ascii")
3334

34-
assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0])
35+
assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
3536

3637

3738
def runPreScanEncodingTest(data, encoding):

0 commit comments

Comments
 (0)