Skip to content

Commit 1523af2

Browse files
committed
Turn on chardet by default, if installed
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40524
1 parent 0a221a4 commit 1523af2

File tree

3 files changed

+3
-22
lines changed

3 files changed

+3
-22
lines changed

src/__init__.py

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,25 +10,6 @@
1010
f = open("my_document.html")
1111
p = html5lib.HTMLParser()
1212
tree = p.parse(f)
13-
14-
By default the returned treeformat is a custom "simpletree", similar
15-
to a DOM tree; each element has attributes childNodes and parent
16-
holding the children and parent respectively, a name attribute
17-
holding the Element name, a data attribute holding the element data
18-
(for text and comment nodes) and an attributes dictionary holding the
19-
element's attributes (for Element nodes).
20-
21-
To get output in ElementTree format:
22-
23-
import html5lib
24-
from html5lib.treebuilders import etree
25-
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
26-
elementtree = p.parse(f)
27-
28-
Note: Because HTML documents support various features not in the
29-
default ElementTree (e.g. doctypes), we supply our own simple
30-
serializer; html5lib.treebuilders.etree.tostring. At present this does not
31-
have the encoding support offered by the elementtree serializer.
32-
3313
"""
3414
from html5parser import HTMLParser
15+
from liberalxmlparser import XMLParser, XHTMLParser

src/inputstream.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class HTMLInputStream(object):
1414
1515
"""
1616

17-
def __init__(self, source, encoding=None, chardet=False):
17+
def __init__(self, source, encoding=None, chardet=True):
1818
"""Initialises the HTMLInputStream.
1919
2020
HTMLInputStream(source, [encoding]) -> Normalized stream from source

tests/test_encoding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class TestCase(unittest.TestCase):
4242
def runEncodingTest(self, input, encoding):
4343
#XXX - move this out into the setup function
4444
#concatenate all consecutive character tokens into a single token
45-
stream = inputstream.HTMLInputStream(input)
45+
stream = inputstream.HTMLInputStream(input, chardet=False)
4646

4747
errorMsg = "\n".join(["\n\nInput", input,"\nExpected:", encoding,
4848
"\nRecieved:", stream.charEncoding])

0 commit comments

Comments
 (0)