Skip to content

Commit 768ba79

Browse files
committed
More stuff working, including treewalkers, parts of parse.py dom, (c)ElementTree
--HG-- branch : svgmathml extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401266
1 parent 10b9010 commit 768ba79

31 files changed

+303
-264
lines changed

parse.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3.0
22
"""usage: %prog [options] filename
33
44
Parse a document to a simpletree tree, with optional profiling
@@ -9,11 +9,16 @@
99
import os
1010
from optparse import OptionParser
1111

12+
print(sys.stdout.encoding)
13+
1214
#RELEASE remove
1315
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
1416
#END RELEASE
15-
from html5lib import html5parser, liberalxmlparser, sanitizer
17+
print(sys.path)
18+
import html5lib
19+
import html5lib.html5parser as html5parser
1620
from html5lib.tokenizer import HTMLTokenizer
21+
from html5lib import treebuilders
1722
from html5lib import treebuilders, serializer, treewalkers
1823
from html5lib import constants
1924

@@ -27,8 +32,8 @@ def parse():
2732
# Try opening from the internet
2833
if f.startswith('http://'):
2934
try:
30-
import urllib, cgi
31-
f = urllib.urlopen(f)
35+
from urllib import request
36+
f = request.urlopen(f)
3237
contentType = f.headers.get('content-type')
3338
if contentType:
3439
(mediaType, params) = cgi.parse_header(contentType)
@@ -39,7 +44,7 @@ def parse():
3944
else:
4045
try:
4146
# Try opening from file system
42-
f = open(f)
47+
f = open(f, "rb")
4348
except IOError: pass
4449
except IndexError:
4550
sys.stderr.write("No filename provided. Use -h for help\n")
@@ -64,16 +69,16 @@ def parse():
6469

6570
if opts.profile:
6671
#XXX should import cProfile instead and use that
67-
import hotshot
68-
import hotshot.stats
69-
prof = hotshot.Profile('stats.prof')
70-
prof.runcall(parseMethod, f, encoding=encoding)
72+
try:
73+
import cProfile as profile
74+
except ImportError:
75+
import profile
76+
import pstats
77+
prof = profile.run('parseMethod(f, encoding=encoding)', 'prof.out')
7178
prof.close()
7279
# XXX - We should use a temp file here
73-
stats = hotshot.stats.load('stats.prof')
74-
stats.strip_dirs()
75-
stats.sort_stats('time')
76-
stats.print_stats()
80+
stats = pstats.stats('prof.out')
81+
stats.strip_dirs().sort_stats('time').print_stats()
7782
elif opts.time:
7883
import time
7984
t0 = time.time()
@@ -88,13 +93,14 @@ def parse():
8893

8994
def printOutput(parser, document, opts):
9095
if opts.encoding:
91-
print "Encoding:", parser.tokenizer.stream.charEncoding
96+
print("Encoding:", parser.tokenizer.stream.charEncoding)
9297
if opts.xml:
9398
sys.stdout.write(document.toxml("utf-8"))
9499
elif opts.tree:
95100
if not hasattr(document,'__getitem__'): document = [document]
96101
for fragment in document:
97-
print parser.tree.testSerializer(fragment).encode("utf-8")
102+
sys.stdout.write(parser.tree.testSerializer(fragment))
103+
sys.stdout.write("\n")
98104
elif opts.hilite:
99105
sys.stdout.write(document.hilite("utf-8"))
100106
elif opts.html:
@@ -103,7 +109,7 @@ def printOutput(parser, document, opts):
103109
kwargs[opt] = getattr(opts,opt)
104110
if not kwargs['quote_char']: del kwargs['quote_char']
105111
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
106-
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding='utf-8'):
112+
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
107113
sys.stdout.write(text)
108114
if not text.endswith('\n'): sys.stdout.write('\n')
109115
if opts.error:

src/html5lib/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,9 @@
1010
f = open("my_document.html")
1111
tree = html5lib.parse(f)
1212
"""
13-
print(__path__)
1413

15-
#from .html5parser import HTMLParser, parse
16-
#from treebuilders import getTreeBuilder
14+
from .html5parser import HTMLParser, parse
15+
from .treebuilders import getTreeBuilder
1716

1817
#from .liberalxmlparser import XMLParser, XHTMLParser
1918

src/html5lib/filters/formfiller.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
55
#
66

7-
import _base
7+
from . import _base
88

99
from html5lib.constants import spaceCharacters
10-
spaceCharacters = u"".join(spaceCharacters)
10+
spaceCharacters = "".join(spaceCharacters)
1111

1212
class SimpleFilter(_base.Filter):
1313
def __init__(self, source, fieldStorage):
@@ -29,13 +29,13 @@ def __iter__(self):
2929
input_checked_index = -1
3030
for i,(n,v) in enumerate(token["data"]):
3131
n = n.lower()
32-
if n == u"name":
32+
if n == "name":
3333
field_name = v.strip(spaceCharacters)
34-
elif n == u"type":
34+
elif n == "type":
3535
field_type = v.strip(spaceCharacters)
36-
elif n == u"checked":
36+
elif n == "checked":
3737
input_checked_index = i
38-
elif n == u"value":
38+
elif n == "value":
3939
input_value_index = i
4040

4141
value_list = self.fieldStorage.getlist(field_name)
@@ -45,20 +45,20 @@ def __iter__(self):
4545
else:
4646
value = ""
4747

48-
if field_type in (u"checkbox", u"radio"):
48+
if field_type in ("checkbox", "radio"):
4949
if value_list:
5050
if token["data"][input_value_index][1] == value:
5151
if input_checked_index < 0:
52-
token["data"].append((u"checked", u""))
52+
token["data"].append(("checked", ""))
5353
field_indices[field_name] = field_index + 1
5454
elif input_checked_index >= 0:
5555
del token["data"][input_checked_index]
5656

57-
elif field_type not in (u"button", u"submit", u"reset"):
57+
elif field_type not in ("button", "submit", "reset"):
5858
if input_value_index >= 0:
59-
token["data"][input_value_index] = (u"value", value)
59+
token["data"][input_value_index] = ("value", value)
6060
else:
61-
token["data"].append((u"value", value))
61+
token["data"].append(("value", value))
6262
field_indices[field_name] = field_index + 1
6363

6464
field_type = None
@@ -96,7 +96,7 @@ def __iter__(self):
9696
value = ""
9797
if (is_select_multiple or not is_selected_option_found) and option_value == value:
9898
if option_selected_index < 0:
99-
token["data"].append((u"selected", u""))
99+
token["data"].append(("selected", ""))
100100
field_indices[field_name] = field_index + 1
101101
is_selected_option_found = True
102102
elif option_selected_index >= 0:

src/html5lib/filters/inject_meta_charset.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import _base
1+
from . import _base
22

33
class Filter(_base.Filter):
44
def __init__(self, source, encoding):
@@ -23,7 +23,7 @@ def __iter__(self):
2323
content_index = -1
2424
for i,(name,value) in enumerate(token["data"]):
2525
if name.lower() == 'charset':
26-
token["data"][i] = (u'charset', self.encoding)
26+
token["data"][i] = ('charset', self.encoding)
2727
meta_found = True
2828
break
2929
elif name == 'http-equiv' and value.lower() == 'content-type':
@@ -32,7 +32,7 @@ def __iter__(self):
3232
content_index = i
3333
else:
3434
if has_http_equiv_content_type and content_index >= 0:
35-
token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
35+
token["data"][content_index] = ('content', 'text/html; charset=%s' % self.encoding)
3636
meta_found = True
3737

3838
elif token["name"].lower() == "head" and not meta_found:

src/html5lib/filters/iso639codes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -746,4 +746,4 @@ def isValidLangCode(value):
746746
lang, sublang = value.split('-', 1)
747747
else:
748748
lang = value
749-
return isoLang.has_key(unicode.lower(unicode(lang)))
749+
return str.lower(str(lang)) in isoLang

src/html5lib/filters/lint.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from gettext import gettext
22
_ = gettext
33

4-
import _base
4+
from . import _base
55
from html5lib.constants import cdataElements, rcdataElements, voidElements
66

77
from html5lib.constants import spaceCharacters
8-
spaceCharacters = u"".join(spaceCharacters)
8+
spaceCharacters = "".join(spaceCharacters)
99

1010
class LintError(Exception): pass
1111

@@ -19,22 +19,22 @@ def __iter__(self):
1919
name = token["name"]
2020
if contentModelFlag != "PCDATA":
2121
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
22-
if not isinstance(name, unicode):
23-
raise LintError(_(u"Tag name is not a string: %r") % name)
22+
if not isinstance(name, str):
23+
raise LintError(_("Tag name is not a string: %r") % name)
2424
if not name:
25-
raise LintError(_(u"Empty tag name"))
25+
raise LintError(_("Empty tag name"))
2626
if type == "StartTag" and name in voidElements:
27-
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
27+
raise LintError(_("Void element reported as StartTag token: %s") % name)
2828
elif type == "EmptyTag" and name not in voidElements:
29-
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
29+
raise LintError(_("Non-void element reported as EmptyTag token: %s") % token["name"])
3030
if type == "StartTag":
3131
open_elements.append(name)
3232
for name, value in token["data"]:
33-
if not isinstance(name, unicode):
33+
if not isinstance(name, str):
3434
raise LintError(_("Attribute name is not a string: %r") % name)
3535
if not name:
36-
raise LintError(_(u"Empty attribute name"))
37-
if not isinstance(value, unicode):
36+
raise LintError(_("Empty attribute name"))
37+
if not isinstance(value, str):
3838
raise LintError(_("Attribute value is not a string: %r") % value)
3939
if name in cdataElements:
4040
contentModelFlag = "CDATA"
@@ -45,15 +45,15 @@ def __iter__(self):
4545

4646
elif type == "EndTag":
4747
name = token["name"]
48-
if not isinstance(name, unicode):
49-
raise LintError(_(u"Tag name is not a string: %r") % name)
48+
if not isinstance(name, str):
49+
raise LintError(_("Tag name is not a string: %r") % name)
5050
if not name:
51-
raise LintError(_(u"Empty tag name"))
51+
raise LintError(_("Empty tag name"))
5252
if name in voidElements:
53-
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
53+
raise LintError(_("Void element reported as EndTag token: %s") % name)
5454
start_name = open_elements.pop()
5555
if start_name != name:
56-
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
56+
raise LintError(_("EndTag (%s) does not match StartTag (%s)") % (name, start_name))
5757
contentModelFlag = "PCDATA"
5858

5959
elif type == "Comment":
@@ -62,27 +62,27 @@ def __iter__(self):
6262

6363
elif type in ("Characters", "SpaceCharacters"):
6464
data = token["data"]
65-
if not isinstance(data, unicode):
65+
if not isinstance(data, str):
6666
raise LintError(_("Attribute name is not a string: %r") % data)
6767
if not data:
68-
raise LintError(_(u"%s token with empty data") % type)
68+
raise LintError(_("%s token with empty data") % type)
6969
if type == "SpaceCharacters":
7070
data = data.strip(spaceCharacters)
7171
if data:
72-
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
72+
raise LintError(_("Non-space character(s) found in SpaceCharacters token: ") % data)
7373

7474
elif type == "Doctype":
7575
name = token["name"]
7676
if contentModelFlag != "PCDATA":
7777
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
78-
if not isinstance(name, unicode):
79-
raise LintError(_(u"Tag name is not a string: %r") % name)
78+
if not isinstance(name, str):
79+
raise LintError(_("Tag name is not a string: %r") % name)
8080
# XXX: what to do with token["data"] ?
8181

8282
elif type in ("ParseError", "SerializeError"):
8383
pass
8484

8585
else:
86-
raise LintError(_(u"Unknown token type: %s") % type)
86+
raise LintError(_("Unknown token type: %s") % type)
8787

8888
yield token

src/html5lib/filters/optionaltags.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import _base
1+
from . import _base
22

33
class Filter(_base.Filter):
44
def slider(self):

src/html5lib/filters/sanitizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import _base
1+
from . import _base
22
from html5lib.sanitizer import HTMLSanitizerMixin
33

44
class Filter(_base.Filter, HTMLSanitizerMixin):

0 commit comments

Comments
 (0)