Skip to content

Commit e65c433

Browse files
committed
Start of Cythonizing the tokenizer
1 parent 8b89668 commit e65c433

16 files changed

+978
-81
lines changed

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ include .pytest.expect
77
include tox.ini
88
include pytest.ini
99
graft html5lib/tests/testdata
10+
recursive-include html5lib *.pxd
1011
recursive-include html5lib/tests *.py

benchmarks/bench_html.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import pyperf
66

7-
sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
7+
#sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
88
import html5lib # noqa: E402
99

1010

@@ -49,9 +49,9 @@ def add_cmdline_args(cmd, args):
4949
source = io.BytesIO(fh.read())
5050

5151
if "parse" in benchmarks:
52-
for tb in ("etree", "dom", "lxml"):
52+
for tb in ("etree",):
5353
runner.bench_func("html_parse_%s" % tb, bench_parse, source, tb)
5454

55-
if "serialize" in benchmarks:
56-
for tb in ("etree", "dom", "lxml"):
57-
runner.bench_time_func("html_serialize_%s" % tb, bench_serialize, source, tb)
55+
# if "serialize" in benchmarks:
56+
# for tb in ("etree",):
57+
# runner.bench_time_func("html_serialize_%s" % tb, bench_serialize, source, tb)

benchmarks/bench_wpt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import pyperf
66

7-
sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
7+
#sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
88
import html5lib # noqa: E402
99

1010

html5lib/_inputstream.pxd

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# cython: language_level=3
2+
cimport cython
3+
from cpython cimport array
4+
5+
# rCEf: C function-pointer type taking (HTMLUnicodeInputStream, unicode) and
# returning void. `except *` means the caller checks for a raised exception
# after every call, since a void return cannot carry an error value.
# It is the declared type of HTMLUnicodeInputStream.reportCharacterErrors.
ctypedef void (*rCEf)(HTMLUnicodeInputStream, unicode) except *
6+
7+
# Module-level variable statically typed as a dict.
cdef dict charsUntilCache
8+
9+
# Cython declaration of the BufferedStream extension type.
# The three attributes are untyped Python objects. tell/seek/read are cpdef,
# so they remain callable from Python; the underscore-prefixed helpers are
# cdef and therefore only callable from Cython/C code.
cdef class BufferedStream(object):
10+
cdef object stream
11+
cdef object buffer
12+
cdef object position
13+
cpdef object tell(self)
14+
cpdef object seek(self, object pos)
15+
cpdef object read(self, object bytes)
16+
cdef object _bufferedBytes(self)
17+
cdef object _readStream(self, object bytes)
18+
cdef object _readFromBuffer(self, object bytes)
19+
20+
#def HTMLInputStream(source, object **kwargs)
21+
22+
# Cython declaration of the HTMLUnicodeInputStream extension type.
# Attributes declared `readonly` (charEncoding, errors) can be read but not
# assigned from Python; all other attributes are visible to Cython/C code only.
# Methods declared cpdef (position, char, charsUntil, unget) stay callable from
# Python; the cdef methods are C-level only.
cdef class HTMLUnicodeInputStream(object):
23+
# C function pointer of type rCEf (see the ctypedef near the top of this file).
cdef rCEf reportCharacterErrors
24+
cdef object newLines
25+
cdef readonly object charEncoding
26+
cdef object dataStream
27+
cdef unicode chunk
28+
cdef Py_ssize_t chunkSize
29+
cdef Py_ssize_t chunkOffset
30+
cdef readonly list errors
31+
32+
# number of (complete) lines in previous chunks
33+
cdef Py_ssize_t prevNumLines
34+
# number of columns in the last line of the previous chunk
35+
cdef Py_ssize_t prevNumCols
36+
37+
# Deal with CR LF and surrogates split over chunk boundaries
38+
cdef unicode _bufferedCharacter
39+
40+
cdef object reset(self)
41+
cdef object openStream(self, object source)
42+
43+
# @cython.locals assigns C types to local variables of the method that follows.
@cython.locals(nLines=Py_ssize_t, lastLinePos=Py_ssize_t)
44+
cdef tuple _position(self, Py_ssize_t offset)
45+
cpdef tuple position(self)
46+
47+
@cython.locals(chunkOffset=Py_ssize_t, char=unicode)
48+
cpdef unicode char(self)
49+
50+
@cython.locals(data=unicode)
51+
# `chunkSize=?` marks an optional argument whose default value is given in the
# implementation; `except? -1` makes -1 the "an exception may have occurred"
# return value that callers test for.
cdef bint readChunk(self, Py_ssize_t chunkSize=?) except? -1
52+
53+
# NOTE(review): `ulong` is not declared or cimported anywhere in this file --
# confirm that Cython resolves it here (otherwise spell it `unsigned long`).
@cython.locals(c=ulong)
54+
# Both signatures match the rCEf function-pointer type (self, unicode) -> void
# with `except *`.
cdef void characterErrorsUCS4(self, unicode data) except *
55+
cdef void characterErrorsUCS2(self, unicode data) except *
56+
57+
cpdef object charsUntil(self, object characters, bint opposite=?)
58+
cpdef object unget(self, object char)
59+
60+
# Cython declaration of HTMLBinaryInputStream, a subclass of
# HTMLUnicodeInputStream. It re-declares reset/openStream (also declared on the
# base class) and adds its own attributes and methods. numBytesMeta and
# numBytesChardet are readable from Python (`readonly`); changeEncoding is the
# only method declared cpdef and hence the only one here callable from Python.
cdef class HTMLBinaryInputStream(HTMLUnicodeInputStream):
61+
cdef object rawStream
62+
cdef readonly object numBytesMeta
63+
cdef readonly object numBytesChardet
64+
cdef object override_encoding
65+
cdef object transport_encoding
66+
cdef object same_origin_parent_encoding
67+
cdef object likely_encoding
68+
cdef object default_encoding
69+
cdef object reset(self)
70+
cdef object openStream(self, object source)
71+
# `chardet=?`: optional argument; its default lives in the implementation.
cdef object determineEncoding(self, object chardet=?)
72+
cpdef object changeEncoding(self, object newEncoding)
73+
# Types the local variable `string` of detectBOM as bytes.
@cython.locals(string=bytes)
74+
cdef object detectBOM(self)
75+
cdef object detectEncodingMeta(self)
76+
77+
# cdef class EncodingBytes(bytes):
78+
# cdef object previous(self)
79+
# cdef object setPosition(self, object position)
80+
# cdef object getPosition(self)
81+
# cdef object getCurrentByte(self)
82+
# cdef object skip(self, object chars=?)
83+
# cdef object skipUntil(self, object chars)
84+
# cdef object matchBytes(self, object bytes)
85+
# cdef object jumpTo(self, object bytes)
86+
87+
# encstate: C function-pointer type taking an EncodingParser and returning
# bint, with -1 as the "an exception may have occurred" return value. It is the
# declared type of the local `func` in EncodingParser.getEncoding.
ctypedef bint (*encstate)(EncodingParser) except? -1
88+
89+
# Cython declaration of the EncodingParser extension type.
# Every method is cdef (C-level only). The handle* methods that take only
# `self` have the same signature as the encstate function-pointer type;
# handlePossibleTag differs by taking an extra `endTag` flag.
cdef class EncodingParser(object):
90+
cdef object data
91+
cdef object encoding
92+
93+
# getEncoding's local `func` is typed as an encstate function pointer.
@cython.locals(func=encstate, keepParsing=bint)
94+
cdef object getEncoding(self)
95+
cdef bint handleComment(self) except? -1
96+
@cython.locals(hasPragma=bint, name=bytes, value=bytes, tentativeEncoding=bytes)
97+
cdef bint handleMeta(self) except? -1
98+
cdef bint handlePossibleStartTag(self) except? -1
99+
cdef bint handlePossibleEndTag(self) except? -1
100+
cdef bint handlePossibleTag(self, bint endTag) except? -1
101+
cdef bint handleOther(self) except? -1
102+
@cython.locals(c=bytes)
103+
cdef tuple getAttribute(self)
104+
105+
# Cython declaration of the ContentAttrParser extension type: one untyped
# `data` attribute and a single `parse` method.
cdef class ContentAttrParser(object):
106+
cdef object data
107+
cpdef object parse(self) # this needs to be cpdef (not cdef) so the tests can call it from Python
108+
109+
# Module-level function declared cdef: callable from Cython/C code only, takes
# and returns untyped Python objects.
cdef object lookupEncoding(object encoding)

0 commit comments

Comments
 (0)