Skip to content

Commit f4f1fb8

Browse files
fantasai authored and gsnedders committed
Google Code Issue 157: Add "escape invisible characters" option
Vaguely updated, but basically working.
1 parent 073d792 commit f4f1fb8

File tree

3 files changed

+51
-1
lines changed

3 files changed

+51
-1
lines changed

html5lib/constants.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import gettext
55
_ = gettext.gettext
66

7+
from itertools import chain
8+
9+
710
EOF = None
811

912
E = {
@@ -3078,6 +3081,19 @@
30783081
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
30793082

30803083

3084+
# Code points that escapeInvisible() treats as "invisible".  Tab (0x9),
# LF (0xA), CR (0xD) and the ASCII space (0x20) are deliberately left out.
invisibleChars = frozenset().union(
    # ASCII control chars below Tab
    range(0x0, 0x9),
    # ASCII control chars between LF and CR (VT, FF)
    range(0xB, 0xD),
    # remaining ASCII control chars, up to but excluding the space
    range(0xE, 0x20),
    # fixed-width spaces, zero-width marks, bidi marks
    range(0x2000, 0x2010),
    # LS, PS, bidi control codes
    range(0x2028, 0x2030),
    # individual code points: nbsp, mathsp, ideosp, WJ, interlinear
    (0x00A0, 0x205F, 0x3000, 0x2060, 0xFFF9, 0xFFFA, 0xFFFB),
)
3095+
3096+
30813097
class DataLossWarning(UserWarning):
30823098
pass
30833099

html5lib/serializer/htmlserializer.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ class HTMLSerializer(object):
9494
# escaping options
9595
escape_lt_in_attrs = False
9696
escape_rcdata = False
97+
escape_invisible = False
9798
resolve_entities = True
9899

99100
# miscellaneous options
@@ -105,7 +106,8 @@ class HTMLSerializer(object):
105106
"minimize_boolean_attributes", "use_trailing_solidus",
106107
"space_before_trailing_solidus", "omit_optional_tags",
107108
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
108-
"escape_rcdata", "resolve_entities", "sanitize")
109+
"escape_rcdata", "escape_invisible", "resolve_entities",
110+
"sanitize")
109111

110112
def __init__(self, **kwargs):
111113
"""Initialize HTMLSerializer.
@@ -127,6 +129,10 @@ def __init__(self, **kwargs):
127129
escape_rcdata=False|True
128130
Whether to escape characters that need to be escaped within normal
129131
elements within rcdata elements such as style.
132+
escape_invisible=False|True|'numeric'|'named'
133+
Whether to escape invisible characters (such as nbsp, fixed-width
134+
spaces, and control codes). Uses named HTML escapes if 'named'
135+
is specified, otherwise uses numeric codes.
130136
resolve_entities=True|False
131137
Whether to resolve named character entities that appear in the
132138
source tree. The XML predefined entities < > & " '
@@ -160,6 +166,8 @@ def __init__(self, **kwargs):
160166

161167
def encode(self, string):
162168
assert(isinstance(string, text_type))
169+
if self.escape_invisible:
170+
text = utils.escapeInvisible(text, self.escape_invisible == 'named')
163171
if self.encoding:
164172
return string.encode(self.encoding, unicode_encode_errors)
165173
else:

html5lib/utils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from types import ModuleType
44

5+
from .constants import invisibleChars
6+
57

68
class MethodDispatcher(dict):
79
"""Dict with 2 special properties:
@@ -71,3 +73,27 @@ def moduleFactory(baseModule, *args, **kwargs):
7173
return mod
7274

7375
return moduleFactory
76+
77+
78+
def escapeInvisible(text, useNamedEntities=False):
    """Escape invisible characters other than Tab, LF, CR, and ASCII space

    Every character of ``text`` whose code point is in ``invisibleChars`` is
    replaced by a character reference.

    text -- the unicode string to escape.
    useNamedEntities -- when true, use a named reference (e.g. ``&nbsp;``)
        for characters that the standard library's ``codepoint2name`` table
        has a name for, and a hexadecimal reference for the rest.  When
        false (the default), always use hexadecimal references
        (e.g. ``&#xA0;``).

    Returns the escaped string.  When nothing needs escaping the input
    object itself is returned unchanged.
    """
    # Resolve the unicode text type locally (``unicode`` on Python 2, ``str``
    # on Python 3) so the check does not rely on a module-level name.
    text_type = type(u"")
    assert isinstance(text, text_type)

    # Collect only the escapable characters that actually occur: the common
    # case is that there are none, and then the text is returned untouched.
    escapable = set(c for c in text if ord(c) in invisibleChars)
    if not escapable:
        return text

    if useNamedEntities:
        # The entity table moved between Python 2 and Python 3.
        try:
            from html.entities import codepoint2name
        except ImportError:
            from htmlentitydefs import codepoint2name
    else:
        # An empty table makes every lookup fall through to the numeric form.
        codepoint2name = {}

    # Build a translation table so the text is rewritten in a single pass
    # instead of one str.replace() pass per escapable character.
    table = {}
    for c in escapable:
        codepoint = ord(c)
        name = codepoint2name.get(codepoint)
        escape = "&%s;" % name if name else "&#x%X;" % codepoint
        # translate() on a Python 2 unicode string needs unicode values.
        table[codepoint] = text_type(escape)
    return text.translate(table)

0 commit comments

Comments
 (0)