Skip to content

Commit 85d6960

Browse files
committed
Char encoding now passes existing unit tests: it has been enabled but may not get encoding right
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40505
1 parent 0fb5b14 commit 85d6960

File tree

1 file changed

+41
-13
lines changed

1 file changed

+41
-13
lines changed

src/inputstream.py

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,9 @@ def detectEncoding(self):
8383

8484
#If there is no BOM need to look for meta elements with encoding
8585
#information
86-
#encoding = self.detectEncodingMeta()
87-
#if encoding is not None:
88-
# return encoding
86+
encoding = self.detectEncodingMeta()
87+
if encoding is not None:
88+
return encoding
8989

9090
#Guess with chardet, if available
9191
try:
@@ -345,6 +345,7 @@ def getAttribute(self):
345345
attrParser = AttrParser(self.data[self.position:])
346346
attr = attrParser.parse()
347347
self.position += attrParser.position
348+
#print attr, attrParser.position, self.data[self.position]
348349
return attr
349350

350351
def isValidEncoding(self, encoding):
@@ -426,6 +427,8 @@ def parse(self):
426427
class AttrParser(FragmentParser):
427428
def parse(self):
428429
self.skip(list(spaceCharacters)+["/"])
430+
if self.position == len(self.fragment):
431+
return None
429432
if self.fragment[self.position] == "<":
430433
self.position -= 1
431434
return None
@@ -434,41 +437,63 @@ def parse(self):
434437
attrName = []
435438
attrValue = []
436439
spaceFound = False
440+
#Step 5 attribute name
437441
while True:
438-
if self.fragment[self.position] == "=" and attrName:
442+
if self.position == len(self.fragment):
443+
return "".join(attrName), ""
444+
elif self.fragment[self.position] == "=" and attrName:
439445
break
440446
elif self.fragment[self.position] in spaceCharacters:
441447
spaceFound=True
442448
break
443449
elif self.fragment[self.position] in ("/", "<", ">"):
444-
self.position -= 1
450+
#self.position -= 1
445451
return "".join(attrName), ""
446452
elif self.fragment[self.position] in asciiUppercase:
447453
attrName.extend(self.fragment[self.position].lower())
448454
else:
449455
attrName.extend(self.fragment[self.position])
456+
#Step 6
450457
self.position += 1
458+
#Step 7
451459
if spaceFound:
452460
self.skip()
461+
if self.position == len(self.fragment):
462+
return "".join(attrName), ""
463+
#Step 8
453464
if self.fragment[self.position] != "=":
454-
self.position -= 1
465+
#self.position -= 1
455466
return "".join(attrName), ""
456467
#XXX need to advance position in both spaces and value case
468+
#Step 9
457469
self.position += 1
470+
#Step 10
458471
self.skip()
472+
#XXX Need to exit if we go past the end of the fragment
473+
if self.position == len(self.fragment):
474+
return "".join(attrName), ""
475+
#Step 11
459476
if self.fragment[self.position] in ("'", '"'):
477+
#11.1
460478
quoteChar = self.fragment[self.position]
461-
self.position += 1
462479
while True:
463-
if self.fragment[self.position] == quoteChar:
480+
#11.2
481+
self.position += 1
482+
if self.position == len(self.fragment):
483+
return "".join(attrName), "".join(attrValue)
484+
#11.3
485+
elif self.fragment[self.position] == quoteChar:
486+
#XXX Not in spec
487+
self.position += 1
464488
return "".join(attrName), "".join(attrValue)
489+
#11.4
465490
elif self.fragment[self.position] in asciiUppercase:
466491
attrValue.extend(self.fragment[self.position].lower())
492+
#11.5
467493
else:
468494
attrValue.extend(self.fragment[self.position])
469-
self.position += 1
470495
elif self.fragment[self.position] in (">", '<'):
471-
self.position -= 1
496+
#self.position -= 1
472497
return "".join(attrName), ""
473498
elif self.fragment[self.position] in asciiUppercase:
474499
attrValue.extend(self.fragment[self.position].lower())
@@ -477,10 +502,13 @@ def parse(self):
477502
#XXX I think this next bit is right but there is a bug in the spec
478503
while True:
479504
self.position +=1
480-
if self.fragment[self.position] in (
505+
if self.position == len(self.fragment):
506+
return "".join(attrName), "".join(attrValue)
507+
elif self.fragment[self.position] in (
481508
list(spaceCharacters) + [">", '<']):
482-
self.position -= 1
483-
return "".join(attrName), ""
509+
#XXX this is wrong
510+
#self.position -= 1
511+
return "".join(attrName), "".join(attrValue)
484512
elif self.fragment[self.position] in asciiUppercase:
485513
attrValue.extend(self.fragment[self.position].lower())
486514
else:

0 commit comments

Comments
 (0)