Skip to content

Commit 0fb5b14

Browse files
committed
More character encoding detection work in progress - now works sometimes (not yet enabled)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40504
1 parent 67f0b64 commit 0fb5b14

File tree

1 file changed

+118
-112
lines changed

1 file changed

+118
-112
lines changed

src/inputstream.py

Lines changed: 118 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ def __init__(self, source, encoding=None):
3333
self.rawStream = self.openStream(source)
3434

3535
# Encoding Information
36+
#Number of bytes to use when looking for a meta element with
37+
#encoding information
38+
self.numBytesMeta = 512
3639
#Encoding to use if no other information can be found
3740
self.defaultEncoding = "cp1252"
3841
#Detect encoding iff no explicit "transport level" encoding is supplied
@@ -125,10 +128,11 @@ def detectBOM(self):
125128

126129
return encoding
127130

128-
def detectEncodingMeta(self, encoding):
131+
def detectEncodingMeta(self):
129132
"""Report the encoding declared by the meta element
130133
"""
131-
parser = MetaParser(self.rawStream.read(self.numBytesMeta))
134+
parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
135+
self.rawStream.seek(0)
132136
return parser.getEncoding()
133137

134138
def determineNewLines(self):
@@ -200,9 +204,8 @@ def charsUntil(self, characters, opposite = False):
200204
class EncodingParser(object):
201205
"""Mini parser for detecting character encoding from meta elements"""
202206

203-
def __init__(self, inputStream, string):
207+
def __init__(self, data):
204208
"""data - the data to work on for encoding detection"""
205-
self.inputStream = inputStream
206209
self.data = data
207210
self.position = 0
208211
self.encoding = None
@@ -212,14 +215,13 @@ def getEncoding(self):
212215
("<!--",self.handleComment),
213216
("<meta",self.handleMeta),
214217
("</",self.handlePossibleEndTag),
215-
("<!",self.handleOther)
218+
("<!",self.handleOther),
216219
("<?",self.handleOther),
217-
("<",handlePossibleStartTag))
220+
("<",self.handlePossibleStartTag))
218221
while self.position < len(self.data):
219222
keepParsing = True
220-
for key, method in unparsedData:
221-
if self.matchBytes(key):
222-
self.movePosition(len(key))
223+
for key, method in methodDispatch:
224+
if self.matchBytes(key, lower=True):
223225
keepParsing = method()
224226
break
225227
if not keepParsing:
@@ -236,13 +238,16 @@ def readBytes(self, numBytes):
236238

237239
def movePosition(self, offset):
238240
"""Move offset bytes from the current read position"""
239-
self.positon += offset
241+
self.position += offset
240242

241-
def matchBytes(self, bytes):
243+
def matchBytes(self, bytes, lower=False):
242244
"""Look for a sequence of bytes at the start of a string. If the bytes
243245
are found return True and advance the position to the byte after the
244246
match. Otherwise return False and leave the position alone"""
245-
rv = self.data[self.position:].startswith(bytes)
247+
data = self.data[self.position:self.position+len(bytes)]
248+
if lower:
249+
data = data.lower()
250+
rv = data.startswith(bytes)
246251
if rv == True:
247252
self.movePosition(len(bytes))
248253
return rv
@@ -258,13 +263,20 @@ def findBytes(self, bytes):
258263
else:
259264
self.position = len(self.data)
260265
return False
266+
267+
def findNext(self, charList):
268+
"""Move the pointer so it points to the next byte in a set of possible
269+
bytes"""
270+
while (self.position < len(self.data) and
271+
self.data[self.position] not in charList):
272+
self.position += 1
261273

262274
def handleComment(self):
263275
"""Skip over comments"""
264276
return self.findBytes("-->")
265277

266278
def handleMeta(self):
267-
if self.position == len(self.data)-1:
279+
if self.position == len(self.data):
268280
#We have <meta at the end of our sniffing stream
269281
return False
270282
elif self.data[self.position] not in spaceCharacters:
@@ -290,10 +302,10 @@ def handleMeta(self):
290302
return False
291303

292304
def handlePossibleStartTag(self):
293-
return self.handlePossibleTag(self, False)
305+
return self.handlePossibleTag(False)
294306

295307
def handlePossibleEndTag(self):
296-
return self.handlePossibleTag(self, True)
308+
return self.handlePossibleTag(True)
297309

298310
def handlePossibleTag(self, endTag):
299311
if self.readBytes(1) not in asciiLetters:
@@ -306,24 +318,22 @@ def handlePossibleTag(self, endTag):
306318
else:
307319
return
308320

309-
startPosition = position
310-
match = False
311-
for possibleChar in ([str(char) for char in spaceCharacters] +
312-
["<", ">"]):
313-
if self.findBytes(possibleChar):
314-
match = True
315-
break
316-
else:
317-
self.position = startPosition
318-
if not match:
319-
#If no match is found set the position to the end of the data
320-
self.position = len(self.data)
321+
322+
possibleChar =([str(char) for char in spaceCharacters] +
323+
["<", ">"])
324+
self.findNext(possibleChar)
325+
if self.position == len(self.data):
326+
#If no match is found abort processing
321327
return False
328+
elif self.data[self.position] == "<":
329+
#return to the first step in the overall "two step" algorithm
330+
self.position -= 1
331+
return True
322332
else:
323333
#Read all attributes
324-
self.getAttribute()
334+
attr = self.getAttribute()
325335
while attr is not None:
326-
self.getAttribute()
336+
attr = self.getAttribute()
327337
return True
328338

329339
def handleOther(self):
@@ -337,14 +347,13 @@ def getAttribute(self):
337347
self.position += attrParser.position
338348
return attr
339349

340-
def isValidEncodinfEncoding(self, encoding):
350+
def isValidEncoding(self, encoding):
341351
"""Determine if encoding is a valid encoding and, if it is, set it
342352
as the encoding on the inputstream"""
343-
#XXX to do
344353
try:
345354
codecs.lookup(encoding)
346355
rv = True
347-
except codecs.LookupError:
356+
except codecs.lookup_error:
348357
rv = False
349358
return rv
350359

@@ -359,34 +368,30 @@ def parse(self):
359368
raise NotImplementedError
360369

361370
def skip(self, chars=spaceCharacters):
362-
while self.fragment[self.position] in chars:
371+
while (self.position < len(self.fragment)
372+
and self.fragment[self.position] in chars):
363373
self.position += 1
364374

365375
def startsWith(self, value):
366376
return self.fragment[self.position:].startswith(value)
367377

368-
def findBytes(self, bytes):
369-
"""Look for the next sequence of bytes matching a given sequence. If
370-
a match is found advance the position to the last byte of the match or
371-
to the end of the string"""
372-
newPosition = self.fragment[self.position:].find(bytes)
373-
if newPosition > -1:
374-
self.position += (newPosition + len(bytes)-1)
375-
return True
376-
else:
377-
self.position = len(self.data)
378-
return False
379-
378+
def findNext(self, charList):
379+
"""Move the pointer so it points to the next byte in a set of possible
380+
bytes"""
381+
while (self.position < len(self.fragment) and
382+
self.fragment[self.position] not in charList):
383+
self.position += 1
384+
380385
class ContentAttrParser(FragmentParser):
381386
def parse(self):
382387
#Skip to the first ";"
383388
parts = self.fragment.split(";")
384389
if len(parts) > 1:
385-
self.value = parts[1]
386-
self.skipWhitespace()
390+
self.fragment = parts[1]
391+
self.skip()
387392
#Check if the attr name is charset
388393
#otherwise return
389-
if self.startsWith("charset"):
394+
if not self.startsWith("charset"):
390395
return None
391396
self.position += len("charset")
392397
self.skip()
@@ -396,8 +401,8 @@ def parse(self):
396401
self.position += 1
397402
self.skip()
398403
#Look for an encoding between matching quote marks
399-
if value[position] in ('"', "'"):
400-
quoteMark = value[position]
404+
if self.fragment[self.position] in ('"', "'"):
405+
quoteMark = self.fragment[self.position]
401406
self.position += 1
402407
oldPosition = self.positon
403408
endQuotePosition = selfBytes(quoteMark)
@@ -409,73 +414,74 @@ def parse(self):
409414
return None
410415
else:
411416
#Unquoted value
412-
for char in spaceCharacters:
413-
oldPosition = self.position
414-
self.findByte(char)
415-
if self.position > -1:
416-
return value[position:position+spacePosition]
417-
else:
418-
self.position = oldPosition
419-
#Return the whole remaining value
420-
return value[position:]
421-
422-
class AttrParser(FragmentParser):
423-
def parse(self):
424-
self.skip(list(spaceCharacters)+["/"])
425-
if self.value[self.position] == "<":
426-
self.position -= 1
427-
return None
428-
elif self.value[self.position] == "<":
429-
return None
430-
attrName = []
431-
attrValue = []
432-
spaceFound = False
433-
while True:
434-
if self.fragment[self.position] == "=" and attrName:
435-
break
436-
elif self.fragment[self.position] in spaceCharacters:
437-
spaceFound=True
438-
break
439-
elif self.fragment[self.position] in ("/", "<", ">"):
440-
self.position -= 1
441-
return "".join(attrName), ""
442-
elif self.fragment[self.position] in asciiUppercase:
443-
attrName.extend(self.fragment[self.position].lower())
417+
startPosition = self.position
418+
self.findNext(spaceCharacters)
419+
if self.position != len(self.fragment):
420+
return self.fragment[startPosition:self.position]
444421
else:
445-
attrName.extend(self.fragment[self.position])
446-
self.position += 1
447-
if spaceFound:
448-
self.skip()
449-
if self.fragment[self.position] != "=":
450-
self.position -= 1
451-
return "".join(attrName), ""
452-
self.position += 1
453-
self.skip()
454-
if self.fragment[self.position] in ("'", '"'):
455-
quoteChar = self.fragment[self.position]
456-
self.position += 1
457-
while True:
458-
if self.fragment[self.position] == quoteChar:
459-
return "".join(attrName), "".join(attrValue)
460-
elif self.fragment[self.position] in asciiUppercase:
461-
attrName.extend(self.fragment[self.position].lower())
462-
else:
463-
attrName.extend(self.fragment[self.position])
464-
elif self.fragment[self.position] in (">", '<'):
465-
self.position -= 1
466-
return "".join(attrName), ""
422+
#Return the whole remaining value
423+
return self.fragment[startPosition:]
424+
425+
426+
class AttrParser(FragmentParser):
427+
def parse(self):
428+
self.skip(list(spaceCharacters)+["/"])
429+
if self.fragment[self.position] == "<":
430+
self.position -= 1
431+
return None
432+
elif self.fragment[self.position] == ">":
433+
return None
434+
attrName = []
435+
attrValue = []
436+
spaceFound = False
437+
while True:
438+
if self.fragment[self.position] == "=" and attrName:
439+
break
440+
elif self.fragment[self.position] in spaceCharacters:
441+
spaceFound=True
442+
break
443+
elif self.fragment[self.position] in ("/", "<", ">"):
444+
self.position -= 1
445+
return "".join(attrName), ""
467446
elif self.fragment[self.position] in asciiUppercase:
468447
attrName.extend(self.fragment[self.position].lower())
469448
else:
470449
attrName.extend(self.fragment[self.position])
471-
#XXX I think this next bit is right but there is a bug in the spec
450+
self.position += 1
451+
if spaceFound:
452+
self.skip()
453+
if self.fragment[self.position] != "=":
454+
self.position -= 1
455+
return "".join(attrName), ""
456+
#XXX need to advance position in both spaces and value case
457+
self.position += 1
458+
self.skip()
459+
if self.fragment[self.position] in ("'", '"'):
460+
quoteChar = self.fragment[self.position]
461+
self.position += 1
472462
while True:
473-
self.position +=1
474-
if self.fragment[self.position] in (
475-
list(spaceCharacters).extend([">", '<'])):
476-
self.position -= 1
477-
return "".join(attrName), ""
463+
if self.fragment[self.position] == quoteChar:
464+
return "".join(attrName), "".join(attrValue)
478465
elif self.fragment[self.position] in asciiUppercase:
479-
attrName.extend(self.fragment[self.position].lower())
466+
attrValue.extend(self.fragment[self.position].lower())
480467
else:
481-
attrName.extend(self.fragment[self.position])
468+
attrValue.extend(self.fragment[self.position])
469+
self.position += 1
470+
elif self.fragment[self.position] in (">", '<'):
471+
self.position -= 1
472+
return "".join(attrName), ""
473+
elif self.fragment[self.position] in asciiUppercase:
474+
attrValue.extend(self.fragment[self.position].lower())
475+
else:
476+
attrValue.extend(self.fragment[self.position])
477+
#XXX I think this next bit is right but there is a bug in the spec
478+
while True:
479+
self.position +=1
480+
if self.fragment[self.position] in (
481+
list(spaceCharacters) + [">", '<']):
482+
self.position -= 1
483+
return "".join(attrName), ""
484+
elif self.fragment[self.position] in asciiUppercase:
485+
attrValue.extend(self.fragment[self.position].lower())
486+
else:
487+
attrValue.extend(self.fragment[self.position])

0 commit comments

Comments
 (0)