@@ -33,6 +33,9 @@ def __init__(self, source, encoding=None):
33
33
self .rawStream = self .openStream (source )
34
34
35
35
# Encoding Information
36
+ #Number of bytes to use when looking for a meta element with
37
+ #encoding information
38
+ self .numBytesMeta = 512
36
39
#Encoding to use if no other information can be found
37
40
self .defaultEncoding = "cp1252"
38
41
#Detect encoding iff no explicit "transport level" encoding is supplied
@@ -125,10 +128,11 @@ def detectBOM(self):
125
128
126
129
return encoding
127
130
128
def detectEncodingMeta(self):
    """Report the encoding declared by the meta element.

    Reads up to ``self.numBytesMeta`` bytes from ``self.rawStream``, wraps
    them in an ``EncodingParser`` and then seeks the raw stream back to
    offset 0 so the sniffed bytes can be read again.

    Returns whatever ``EncodingParser.getEncoding()`` returns.
    """
    sniffed = self.rawStream.read(self.numBytesMeta)
    parser = EncodingParser(sniffed)
    # The bytes above were only peeked at; rewind before anyone else reads.
    self.rawStream.seek(0)
    return parser.getEncoding()
133
137
134
138
def determineNewLines (self ):
@@ -200,9 +204,8 @@ def charsUntil(self, characters, opposite = False):
200
204
class EncodingParser (object ):
201
205
"""Mini parser for detecting character encoding from meta elements"""
202
206
203
- def __init__ (self , inputStream , string ):
207
+ def __init__ (self , data ):
204
208
"""string - the data to work on for encoding detection"""
205
- self .inputStream = inputStream
206
209
self .data = data
207
210
self .position = 0
208
211
self .encoding = None
@@ -212,14 +215,13 @@ def getEncoding(self):
212
215
("<!--" ,self .handleComment ),
213
216
("<meta" ,self .handleMeta ),
214
217
("</" ,self .handlePossibleEndTag ),
215
- ("<!" ,self .handleOther )
218
+ ("<!" ,self .handleOther ),
216
219
("<?" ,self .handleOther ),
217
- ("<" ,handlePossibleStartTag ))
220
+ ("<" ,self . handlePossibleStartTag ))
218
221
while self .position < len (self .data ):
219
222
keepParsing = True
220
- for key , method in unparsedData :
221
- if self .matchBytes (key ):
222
- self .movePosition (len (key ))
223
+ for key , method in methodDispatch :
224
+ if self .matchBytes (key , lower = True ):
223
225
keepParsing = method ()
224
226
break
225
227
if not keepParsing :
@@ -236,13 +238,16 @@ def readBytes(self, numBytes):
236
238
237
239
def movePosition(self, offset):
    """Move offset bytes from the current read position.

    ``offset`` is added to ``self.position`` as-is; no bounds checking is
    done here, and a negative offset moves backwards.
    """
    self.position += offset
240
242
241
def matchBytes(self, bytes, lower=False):
    """Test whether the data at the current position starts with ``bytes``.

    Only the next ``len(bytes)`` items of ``self.data`` are examined. When
    ``lower`` is true that slice is lowercased before comparing; ``bytes``
    itself is not altered, so callers must pass it already lowercased.

    On a match the position is advanced (via ``movePosition``) to the item
    after the match and True is returned. Otherwise False is returned and
    the position is left alone.
    """
    candidate = self.data[self.position:self.position + len(bytes)]
    if lower:
        candidate = candidate.lower()
    matched = candidate.startswith(bytes)
    if matched:
        self.movePosition(len(bytes))
    return matched
@@ -258,13 +263,20 @@ def findBytes(self, bytes):
258
263
else :
259
264
self .position = len (self .data )
260
265
return False
266
+
267
def findNext(self, charList):
    """Move the pointer so it points to the next byte in a set of possible
    bytes.

    Scans ``self.data`` forward from ``self.position``. The position stops
    on the first item that is in ``charList``; if there is none it ends up
    equal to ``len(self.data)``. Nothing is returned.
    """
    data = self.data
    limit = len(data)
    while self.position < limit and data[self.position] not in charList:
        self.position += 1
261
273
262
274
def handleComment(self):
    """Skip over comments.

    Delegates to ``self.findBytes("-->")`` and returns its result, which
    the dispatch loop uses as its keep-parsing flag.
    """
    return self.findBytes("-->")
265
277
266
278
def handleMeta (self ):
267
- if self .position == len (self .data )- 1 :
279
+ if self .position == len (self .data ):
268
280
#We have <meta at the end of our sniffing stream
269
281
return False
270
282
elif self .data [self .position ] not in spaceCharacters :
@@ -290,10 +302,10 @@ def handleMeta(self):
290
302
return False
291
303
292
304
def handlePossibleStartTag(self):
    """Handle a "<" that may open a start tag.

    Thin wrapper: calls ``handlePossibleTag`` with ``endTag=False`` and
    returns its result.
    """
    return self.handlePossibleTag(False)
294
306
295
307
def handlePossibleEndTag(self):
    """Handle a "</" that may open an end tag.

    Thin wrapper: calls ``handlePossibleTag`` with ``endTag=True`` and
    returns its result.
    """
    return self.handlePossibleTag(True)
297
309
298
310
def handlePossibleTag (self , endTag ):
299
311
if self .readBytes (1 ) not in asciiLetters :
@@ -306,24 +318,22 @@ def handlePossibleTag(self, endTag):
306
318
else :
307
319
return
308
320
309
- startPosition = position
310
- match = False
311
- for possibleChar in ([str (char ) for char in spaceCharacters ] +
312
- ["<" , ">" ]):
313
- if self .findBytes (possibleChar ):
314
- match = True
315
- break
316
- else :
317
- self .position = startPosition
318
- if not match :
319
- #If no match is found set the position to the end of the data
320
- self .position = len (self .data )
321
+
322
+ possibleChar = ([str (char ) for char in spaceCharacters ] +
323
+ ["<" , ">" ])
324
+ self .findNext (possibleChar )
325
+ if self .position == len (self .data ):
326
+ #If no match is found abort processing
321
327
return False
328
+ elif self .data [self .position ] == "<" :
329
+ #return to the first step in the overall "two step" algorithm
330
+ self .position -= 1
331
+ return True
322
332
else :
323
333
#Read all attributes
324
- self .getAttribute ()
334
+ attr = self .getAttribute ()
325
335
while attr is not None :
326
- self .getAttribute ()
336
+ attr = self .getAttribute ()
327
337
return True
328
338
329
339
def handleOther (self ):
@@ -337,14 +347,13 @@ def getAttribute(self):
337
347
self .position += attrParser .position
338
348
return attr
339
349
340
def isValidEncoding(self, encoding):
    """Determine if encoding names a codec that Python knows about.

    Returns True when ``codecs.lookup(encoding)`` succeeds and False
    otherwise. This method only checks the name; it does not store the
    encoding anywhere.
    """
    try:
        codecs.lookup(encoding)
    except (LookupError, TypeError, ValueError):
        # LookupError is what codecs.lookup raises for an unknown codec
        # (codecs.lookup_error is a function, not an exception class).
        # TypeError/ValueError cover values that are not usable as a codec
        # name at all, e.g. None; those are reported as "not valid" too.
        return False
    return True
350
359
@@ -359,34 +368,30 @@ def parse(self):
359
368
raise NotImplementedError
360
369
361
370
def skip(self, chars=spaceCharacters):
    """Advance the position past a run of characters found in ``chars``.

    Stops on the first character of ``self.fragment`` that is not in
    ``chars``, or at ``len(self.fragment)`` if the run reaches the end.
    Defaults to skipping ``spaceCharacters``. Nothing is returned.
    """
    fragment = self.fragment
    limit = len(fragment)
    while self.position < limit and fragment[self.position] in chars:
        self.position += 1
364
374
365
375
def startsWith(self, value):
    """Return True if the fragment, from the current position on, starts
    with ``value``. The position is not changed."""
    remaining = self.fragment[self.position:]
    return remaining.startswith(value)
367
377
368
- def findBytes (self , bytes ):
369
- """Look for the next sequence of bytes matching a given sequence. If
370
- a match is found advance the position to the last byte of the match or
371
- to the end of the string"""
372
- newPosition = self .fragment [self .position :].find (bytes )
373
- if newPosition > - 1 :
374
- self .position += (newPosition + len (bytes )- 1 )
375
- return True
376
- else :
377
- self .position = len (self .data )
378
- return False
379
-
378
def findNext(self, charList):
    """Move the pointer so it points to the next byte in a set of possible
    bytes.

    Scans ``self.fragment`` forward from ``self.position``. The position
    stops on the first item that is in ``charList``; if there is none it
    ends up equal to ``len(self.fragment)``. Nothing is returned.
    """
    fragment = self.fragment
    limit = len(fragment)
    while self.position < limit and fragment[self.position] not in charList:
        self.position += 1
384
+
380
385
class ContentAttrParser (FragmentParser ):
381
386
def parse (self ):
382
387
#Skip to the first ";"
383
388
parts = self .fragment .split (";" )
384
389
if len (parts ) > 1 :
385
- self .value = parts [1 ]
386
- self .skipWhitespace ()
390
+ self .fragment = parts [1 ]
391
+ self .skip ()
387
392
#Check if the attr name is charset
388
393
#otherwise return
389
- if self .startsWith ("charset" ):
394
+ if not self .startsWith ("charset" ):
390
395
return None
391
396
self .position += len ("charset" )
392
397
self .skip ()
@@ -396,8 +401,8 @@ def parse(self):
396
401
self .position += 1
397
402
self .skip ()
398
403
#Look for an encoding between matching quote marks
399
- if value [ position ] in ('"' , "'" ):
400
- quoteMark = value [ position ]
404
+ if self . fragment [ self . position ] in ('"' , "'" ):
405
+ quoteMark = self . fragment [ self . position ]
401
406
self .position += 1
402
407
oldPosition = self .positon
403
408
endQuotePosition = selfBytes (quoteMark )
@@ -409,73 +414,74 @@ def parse(self):
409
414
return None
410
415
else :
411
416
#Unquoted value
412
- for char in spaceCharacters :
413
- oldPosition = self .position
414
- self .findByte (char )
415
- if self .position > - 1 :
416
- return value [position :position + spacePosition ]
417
- else :
418
- self .position = oldPosition
419
- #Return the whole remaining value
420
- return value [position :]
421
-
422
- class AttrParser (FragmentParser ):
423
- def parse (self ):
424
- self .skip (list (spaceCharacters )+ ["/" ])
425
- if self .value [self .position ] == "<" :
426
- self .position -= 1
427
- return None
428
- elif self .value [self .position ] == "<" :
429
- return None
430
- attrName = []
431
- attrValue = []
432
- spaceFound = False
433
- while True :
434
- if self .fragment [self .position ] == "=" and attrName :
435
- break
436
- elif self .fragment [self .position ] in spaceCharacters :
437
- spaceFound = True
438
- break
439
- elif self .fragment [self .position ] in ("/" , "<" , ">" ):
440
- self .position -= 1
441
- return "" .join (attrName ), ""
442
- elif self .fragment [self .position ] in asciiUppercase :
443
- attrName .extend (self .fragment [self .position ].lower ())
417
+ startPosition = self .position
418
+ self .findNext (spaceCharacters )
419
+ if self .position != len (self .fragment ):
420
+ return self .fragment [startPosition :self .position ]
444
421
else :
445
- attrName .extend (self .fragment [self .position ])
446
- self .position += 1
447
- if spaceFound :
448
- self .skip ()
449
- if self .fragment [self .position ] != "=" :
450
- self .position -= 1
451
- return "" .join (attrName ), ""
452
- self .position += 1
453
- self .skip ()
454
- if self .fragment [self .position ] in ("'" , '"' ):
455
- quoteChar = self .fragment [self .position ]
456
- self .position += 1
457
- while True :
458
- if self .fragment [self .position ] == quoteChar :
459
- return "" .join (attrName ), "" .join (attrValue )
460
- elif self .fragment [self .position ] in asciiUppercase :
461
- attrName .extend (self .fragment [self .position ].lower ())
462
- else :
463
- attrName .extend (self .fragment [self .position ])
464
- elif self .fragment [self .position ] in (">" , '<' ):
465
- self .position -= 1
466
- return "" .join (attrName ), ""
422
+ #Return the whole remaining value
423
+ return self .fragment [startPosition :]
424
+
425
+
426
class AttrParser(FragmentParser):
    """Fragment parser that reads one attribute from the start of a tag body."""

    def parse(self):
        """Read a single attribute starting at the current position.

        Returns ``None`` when there is no attribute to read (the fragment is
        exhausted, or the next significant character is "<" or ">").
        Otherwise returns a ``(name, value)`` tuple of strings; ASCII
        uppercase letters in both are lowercased, and ``value`` is "" when
        the attribute has no value.

        ``self.position`` is left where scanning stopped. Running off the
        end of the fragment is treated as the end of the attribute instead
        of raising ``IndexError``.

        NOTE(review): several exits step ``self.position`` back by one
        before returning (which can make it -1). That is kept exactly as
        written because the caller that consumes ``position`` is not
        visible here -- confirm it expects this.
        """
        fragment = self.fragment
        end = len(fragment)
        valueTerminators = list(spaceCharacters) + [">", "<"]

        def lowered(char):
            # Only ASCII uppercase is folded; everything else is kept as-is.
            if char in asciiUppercase:
                return char.lower()
            return char

        self.skip(list(spaceCharacters) + ["/"])
        if self.position >= end:
            # Nothing left after the leading whitespace and slashes.
            return None
        if fragment[self.position] == "<":
            self.position -= 1
            return None
        if fragment[self.position] == ">":
            return None

        attrName = []
        attrValue = []
        spaceFound = False

        # --- attribute name ---
        while self.position < end:
            char = fragment[self.position]
            if char == "=" and attrName:
                break
            if char in spaceCharacters:
                spaceFound = True
                break
            if char in ("/", "<", ">"):
                self.position -= 1
                return "".join(attrName), ""
            attrName.append(lowered(char))
            self.position += 1
        else:
            # Fragment ended in the middle of the name.
            return "".join(attrName), ""

        name = "".join(attrName)

        # --- optional whitespace between the name and "=" ---
        if spaceFound:
            self.skip()
            if self.position >= end:
                return name, ""
            if fragment[self.position] != "=":
                self.position -= 1
                return name, ""

        # Step over the "=" (needed for both the spaces and no-spaces case).
        self.position += 1
        self.skip()
        if self.position >= end:
            return name, ""

        # --- attribute value ---
        char = fragment[self.position]
        if char in ("'", '"'):
            quoteChar = char
            self.position += 1
            while self.position < end:
                char = fragment[self.position]
                if char == quoteChar:
                    # Position is left on the closing quote.
                    return name, "".join(attrValue)
                attrValue.append(lowered(char))
                self.position += 1
            # Unterminated quote: hand back what was collected.
            return name, "".join(attrValue)
        if char in (">", "<"):
            self.position -= 1
            return name, ""

        # Unquoted value: runs until whitespace, ">" or "<".
        attrValue.append(lowered(char))
        while True:
            self.position += 1
            if self.position >= end:
                return name, "".join(attrValue)
            char = fragment[self.position]
            if char in valueTerminators:
                self.position -= 1
                # Return the collected value (it used to be dropped here).
                return name, "".join(attrValue)
            attrValue.append(lowered(char))
0 commit comments