@@ -355,7 +355,7 @@ public function xss_clean($str, $is_image = FALSE)
355
355
$ words = array (
356
356
'javascript ' , 'expression ' , 'vbscript ' , 'jscript ' , 'wscript ' ,
357
357
'vbs ' , 'script ' , 'base64 ' , 'applet ' , 'alert ' , 'document ' ,
358
- 'write ' , 'cookie ' , 'window ' , 'confirm ' , 'prompt '
358
+ 'write ' , 'cookie ' , 'window ' , 'confirm ' , 'prompt ' , ' eval '
359
359
);
360
360
361
361
foreach ($ words as $ word )
@@ -399,12 +399,8 @@ public function xss_clean($str, $is_image = FALSE)
399
399
}
400
400
}
401
401
while ($ original !== $ str );
402
-
403
402
unset($ original );
404
403
405
- // Remove evil attributes such as style, onclick and xmlns
406
- $ str = $ this ->_remove_evil_attributes ($ str , $ is_image );
407
-
408
404
/*
409
405
* Sanitize naughty HTML elements
410
406
*
@@ -414,8 +410,29 @@ public function xss_clean($str, $is_image = FALSE)
414
410
* So this: <blink>
415
411
* Becomes: &lt;blink&gt;
416
412
*/
417
- $ naughty = 'alert|prompt|confirm|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|button|select|isindex|layer|link|meta|keygen|object|plaintext|style|script|textarea|title|math|video|svg|xml|xss ' ;
418
- $ str = preg_replace_callback ('#<(/*\s*)( ' .$ naughty .')([^><]*)([><]*)#is ' , array ($ this , '_sanitize_naughty_html ' ), $ str );
413
+ $ pattern = '# '
414
+ .'<((?<slash>/*\s*)(?<tagName>[a-z0-9]+)(?=[^a-z0-9]|$) ' // tag start and name, followed by a non-tag character
415
+ .'[^\s\042\047a-z0-9>/=]* ' // a valid attribute character immediately after the tag would count as a separator
416
+ // optional attributes
417
+ .'(?<attributes>(?:[\s\042\047/=]* ' // non-attribute characters, excluding > (tag close) for obvious reasons
418
+ .'[^\s\042\047>/=]+ ' // attribute characters
419
+ // optional attribute-value
420
+ .'(?:\s*= ' // attribute-value separator
421
+ .'(?:[^\s\042\047=><`]+|\s*\042[^\042]*\042|\s*\047[^\047]*\047|\s*(?U:[^\s\042\047=><`]*)) ' // single, double or non-quoted value
422
+ .')? ' // end optional attribute-value group
423
+ .')*) ' // end optional attributes group
424
+ .'[^>]*)(?<closeTag>\>)?#isS ' ;
425
+
426
+ // Note: It would be nice to optimize this for speed, BUT
427
+ // only matching the naughty elements here results in
428
+ // false positives and in turn - vulnerabilities!
429
+ do
430
+ {
431
+ $ old_str = $ str ;
432
+ $ str = preg_replace_callback ($ pattern , array ($ this , '_sanitize_naughty_html ' ), $ str );
433
+ }
434
+ while ($ old_str !== $ str );
435
+ unset($ old_str );
419
436
420
437
/*
421
438
* Sanitize naughty scripting elements
@@ -626,82 +643,80 @@ protected function _compact_exploded_words($matches)
626
643
627
644
// --------------------------------------------------------------------
628
645
629
- /*
630
- * Remove Evil HTML Attributes (like evenhandlers and style)
646
+ /**
647
+ * Sanitize Naughty HTML
631
648
*
632
- * It removes the evil attribute and either:
633
- * - Everything up until a space
634
- * For example, everything between the pipes:
635
- * <a |style=document.write('hello');alert('world');| class=link>
636
- * - Everything inside the quotes
637
- * For example, everything between the pipes:
638
- * <a |style="document.write('hello'); alert('world');"| class="link">
649
+ * Callback function for xss_clean() to remove naughty HTML elements
639
650
*
640
- * @param string $str The string to check
641
- * @param boolean $is_image TRUE if this is an image
642
- * @return string The string with the evil attributes removed
651
+ * @param array
652
+ * @return string
643
653
*/
644
- protected function _remove_evil_attributes ( $ str , $ is_image )
654
+ protected function _sanitize_naughty_html ( $ matches )
645
655
{
646
- // All javascript event handlers (e.g. onload, onclick, onmouseover), style, and xmlns
647
- $ evil_attributes = array ('on\w* ' , 'style ' , 'xmlns ' , 'formaction ' , 'form ' , 'xlink:href ' , 'FSCommand ' , 'seekSegmentTime ' );
656
+ static $ naughty_tags = array (
657
+ 'alert ' , 'prompt ' , 'confirm ' , 'applet ' , 'audio ' , 'basefont ' , 'base ' , 'behavior ' , 'bgsound ' ,
658
+ 'blink ' , 'body ' , 'embed ' , 'expression ' , 'form ' , 'frameset ' , 'frame ' , 'head ' , 'html ' , 'ilayer ' ,
659
+ 'iframe ' , 'input ' , 'button ' , 'select ' , 'isindex ' , 'layer ' , 'link ' , 'meta ' , 'keygen ' , 'object ' ,
660
+ 'plaintext ' , 'style ' , 'script ' , 'textarea ' , 'title ' , 'math ' , 'video ' , 'svg ' , 'xml ' , 'xss '
661
+ );
648
662
649
- if ($ is_image === TRUE )
663
+ static $ evil_attributes = array (
664
+ 'on\w+ ' , 'style ' , 'xmlns ' , 'formaction ' , 'form ' , 'xlink:href ' , 'FSCommand ' , 'seekSegmentTime '
665
+ );
666
+
667
+ // First, escape unclosed tags
668
+ if (empty ($ matches ['closeTag ' ]))
650
669
{
651
- /*
652
- * Adobe Photoshop puts XML metadata into JFIF images,
653
- * including namespacing, so we have to allow this for images.
654
- */
655
- unset($ evil_attributes [array_search ('xmlns ' , $ evil_attributes )]);
670
+ return '< ' .$ matches [1 ];
656
671
}
657
-
658
- do {
659
- $ count = 0 ;
660
- $ attribs = array () ;
661
-
662
- // find occurrences of illegal attribute strings with quotes (042 and 047 are octal quotes)
663
- preg_match_all ( ' /(?<!\w)( ' . implode ( ' | ' , $ evil_attributes ). ' )\s*=\s*(\042|\047)([^ \\ 2]*?)( \\ 2)/is ' , $ str , $ matches , PREG_SET_ORDER );
664
-
665
- foreach ( $ matches as $ attr )
666
- {
667
- $ attribs [] = preg_quote ( $ attr [ 0 ], ' / ' );
668
- }
669
-
670
- // find occurrences of illegal attribute strings without quotes
671
- preg_match_all ( ' /(?<!\w)( ' . implode ( ' | ' , $ evil_attributes ). ' )\s*=\s*([^\s>]*)/is ' , $ str , $ matches , PREG_SET_ORDER ) ;
672
-
673
- foreach ($ matches as $ attr )
672
+ // Is the element that we caught naughty? If so, escape it
673
+ elseif ( in_array ( strtolower ( $ matches [ ' tagName ' ]), $ naughty_tags , TRUE ))
674
+ {
675
+ return ' < ' . $ matches [ 1 ]. ' > ' ;
676
+ }
677
+ // For other tags, see if their attributes are "evil" and strip those
678
+ elseif ( isset ( $ matches [ ' attributes ' ]))
679
+ {
680
+ // We'll need to catch all attributes separately first
681
+ $ pattern = ' # '
682
+ . ' ([\s\042\047/=]*) ' // non-attribute characters, excluding > (tag close) for obvious reasons
683
+ . ' (?<name>[^\s\042\047>/=]+) ' // attribute characters
684
+ // optional attribute-value
685
+ . ' (?:\s*=(?<value>[^\s\042\047=><`]+|\s*\042[^\042]*\042|\s*\047[^\047]*\047|\s*(?U:[^\s\042\047=><`]*))) ' // attribute-value separator
686
+ . ' #i ' ;
687
+
688
+ if ($ count = preg_match_all ( $ pattern , $ matches [ ' attributes ' ], $ attributes , PREG_SET_ORDER | PREG_OFFSET_CAPTURE ) )
674
689
{
675
- $ attribs [] = preg_quote ($ attr [0 ], '/ ' );
676
- }
690
+ // Since we'll be using substr_replace() below, we
691
+ // need to handle the attributes in reverse order,
692
+ // so we don't damage the string.
693
+ for ($ i = $ count - 1 ; $ i > -1 ; $ i --)
694
+ {
695
+ if (
696
+ // Is it indeed an "evil" attribute?
697
+ preg_match ('#^( ' .implode ('| ' , $ evil_attributes ).')$#i ' , $ attributes [$ i ]['name ' ][0 ])
698
+ // Or an attribute not starting with a letter? Some parsers get confused by that
699
+ OR ! ctype_alpha ($ attributes [$ i ]['name ' ][0 ][0 ])
700
+ // Does it have an equals sign, but no value and not quoted? Strip that too!
701
+ OR (trim ($ attributes [$ i ]['value ' ][0 ]) === '' )
702
+ )
703
+ {
704
+ $ matches ['attributes ' ] = substr_replace (
705
+ $ matches ['attributes ' ],
706
+ ' [removed] ' ,
707
+ $ attributes [$ i ][0 ][1 ],
708
+ strlen ($ attributes [$ i ][0 ][0 ])
709
+ );
710
+ }
711
+ }
677
712
678
- // replace illegal attribute strings that are inside an html tag
679
- if (count ($ attribs ) > 0 )
680
- {
681
- $ str = preg_replace ('/(<?)(\/?[^><]+?)([^A-Za-z<>\-])(.*?)( ' .implode ('| ' , $ attribs ).')(.*?)([\s><]?)([><]*)/i ' , '$1$2 $4$6$7$8 ' , $ str , -1 , $ count );
713
+ // Note: This will strip some non-space characters and/or
714
+ // reduce multiple spaces between attributes.
715
+ return '< ' .$ matches ['slash ' ].$ matches ['tagName ' ].' ' .trim ($ matches ['attributes ' ]).'> ' ;
682
716
}
683
-
684
717
}
685
- while ($ count );
686
-
687
- return $ str ;
688
- }
689
718
690
- // --------------------------------------------------------------------
691
-
692
- /**
693
- * Sanitize Naughty HTML
694
- *
695
- * Callback function for xss_clean() to remove naughty HTML elements
696
- *
697
- * @param array
698
- * @return string
699
- */
700
- protected function _sanitize_naughty_html ($ matches )
701
- {
702
- return '< ' .$ matches [1 ].$ matches [2 ].$ matches [3 ] // encode opening brace
703
- // encode captured opening or closing brace to prevent recursive vectors:
704
- .str_replace (array ('> ' , '< ' ), array ('> ' , '< ' ), $ matches [4 ]);
719
+ return $ matches [0 ];
705
720
}
706
721
707
722
// --------------------------------------------------------------------
@@ -724,7 +739,7 @@ protected function _js_link_removal($match)
724
739
preg_replace (
725
740
'#href=.*?(?:(?:alert|prompt|confirm)(?:\(|&\#40;)|javascript:|livescript:|mocha:|charset=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si ' ,
726
741
'' ,
727
- $ this ->_filter_attributes (str_replace ( array ( ' < ' , ' > ' ), '' , $ match [1 ]) )
742
+ $ this ->_filter_attributes ($ match [1 ])
728
743
),
729
744
$ match [0 ]
730
745
);
@@ -748,9 +763,9 @@ protected function _js_img_removal($match)
748
763
return str_replace (
749
764
$ match [1 ],
750
765
preg_replace (
751
- '#src=.*?(?:(?:alert|prompt|confirm)(?:\(|&\#40;)|javascript:|livescript:|mocha:|charset=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si ' ,
766
+ '#src=.*?(?:(?:alert|prompt|confirm|eval )(?:\(|&\#40;)|javascript:|livescript:|mocha:|charset=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si ' ,
752
767
'' ,
753
- $ this ->_filter_attributes (str_replace ( array ( ' < ' , ' > ' ), '' , $ match [1 ]) )
768
+ $ this ->_filter_attributes ($ match [1 ])
754
769
),
755
770
$ match [0 ]
756
771
);
0 commit comments