Changeset 1603 in subversion


Ignore:
Timestamp:
Jul 22, 2008 4:01:42 AM (5 years ago)
Author:
thomasb
Message:

Improve HTML sanitization with washtml

Location:
trunk/roundcubemail/program
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/roundcubemail/program/include/rcube_message.php

    r1534 r1603  
    5858     
    5959    $this->set_safe((intval($_GET['_safe']) || $_SESSION['safe_messages'][$uid])); 
    60     $this->set_safe(0); 
    6160     
    6261    $this->opt = array( 
  • trunk/roundcubemail/program/lib/washtml.php

    r1544 r1603  
    3434 * SYNOPSIS: 
    3535 * 
    36  * washtml::wash($html, $config, $full); 
     36 * $washer = new washtml($config); 
     37 * $washer->wash($html); 
    3738 * It returns a sanitized string of the $html parameter without html and head tags. 
    3839 * $html is a string containing the html code to wash. 
     
    4344 *   $config['cid_map'] is an array where cid urls index urls to replace them. 
    4445 *   $config['charset'] is a string containing the charset of the HTML document if it is not defined in it. 
    45  * $full is a reference to a boolean that is set to true if no remote images are removed. (FE: show remote images link) 
     46 * $washer->extlinks is a public boolean property that is set to true if remote images were removed. (FE: show remote images link) 
    4647 * 
    4748 * INTERNALS: 
    4849 * 
    49  * Only tags and attributes in the globals $html_elements and $html_attributes 
     50 * Only tags and attributes in the static lists $html_elements and $html_attributes 
    5051 * are kept, inline styles are also filtered: all style identifiers matching 
    5152 * /[a-z\-]/i are allowed. Values matching colors, sizes, /[a-z\-]/i and safe 
     
    7374class washtml 
    7475{ 
    75  
     76  /* Allowed HTML elements (default) */ 
     77  static $html_elements = array('a', 'abbr', 'acronym', 'address', 'area', 'b', 'basefont', 'bdo', 'big', 'blockquote', 'br', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'ins', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'img'); 
     78   
     79  /* Ignore these HTML tags but process their content */ 
     80  static $ignore_elements = array('html', 'body'); 
     81   
     82  /* Allowed HTML attributes */ 
     83  static $html_attribs = array('name', 'class', 'title', 'alt', 'width', 'height', 'align', 'nowrap', 'col', 'row', 'id', 'rowspan', 'colspan', 'cellspacing', 'cellpadding', 'valign', 'bgcolor', 'color', 'border', 'bordercolorlight', 'bordercolordark', 'face', 'marginwidth', 'marginheight', 'axis', 'border', 'abbr', 'char', 'charoff', 'clear', 'compact', 'coords', 'vspace', 'hspace', 'cellborder', 'size', 'lang', 'dir', 'background');   
     84   
     85  /* State for linked objects in HTML */ 
     86  public $extlinks = false; 
     87 
     88  /* Current settings */ 
     89  private $config = array(); 
     90 
     91  /* Registered callback functions for tags */ 
     92  private $handlers = array(); 
     93   
    7694  /* Allowed HTML elements */ 
    77   static $html_elements = array('a', 'abbr', 'acronym', 'address', 'area', 'b', 'basefont', 'bdo', 'big', 'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'ins', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'tt', 'u', 'ul', 'var', 'img'); 
     95  private $_html_elements = array(); 
     96 
     97  /* Ignore these HTML tags but process their content */ 
     98  private $_ignore_elements = array(); 
    7899 
    79100  /* Allowed HTML attributes */ 
    80   static $html_attribs = array('name', 'class', 'title', 'alt', 'width', 'height', 'align', 'nowrap', 'col', 'row', 'id', 'rowspan', 'colspan', 'cellspacing', 'cellpadding', 'valign', 'bgcolor', 'color', 'border', 'bordercolorlight', 'bordercolordark', 'face', 'marginwidth', 'marginheight', 'axis', 'border', 'abbr', 'char', 'charoff', 'clear', 'compact', 'coords', 'vspace', 'hspace', 'cellborder', 'size', 'lang', 'dir', 'background'); 
    81  
     101  private $_html_attribs = array(); 
     102   
     103 
     104  /* Constructor */ 
     105  public function __construct($p = array()) { 
     106    $this->_html_elements = array_flip((array)$p['html_elements']) + array_flip(self::$html_elements) ; 
     107    $this->_html_attribs = array_flip((array)$p['html_attribs']) + array_flip(self::$html_attribs); 
     108    $this->_ignore_elements = array_flip((array)$p['ignore_elements']) + array_flip(self::$ignore_elements); 
     109    unset($p['html_elements'], $p['html_attribs'], $p['ignore_elements']); 
     110    $this->config = $p + array('show_washed'=>true, 'allow_remote'=>false, 'cid_map'=>array()); 
     111  } 
     112   
     113  /* Register a callback function for a certain tag */ 
     114  public function add_callback($tagName, $callback) 
     115  { 
     116    $this->handlers[$tagName] = $callback; 
     117  } 
     118   
    82119  /* Check CSS style */ 
    83   static function wash_style($style, $config, &$full) { 
     120  private function wash_style($style) { 
    84121    $s = ''; 
    85122 
     
    97134          if($match[2]) { 
    98135            if(preg_match('/^(http|https|ftp):.*$/i', $match[2], $url)) { 
    99               if($config['allow_remote']) 
     136              if($this->config['allow_remote']) 
    100137                $value .= ' url(\''.htmlspecialchars($url[0], ENT_QUOTES).'\')'; 
    101138              else 
    102                 $full = false; 
     139                $this->extlinks = true; 
    103140            } else if(preg_match('/^cid:(.*)$/i', $match[2], $cid)) 
    104               $value .= ' url(\''.htmlspecialchars($config['cid_map']['cid:'.$cid[1]], ENT_QUOTES) . '\')'; 
     141              $value .= ' url(\''.htmlspecialchars($this->config['cid_map']['cid:'.$cid[1]], ENT_QUOTES) . '\')'; 
    105142          } else if($match[0] != 'url' && $match[0] != 'rbg')//whitelist ? 
    106143            $value .= ' ' . $match[0]; 
     
    115152 
    116153  /* Take a node and return allowed attributes and check values */ 
    117   static function wash_attribs($node, $config, &$full) { 
     154  private function wash_attribs($node) { 
    118155    $t = ''; 
    119156    $washed; 
     
    122159      $key = strtolower($key); 
    123160      $value = $node->getAttribute($key); 
    124       if((in_array($key, self::$html_attribs)) || 
     161      if(isset($this->_html_attribs[$key]) || 
    125162         ($key == 'href' && preg_match('/^(http|https|ftp|mailto):.*/i', $value))) 
    126163        $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"'; 
    127       else if($key == 'style' && ($style = self::wash_style($value, $config, $full))) 
     164      else if($key == 'style' && ($style = $this->wash_style($value))) 
    128165        $t .= ' style="' . $style . '"'; 
    129166      else if($key == 'src' && strtolower($node->tagName) == 'img') { //check tagName anyway 
    130167        if(preg_match('/^(http|https|ftp):.*/i', $value)) { 
    131           if($config['allow_remote']) 
     168          if($this->config['allow_remote']) 
    132169            $t .= ' ' . $key . '="' . htmlspecialchars($value, ENT_QUOTES) . '"'; 
    133170          else { 
    134             $full = false; 
    135             if ($config['blocked_src']) 
    136               $t .= ' src="' . htmlspecialchars($config['blocked_src'], ENT_QUOTES) . '"'; 
     171            $this->extlinks = true; 
     172            if ($this->config['blocked_src']) 
     173              $t .= ' src="' . htmlspecialchars($this->config['blocked_src'], ENT_QUOTES) . '"'; 
    137174          } 
    138175        } else if(preg_match('/^cid:(.*)$/i', $value, $cid)) 
    139           $t .= ' ' . $key . '="' . htmlspecialchars($config['cid_map']['cid:'.$cid[1]], ENT_QUOTES) . '"'; 
     176          $t .= ' ' . $key . '="' . htmlspecialchars($this->config['cid_map']['cid:'.$cid[1]], ENT_QUOTES) . '"'; 
    140177      } else 
    141178        $washed .= ($washed?' ':'') . $key; 
    142179    } 
    143     return $t . ($washed && $config['show_washed']?' x-washed="'.$washed.'"':''); 
     180    return $t . ($washed && $this->config['show_washed']?' x-washed="'.$washed.'"':''); 
    144181  } 
    145182 
     
    147184   * It output only allowed tags with allowed attributes 
    148185   * and allowed inline styles */ 
    149   static function dumpHtml($node, $config, &$full) { 
     186  private function dumpHtml($node) { 
    150187    if(!$node->hasChildNodes()) 
    151188      return ''; 
     
    158195      case XML_ELEMENT_NODE: //Check element 
    159196        $tagName = strtolower($node->tagName); 
    160         if(in_array($tagName, self::$html_elements)) { 
    161           $content = self::dumpHtml($node, $config, $full); 
    162           $dump .= '<' . $tagName . self::wash_attribs($node, $config, $full) . 
     197        if($callback = $this->handlers[$tagName]) { 
     198          $dump .= call_user_func($callback, $tagName, $this->wash_attribs($node), $this->dumpHtml($node)); 
     199        } else if(isset($this->_html_elements[$tagName])) { 
     200          $content = $this->dumpHtml($node); 
     201          $dump .= '<' . $tagName . $this->wash_attribs($node) . 
    163202            ($content?">$content</$tagName>":' />'); 
    164         } else if($tagName == 'html' || $tagName == 'body') { 
    165           $dump .= self::dumpHtml($node, $config, $full); //Just ignored 
     203        } else if(isset($this->_ignore_elements[$tagName])) { 
     204          $dump .= '<!-- ' . htmlspecialchars($tagName, ENT_QUOTES) . ' ignored -->'; 
     205          $dump .= $this->dumpHtml($node); //Just ignored 
    166206        } else 
    167207          $dump .= '<!-- ' . htmlspecialchars($tagName, ENT_QUOTES) . ' not allowed -->'; 
    168208        break; 
     209      case XML_CDATA_SECTION_NODE: 
     210        $dump .= $node->nodeValue; 
     211        break; 
    169212      case XML_TEXT_NODE: 
    170213        $dump .= htmlspecialchars($node->nodeValue); 
    171214        break; 
    172215      case XML_HTML_DOCUMENT_NODE: 
    173         $dump .= self::dumpHtml($node, $config, $full); 
    174         break; 
    175       case XML_DOCUMENT_TYPE_NODE: break; 
     216        $dump .= $this->dumpHtml($node); 
     217        break; 
     218      case XML_DOCUMENT_TYPE_NODE: 
     219        break; 
    176220      default: 
     221        $dump . '<!-- node type ' . $node->nodeType . ' -->'; 
    177222      } 
    178223    } while($node = $node->nextSibling); 
     
    183228  /* Main function, give it untrusted HTML, tell it if you allow loading 
    184229   * remote images and give it a map to convert "cid:" urls. */ 
    185   static function wash($html, $config=array(), &$full=true) { 
    186     $config += array('show_washed'=>true, 'allow_remote'=>false, 'cid_map'=>array()); 
     230  public function wash($html) { 
    187231    //Charset seems to be ignored (probably if defined in the HTML document) 
    188     $node = new DOMDocument('1.0', $config['charset']); 
    189     $full = true; 
     232    $node = new DOMDocument('1.0', $this->config['charset']); 
     233    $this->extlinks = false; 
    190234    @$node->loadHTML($html); 
    191     return self::dumpHtml($node, $config, $full); 
     235    return $this->dumpHtml($node); 
    192236  } 
    193237 
  • trunk/roundcubemail/program/steps/mail/func.inc

    r1601 r1603  
    540540 * @return string Formatted HTML string 
    541541 */ 
    542 function rcmail_print_body($part, $safe=false, $plain=false) 
     542function rcmail_print_body($part, $p = array()) 
    543543{ 
    544544  global $REMOTE_OBJECTS; 
    545545   
     546  $p += array('safe' => false, 'plain' => false, 'inline_html' => true); 
     547   
    546548  // convert html to text/plain 
    547   if ($part->ctype_secondary == 'html' && $plain) { 
     549  if ($part->ctype_secondary == 'html' && $p['plain']) { 
    548550    $txt = new html2text($part->body, false, true); 
    549551    $body = $txt->get_text(); 
     
    554556    // charset was converted to UTF-8 in rcube_imap::get_message_part() -> change charset specification in HTML accordingly 
    555557    $html = $part->body;  
    556     if(preg_match('/(\s+content=[\'"]\w+\/\w+;\s+charset)=([a-z0-9-]+)/i', $html))  
    557       $html = preg_replace('/(\s+content=[\'"]\w+\/\w+;\s+charset)=([a-z0-9-]+)/i', '\\1='.RCMAIL_CHARSET, $html);  
     558    if (preg_match('/(\s+content=[\'"]\w+\/\w+;\s*charset)=([a-z0-9-]+)/i', $html))  
     559      $html = preg_replace('/(\s+content=[\'"]\w+\/\w+;\s*charset)=([a-z0-9-]+)/i', '\\1='.RCMAIL_CHARSET, $html);  
    558560    else { 
    559561      // add <head> for malformed messages, washtml cannot work without that 
    560       if (!preg_match('/<head>(.*)<\/head>/m', $html)) 
     562      if (!preg_match('/<head>(.*)<\\/head>/Uims', $html)) 
    561563        $html = '<head></head>' . $html; 
    562564      $html = substr_replace($html, '<meta http-equiv="Content-Type" content="text/html; charset='.RCMAIL_CHARSET.'" />', intval(stripos($html, '</head>')), 0); 
    563565    } 
    564  
     566     
    565567    // clean HTML with washhtml by Frederic Motte 
    566     $body = washtml::wash($html, array( 
     568    $wash_opts = array( 
    567569      'show_washed' => false, 
    568       'allow_remote' => $safe, 
     570      'allow_remote' => $p['safe'], 
    569571      'blocked_src' => "./program/blocked.gif", 
    570572      'charset' => RCMAIL_CHARSET, 
    571573      'cid_map' => $part->replaces, 
    572       ), $full_inline); 
    573  
    574     $REMOTE_OBJECTS = !$full_inline; 
     574      'html_elements' => array('body'), 
     575    ); 
     576     
     577    if (!$p['inline_html']) { 
     578      $wash_opts['html_elements'] = array('html','head','title','body'); 
     579    } 
     580     
     581    /* CSS styles need to be sanitized! 
     582    if ($p['safe']) { 
     583      $wash_opts['html_elements'][] = 'style'; 
     584      $wash_opts['html_attribs'] = array('type'); 
     585    } 
     586    */ 
     587     
     588    $washer = new washtml($wash_opts); 
     589    $washer->add_callback('form', 'rcmail_washtml_callback'); 
     590    $body = $washer->wash($html); 
     591    $REMOTE_OBJECTS = $washer->extlinks; 
    575592 
    576593    return $body; 
     
    638655   
    639656  return "<div class=\"pre\">".$body."\n</div>"; 
    640   } 
    641  
    642  
     657} 
    643658 
    644659/** 
     
    646661 */ 
    647662function rcmail_str_replacement($str, &$rep) 
    648   { 
     663{ 
    649664  static $count = 0; 
    650665  $rep[$count] = stripslashes($str); 
    651666  return "##string_replacement{".($count++)."}##"; 
    652   } 
    653  
     667} 
     668 
     669 
     670/** 
     671 * Callback function for washtml cleaning class 
     672 */ 
     673function rcmail_washtml_callback($tagname, $attrib, $content) 
     674{ 
     675  switch ($tagname) { 
     676    case 'form': 
     677      $out = html::div('form', $content); 
     678      break; 
     679       
     680    default: 
     681      $out = ''; 
     682  } 
     683   
     684  return $out; 
     685} 
    654686 
    655687 
     
    757789          $part->body = $MESSAGE->get_part_content($part->mime_id); 
    758790 
    759         $body = rcmail_print_body($part, $safe_mode, !$CONFIG['prefer_html']); 
     791        $body = rcmail_print_body($part, array('safe' => $safe_mode, 'plain' => !$CONFIG['prefer_html'])); 
    760792 
    761793        if ($part->ctype_secondary == 'html') 
  • trunk/roundcubemail/program/steps/mail/get.inc

    r1400 r1603  
    6666      header("Content-Type: application/octet-stream"); 
    6767    } 
     68    else if ($ctype_primary == 'text') 
     69      header("Content-Type: text/$ctype_secondary; charset=" . RCMAIL_CHARSET); 
    6870    else 
    6971      header("Content-Type: $mimetype"); 
     
    9698 
    9799      $OUTPUT = new rcube_html_page(); 
    98       $OUTPUT->write(rcmail_print_body($part, $MESSAGE->is_safe)); 
     100      $OUTPUT->write(rcmail_print_body($part, array('safe' => $MESSAGE->is_safe, 'inline_html' => false))); 
    99101    } 
    100102    else { 
Note: See TracChangeset for help on using the changeset viewer.