Class: TextUtils::HtmlSanitizer

Inherits:
Processor show all
Defined in:
lib/text_utils/html_sanitizer.rb

Constant Summary collapse

# Base Sanitize configuration: the elements that may appear in sanitized
# output, the attributes allowed on them, and the URL protocols accepted in
# URL-bearing attributes.
RELAXED =
{
  # Element names that are kept.
  elements: [
    'a', 'b', 'blockquote', 'br', 'caption', 'cite', 'code', 'col',
    'colgroup', 'dd', 'dl', 'dt', 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'i', 'img', 'li', 'ol', 'p', 'pre', 'q', 'small', 'strike', 'strong',
    'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'u',
    'ul', 'div', 'font', 'span', 'iframe'],

  # Attributes kept per element; the :all entry applies to every element.
  attributes: {
    :all         => ['class', 'style'],
    'a'          => ['href', 'title', 'rel'],
    'blockquote' => ['cite'],
    'col'        => ['span', 'width'],
    'colgroup'   => ['span', 'width'],
    'img'        => ['align', 'alt', 'height', 'src', 'title', 'width'],
    'ol'         => ['start', 'type'],
    'q'          => ['cite'],
    'table'      => ['summary', 'width'],
    'td'         => ['abbr', 'axis', 'colspan', 'rowspan', 'width'],
    'th'         => ['abbr', 'axis', 'colspan', 'rowspan', 'scope', 'width'],
    'ul'         => ['type'],
    'code'       => ['lang', 'language'],

    'iframe'     => ['height', 'scrolling', 'src', 'width']
  },

  # Protocols accepted in URL attributes. Every element that is allowed to
  # carry a URL needs an entry here; an attribute without one is not
  # protocol-checked at all.
  protocols: {
    'a'          => {'href' => ['ftp', 'http', 'https', 'mailto', :relative]},
    'blockquote' => {'cite' => ['http', 'https', :relative]},
    # Without this entry an <iframe src="javascript:..."> or a data: URL
    # would survive sanitization.
    'iframe'     => {'src'  => ['http', 'https', :relative]},
    'img'        => {'src'  => ['http', 'https', :relative]},
    'q'          => {'cite' => ['http', 'https', :relative]}
  }
}
# Patterns that a video URL must match to be accepted as an embeddable video.
# Each pattern is anchored with \A (start of string) rather than ^ (start of
# any line), so a multi-line value cannot hide a different scheme on its
# first line. Both http and https YouTube "/v/" URLs are accepted.
VIDEO_URLS =
[
  %r{\Ahttps?://(?:www\.)?youtube\.com/v/},
]
# Sanitize transformer that lets <object>/<embed>/<param> video embeds through
# when their URL matches one of VIDEO_URLS.
#
# Takes the transformer environment hash and reads env[:node]. Returns nil
# when the node is not part of an accepted embed; otherwise returns a hash
# asking Sanitize to whitelist the node and its <object> parent.
EMBEDDED_VIDEO =
lambda do |env|
  element   = env[:node]
  tag       = element.name.to_s.downcase
  container = element.parent

  # Only <param> and <embed> nodes sitting directly inside an <object> are
  # of interest; everything else is left to the regular configuration.
  return nil unless %w[param embed].include?(tag)
  return nil unless container.name.to_s.downcase == 'object'

  video_url =
    if tag == 'embed'
      # An <embed> carries the video URL in its own "src" attribute.
      element['src']
    else
      # For a <param>, the URL lives in the sibling <param name="movie">.
      movie_param = container.search('param[@name="movie"]')[0]
      return nil unless movie_param

      movie_param['value']
    end

  # Reject anything whose URL is not a recognised video URL.
  return nil if VIDEO_URLS.none? { |pattern| video_url =~ pattern }

  # The URL is acceptable, but the embed markup itself may still contain
  # elements or attributes that do not belong in a video embed, so clean
  # the whole <object> subtree with a dedicated, narrow configuration.
  Sanitize.clean_node!(container, {
    :elements   => ['embed', 'object', 'param'],
    :attributes => {
      'embed'  => ['allowfullscreen', 'allowscriptaccess', 'height', 'src', 'type', 'width'],
      'object' => ['height', 'width'],
      'param'  => ['name', 'value']
    }
  })

  # Tell Sanitize to whitelist the current node (<param> or <embed>) and its
  # <object> parent.
  { :whitelist_nodes => [element, container] }
end

Instance Method Summary collapse

Methods inherited from Processor

#initialize

Constructor Details

This class inherits a constructor from TextUtils::Processor

Instance Method Details

#call(data, env) ⇒ Object



81
82
83
84
85
86
87
88
89
90
# File 'lib/text_utils/html_sanitizer.rb', line 81

# Sanitizes the HTML produced by the rest of the chain.
#
# Hands +data+ and +env+ to +call_next+ (not shown here; presumably the next
# processor in the chain) and passes the result through Sanitize using the
# RELAXED configuration extended with the EMBEDDED_VIDEO transformer.
#
# Returns whatever Sanitize.clean returns for that input.
def call(data, env)
  upstream = call_next(data, env)

  config = RELAXED.merge(
    :transformers   => [EMBEDDED_VIDEO],
    :add_attributes => { :all => [:class] }
  )

  Sanitize.clean(upstream, config)
end