Class: SiteDiff::Sanitizer

Inherits:
Object
  • Object
show all
Defined in:
lib/sitediff/sanitize.rb,
lib/sitediff/sanitize/regexp.rb,
lib/sitediff/sanitize/dom_transform.rb

Overview

SiteDiff Sanitizer.

Defined Under Namespace

Classes: DomTransform, InvalidSanitization, Regexp

Constant Summary collapse

# Names of sanitization settings, grouped under the :array and :scalar keys.
# NOTE(review): presumably :array settings hold a list of rules and :scalar
# settings hold a single value -- confirm against the code that reads TOOLS.
TOOLS =
{
  array: %w[dom_transform sanitization],
  scalar: %w[selector remove_spacing ignore_whitespace]
}.freeze
# Set of DOM transform names.
# NOTE(review): presumably the transform types DomTransform accepts -- confirm
# in lib/sitediff/sanitize/dom_transform.rb.
DOM_TRANSFORMS =
Set.new(%w[remove strip unwrap_root unwrap remove_class])

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html, config, opts = {}) ⇒ Sanitizer

Creates a Sanitizer.



23
24
25
26
27
# File 'lib/sitediff/sanitize.rb', line 23

# Creates a Sanitizer. Nothing is parsed here; the arguments are only stored.
#
# @param html the markup to sanitize (compared against '' and parsed in #sanitize)
# @param config the sanitization settings, read by string key
# @param opts extra options; :path and :output are read by other methods
def initialize(html, config, opts = {})
  @opts = opts
  @config = config
  @html = html
end

Class Method Details

.domify(str, force_doc: false) ⇒ Object

Parse HTML into a node



213
214
215
216
217
218
219
# File 'lib/sitediff/sanitize.rb', line 213

# Parse HTML into a Nokogiri node.
#
# The input is parsed as a full document when +force_doc+ is set or when a
# DOCTYPE declaration appears within the first 512 characters; otherwise it
# is parsed as a fragment.
#
# The DOCTYPE check is case-insensitive because HTML allows the keyword in
# any letter case (for example the common "<!doctype html>").
#
# @param str [String] the markup to parse
# @param force_doc [Boolean] always parse as a full document when true
# @return the result of Nokogiri::HTML(str) or Nokogiri::HTML.fragment(str)
def self.domify(str, force_doc: false)
  if force_doc || /<!DOCTYPE/i.match?(str[0, 512])
    Nokogiri::HTML(str)
  else
    Nokogiri::HTML.fragment(str)
  end
end

.prettify(obj) ⇒ Object

Pretty-print some HTML



170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/sitediff/sanitize.rb', line 170

# Pretty-print some HTML.
#
# Runs +obj+ through the pretty_print.xsl stylesheet, then strips the wrapper
# markup and whitespace noise from the result.
#
# @param obj the HTML to format; it is converted with to_document first
# @return [String] the cleaned-up, pretty-printed markup
def self.prettify(obj)
  # Load and compile the stylesheet once; later calls reuse @stylesheet.
  @stylesheet ||= begin
    stylesheet_path = File.join(SiteDiff::FILES_DIR, 'pretty_print.xsl')
    Nokogiri::XSLT(File.read(stylesheet_path))
  end

  # Pull out the html element's children.
  # The obvious way to do this is to iterate over pretty.css('html'),
  # but that tends to segfault Nokogiri, so the output is post-processed
  # as a string instead.
  str = @stylesheet.apply_to(to_document(obj))

  # There's a lot of cruft left over that we don't want.

  # Prevent potential UTF-8 encoding errors by removing invalid bytes.
  # Not the only solution.
  # An alternative is to return the string unmodified.
  # NOTE(review): the source encoding is declared as 'binary', so this looks
  # like it drops every non-ASCII byte, not only invalid ones -- confirm that
  # this is intended.
  str = str.encode(
    'UTF-8',
    'binary',
    invalid: :replace,
    undef: :replace,
    replace: ''
  )
  # Remove xml declaration and <html> tags
  str.sub!(/\A<\?xml.*$\n/, '')
  str.sub!(/\A^<html>$\n/, '')
  str.sub!(%r{</html>\n\Z}, '')

  # Remove top-level indentation: measure the whitespace at the very start of
  # the string and strip up to that many whitespace characters from the
  # beginning of every line.
  indent = /\A(\s*)/.match(str)[1].size
  str.gsub!(/^\s{,#{indent}}/, '')

  # Remove blank lines
  str.gsub!(/^\s*$\n/, '')

  # Remove DOS newlines: a carriage return at the end of a line, either as a
  # raw byte or as the &#13; character reference.
  str.gsub!(/\x0D$/, '')
  str.gsub!(/&#13;$/, '')

  str
end

.remove_node_spacing(node) ⇒ Object

Remove double-spacing inside text nodes



138
139
140
141
142
143
# File 'lib/sitediff/sanitize.rb', line 138

# Collapse every run of two or more spaces into a single space.
#
# Only text nodes returned by the '//text()' XPath query are rewritten, so
# attribute values are left alone.
#
# @param node an object responding to #xpath
# @return whatever #each returns for the collection of text nodes
def self.remove_node_spacing(node)
  node.xpath('//text()').each do |text_node|
    collapsed = text_node.content.gsub(/  +/, ' ')
    text_node.content = collapsed
  end
end

.select_fragments(node, sel) ⇒ Object

Get a fragment consisting of the elements matching the selector(s)



160
161
162
163
164
165
166
167
# File 'lib/sitediff/sanitize.rb', line 160

# Get a fragment consisting of the elements matching the selector(s).
#
# The result is always a fragment: if +node+ already is one it is reused
# (its children are replaced), otherwise a new empty fragment is created, so
# any DOCTYPE and similar document-level content is lost.
#
# @param node the node to search; must respond to #css and #fragment?
# @param sel the CSS selector(s) handed to #css
# @return the fragment whose children are the matched elements
def self.select_fragments(node, sel)
  matches = node.css(sel)
  root = node.fragment? ? node : Nokogiri::HTML.fragment('')
  root.children = matches
  root
end

.to_document(obj) ⇒ Object

Force this object to be a document, so we can apply a stylesheet



222
223
224
225
226
227
228
229
230
231
# File 'lib/sitediff/sanitize.rb', line 222

# Force this object to be a document, so we can apply a stylesheet.
#
# Classes are matched with `case`/`when` (is_a? semantics) rather than
# instance_of?, so subclasses such as element nodes or other document
# classes are recognised too. Documents are tested first because a document
# is itself a kind of node.
#
# @param obj a Nokogiri document, a Nokogiri node or fragment, or anything
#   domify can parse (normally a String of markup)
# @return a document: +obj+ itself when it already is one, otherwise the
#   result of re-parsing its markup with domify(..., force_doc: true)
def self.to_document(obj)
  case obj
  when Nokogiri::XML::Document
    obj
  when Nokogiri::XML::Node
    # node or fragment: serialize it and parse again as a full document
    domify(obj.to_s, force_doc: true)
  else
    to_document(domify(obj, force_doc: false))
  end
end

Instance Method Details

#canonicalize_rule(name) ⇒ Object

Canonicalize a simple rule, eg: ‘remove_spacing’ or ‘selector’. It may be a simple value, or a hash, or an array of hashes. Turn it into an array of hashes.



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/sitediff/sanitize.rb', line 61

# Canonicalize a simple rule, eg: 'remove_spacing' or 'selector'.
#
# The configured value may be a scalar, a single rule hash, or an array of
# rule hashes. It is normalized to an array of hashes, filtered with
# want_rule, and the single surviving rule is returned.
#
# The shape is detected by class instead of by indexing into the value:
# indexing raised for scalars such as `true`, and mistook a selector string
# containing "value" (eg 'input[value]') for a rule hash.
#
# @param name [String] the configuration key to read from @config
# @return [Hash, nil] the one matching rule, or nil when the setting is
#   absent/falsy or no rule is wanted
# @raise [RuntimeError] when more than one rule matches
def canonicalize_rule(name)
  rules = @config[name]
  return nil unless rules

  candidates =
    if rules.is_a?(Array)
      # Already an array? Do nothing.
      rules
    elsif rules.is_a?(Hash) && rules.key?('value')
      # A single rule hash (even one whose value is false): put it in an array.
      [rules]
    else
      # Anything else is the value itself: wrap it in a rule.
      [{ 'value' => rules }]
    end

  want = candidates.select { |r| want_rule(r) }
  return nil if want.empty?
  raise "Too many matching rules of type #{name}" if want.size > 1

  want.first
end

#dom_transforms ⇒ Object

Perform DOM transforms



125
126
127
128
129
130
131
132
133
# File 'lib/sitediff/sanitize.rb', line 125

# Perform DOM transforms.
#
# Reads the 'dom_transform' rules from @config, keeps those accepted by
# want_rule, and applies a DomTransform built from each one to @node.
#
# @return [Array, nil] the rules that were applied, or nil when no
#   'dom_transform' setting is configured
def dom_transforms
  configured = @config['dom_transform']
  return unless configured

  configured.select { |rule| want_rule(rule) }.each do |rule|
    DomTransform.create(rule).apply(@node)
  end
end

#regexps ⇒ Object

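Applies regexps. Also converts the DOM node into pretty-printed HTML (stored in @html) after the selector-scoped regexps have run and before the global ones are applied.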



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/sitediff/sanitize.rb', line 101

# Applies regexps, and turns @node into pretty-printed HTML along the way.
#
# Rules come from the 'sanitization' setting, filtered by want_rule. Rules
# that report selector? are applied to @node first; the node is then
# pretty-printed into @html (and @node cleared) before the remaining rules
# are applied to that string.
#
# @return [Array, nil] the non-selector rules, or nil when no 'sanitization'
#   setting is configured
def regexps
  configured = @config['sanitization']
  return unless configured

  active = configured.select { |r| want_rule(r) }.map { |r| Regexp.create(r) }
  node_rules, text_rules = active.partition(&:selector?)

  node_rules.each { |r| r.apply(@node) }

  @html = Sanitizer.prettify(@node)
  @node = nil
  # Prevent potential UTF-8 encoding errors by removing bytes.
  # Not the only solution; an alternative is to return the string unmodified.
  @html = @html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')

  # The return value of apply is ignored here.
  # NOTE(review): presumably apply edits the string in place -- confirm.
  text_rules.each { |r| r.apply(@html) }
end

#regions ⇒ Object

Perform the ‘regions’ action. When regions exist, the ‘selector’ action is not performed.



88
89
90
91
92
# File 'lib/sitediff/sanitize.rb', line 88

# Perform the 'regions' action.
#
# When validate_regions passes, @node is replaced by the result of
# select_regions, using the 'regions' setting and the :output option.
#
# @return the new @node, or nil when validate_regions is falsy
def regions
  @node = select_regions(@node, @config['regions'], @opts[:output]) if validate_regions
end

#remove_spacing ⇒ Object

Perform ‘remove_spacing’ action



82
83
84
85
# File 'lib/sitediff/sanitize.rb', line 82

# Perform the 'remove_spacing' action.
#
# Looks up the canonical 'remove_spacing' rule and, when it exists and its
# 'value' is truthy, collapses spacing in @node.
#
# @return the result of Sanitizer.remove_node_spacing, or nil when the
#   action is not performed
def remove_spacing
  rule = canonicalize_rule('remove_spacing')
  Sanitizer.remove_node_spacing(@node) if rule && rule['value']
end

#sanitize ⇒ Object

Performs sanitization.



31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/sitediff/sanitize.rb', line 31

# Performs sanitization.
#
# Parses @html into @node, then runs the steps in a fixed order: spacing
# removal, regions (or, when regions returns nothing, selector), DOM
# transforms, and regexps.
#
# @return the sanitized HTML: @html when a step has set it, otherwise the
#   pretty-printed @node; '' for empty input
def sanitize
  # Quick return on empty input
  return '' if @html == ''

  @node = Sanitizer.domify(@html)
  @html = nil

  remove_spacing
  selector unless regions
  dom_transforms
  regexps

  @html || Sanitizer.prettify(@node)
end

#select_regions(node, regions, output) ⇒ Object

Restructure the node into regions.



146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/sitediff/sanitize.rb', line 146

# Restructure the node into regions.
#
# For every name in +output+, a <region id="name"> element is created and
# the elements of +node+ matching that region's 'selector' are added to it.
# The region elements are then added, in +output+ order, to a new empty
# HTML fragment.
#
# @param node the node to search; must respond to #css
# @param regions the region definitions, looked up with get_named_region
# @param output the region names to emit, in order
# @return the new fragment containing one region element per name
def select_regions(node, regions, output)
  wrappers = output.map do |name|
    css = get_named_region(regions, name)['selector']
    wrapper = Nokogiri::XML.fragment("<region id=\"#{name}\"></region>").at_css('region')
    node.css(css).each { |match| wrapper.add_child match }
    wrapper
  end

  wrappers.each_with_object(Nokogiri::HTML.fragment('')) do |wrapper, root|
    root.add_child wrapper
  end
end

#selector ⇒ Object

Perform ‘selector’ action, to choose a new root



95
96
97
98
# File 'lib/sitediff/sanitize.rb', line 95

# Perform the 'selector' action, to choose a new root.
#
# When a canonical 'selector' rule exists, @node is replaced by the fragment
# of elements matching the rule's 'value'.
#
# @return the new @node, or nil when there is no 'selector' rule
def selector
  rule = canonicalize_rule('selector')
  @node = Sanitizer.select_fragments(@node, rule['value']) if rule
end

#want_rule(rule) ⇒ Object

Return whether or not we want to keep a rule



46
47
48
49
50
51
52
53
54
55
56
# File 'lib/sitediff/sanitize.rb', line 46

# Return whether or not we want to keep a rule.
#
# A rule is dropped when it is missing or has a truthy 'disabled' entry.
# When the rule has a 'path' and the :path option is set, the rule is kept
# only if 'path', compiled as a regular expression, matches that option.
#
# @param rule [Hash, nil] the rule to check
# @return false for a missing or disabled rule; the MatchData (or nil) of
#   the path check when one is performed; true otherwise
def want_rule(rule)
  return false if !rule || rule['disabled']

  pattern = rule['path']
  current = pattern && @opts[:path]
  # No path restriction, or no current path to test it against: keep the rule.
  return true unless current

  # Filter out if path regexp doesn't match
  ::Regexp.new(pattern).match(current)
end