Class: SearchIndexer::HtmlScrubber

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
app/services/search_indexer.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ HtmlScrubber

Returns a new instance of HtmlScrubber.



352
353
354
# File 'app/services/search_indexer.rb', line 352

# Starts the handler with an empty, mutable text buffer; the SAX callbacks
# append to it through #scrubbed.
def initialize
  @scrubbed = "".dup
end

Instance Attribute Details

#scrubbed ⇒ Object (readonly)

Returns the value of attribute scrubbed.



350
351
352
# File 'app/services/search_indexer.rb', line 350

# Read-only access to the text buffer: #initialize sets it to an empty mutable
# string and #characters appends to it.
def scrubbed
  @scrubbed
end

Class Method Details

.scrub(html) ⇒ Object



356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
# File 'app/services/search_indexer.rb', line 356

# Reduces an HTML fragment to the text collected by this SAX handler.
#
# The fragment is wrapped in a <div>, parsed with Nokogiri.HTML5, pruned in
# place, then re-serialized and streamed through a new instance of this class
# via Nokogiri::HTML::SAX::Parser; the instance's callbacks fill #scrubbed.
#
# @param html [String, nil] HTML fragment to scrub
# @return [String] the collected text after `squish`, or an empty mutable
#   string when +html+ is blank or Nokogiri raises ArgumentError while parsing
def self.scrub(html)
  return +"" if html.blank?

  begin
    document = Nokogiri.HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
  rescue ArgumentError
    return +""
  end

  # Inside lightbox wrappers keep only <a> and <img> descendants, and strip
  # every attribute listed in ATTRIBUTES from the <a> elements.
  document
    .css("div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}")
    .each do |wrapper|
      wrapper.traverse do |descendant|
        next if descendant == wrapper

        if %w[a img].exclude?(descendant.name)
          descendant.remove
        elsif descendant.name == "a"
          ATTRIBUTES.each { |attribute| descendant.remove_attribute(attribute) }
        end
      end
    end

  document.css("img.emoji").each { |node| node.remove_attribute("alt") }

  document
    .css("a[href]")
    .each do |node|
      # Read both attributes once, before any removal: the selector guarantees
      # href is present here, whereas re-reading it after remove_attribute
      # yields nil and made the anchor check raise NoMethodError for links
      # matching both rules (e.g. <a class="anchor" href="#x">#x</a>).
      href = node["href"]
      css_class = node["class"]

      redundant_link = href == node.text || MENTION_CLASSES.include?(css_class)
      in_page_anchor = css_class == "anchor" && href.start_with?("#")

      node.remove_attribute("href") if redundant_link || in_page_anchor
    end

  html_scrubber = new
  Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
  html_scrubber.scrubbed.squish
end

Instance Method Details

#characters(str) ⇒ Object



414
415
416
# File 'app/services/search_indexer.rb', line 414

# Appends +str+ to the scrubbed buffer, padded with one space on each side.
#
# @param str [#to_s] text to append (interpolated into the padded string)
# @return [String] the buffer returned by #scrubbed
def characters(str)
  padded = " #{str} "
  scrubbed.concat(padded)
end

#start_element(_name, attributes = []) ⇒ Object



403
404
405
406
407
408
409
410
411
412
# File 'app/services/search_indexer.rb', line 403

# Element-start callback: for each name in ATTRIBUTES, forwards that
# attribute's value to #characters when it is present, skipping "href" values
# that UrlHelper.is_local reports as local.
#
# @param _name [String] element name (unused)
# @param attributes [Array<Array>] name/value pairs for the element
# @return [Object] ATTRIBUTES, as returned by `each`
def start_element(_name, attributes = [])
  attribute_map = Hash[*attributes.flatten]

  ATTRIBUTES.each do |attribute_name|
    value = attribute_map[attribute_name]

    next if value.blank?
    next if attribute_name == "href" && UrlHelper.is_local(value)

    characters(value)
  end
end