Class: SearchIndexer::HtmlScrubber

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
app/services/search_indexer.rb

Constant Summary collapse

MENTION_CLASSES =
%w[mention mention-group]
ATTRIBUTES =
%w[alt title href data-video-title]

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ HtmlScrubber

Returns a new instance of HtmlScrubber.



360
361
362
# File 'app/services/search_indexer.rb', line 360

# Starts each scrubber with its own empty, unfrozen text buffer.
def initialize
  @scrubbed = "".dup
end

Instance Attribute Details

#scrubbed ⇒ Object (readonly)

Returns the value of attribute scrubbed.



358
359
360
# File 'app/services/search_indexer.rb', line 358

# Reader for the @scrubbed instance variable.
#
# @return [Object] the current value of @scrubbed
def scrubbed
  @scrubbed
end

Class Method Details

.scrub(html) ⇒ Object



364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
# File 'app/services/search_indexer.rb', line 364

# Reduces an HTML fragment to the whitespace-normalised text that is left
# after pruning markup.
#
# The fragment is wrapped in a <div>, parsed with Nokogiri, pruned in place
# and then re-serialised and streamed through a SAX parser that uses a new
# instance of this class as its handler; the handler's accumulated text is
# the result.
#
# @param html [String, nil] HTML fragment to scrub
# @return [String] squished text; an empty string when +html+ is blank or
#   when Nokogiri raises ArgumentError while parsing it
def self.scrub(html)
  return +"" if html.blank?

  begin
    document = Nokogiri.HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
  rescue ArgumentError
    # Deliberate best effort: input the parser rejects yields no text.
    return +""
  end

  # Inside lightbox wrappers only <a> and <img> elements survive, and the
  # surviving <a> elements lose every attribute listed in ATTRIBUTES.
  document
    .css("div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}")
    .each do |wrapper|
      wrapper.traverse do |child_node|
        next if child_node == wrapper

        if %w[a img].exclude?(child_node.name)
          child_node.remove
        elsif child_node.name == "a"
          ATTRIBUTES.each { |attribute| child_node.remove_attribute(attribute) }
        end
      end
    end

  document.css("img.emoji").each { |node| node.remove_attribute("alt") }

  document
    .css("a[href]")
    .each do |node|
      # Read both attributes once, before any mutation. The previous version
      # re-read node["href"] after it might already have been removed, so an
      # "anchor" link whose href equalled its text raised NoMethodError on nil.
      href = node["href"]
      css_class = node["class"]

      drop_href =
        href == node.text || MENTION_CLASSES.include?(css_class) ||
          (css_class == "anchor" && href.start_with?("#"))

      node.remove_attribute("href") if drop_href
    end

  html_scrubber = new
  Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
  html_scrubber.scrubbed.squish
end

Instance Method Details

#characters(str) ⇒ Object



422
423
424
# File 'app/services/search_indexer.rb', line 422

# Appends +str+ to the scrubbed buffer with one space on either side, so
# that consecutive chunks never run together.
#
# @param str [#to_s] chunk of text to append
# @return [String] the buffer returned by #scrubbed
def characters(str)
  padded = " #{str} "
  scrubbed.concat(padded)
end

#start_element(_name, attributes = []) ⇒ Object



411
412
413
414
415
416
417
418
419
420
# File 'app/services/search_indexer.rb', line 411

# Feeds selected attribute values of an opening tag to #characters.
#
# Only the names listed in ATTRIBUTES are considered, in that order. Blank
# values are skipped, and so is an "href" for which UrlHelper.is_local
# returns a truthy value.
#
# @param _name [String] element name (unused)
# @param attributes [Array<Array(String, String)>] name/value pairs
# @return [Array<String>] ATTRIBUTES
def start_element(_name, attributes = [])
  values_by_name = Hash[*attributes.flatten]

  ATTRIBUTES.each do |key|
    value = values_by_name[key]
    next if value.blank?
    next if key == "href" && UrlHelper.is_local(value)

    characters(value)
  end
end