Class: SearchIndexer::HtmlScrubber
- Inherits: Nokogiri::XML::SAX::Document
- Inheritance chain: Object → Nokogiri::XML::SAX::Document → SearchIndexer::HtmlScrubber
- Defined in:
- app/services/search_indexer.rb
Instance Attribute Summary collapse
-
#scrubbed ⇒ Object
readonly
Returns the value of attribute scrubbed.
Class Method Summary collapse
- .scrub(html) ⇒ Object
Instance Method Summary collapse
- #characters(str) ⇒ Object
-
#initialize ⇒ HtmlScrubber
constructor
A new instance of HtmlScrubber.
- #start_element(_name, attributes = []) ⇒ Object
Constructor Details
#initialize ⇒ HtmlScrubber
Returns a new instance of HtmlScrubber.
352 353 354 |
# File 'app/services/search_indexer.rb', line 352
#
# Starts the handler with an empty, mutable string buffer. The other
# callbacks on this class append the text they collect to this buffer.
def initialize
  @scrubbed = "".dup
end
Instance Attribute Details
#scrubbed ⇒ Object (readonly)
Returns the value of attribute scrubbed.
350 351 352 |
# File 'app/services/search_indexer.rb', line 350
#
# Read-only access to the text buffer held in @scrubbed.
attr_reader :scrubbed
Class Method Details
.scrub(html) ⇒ Object
356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 |
# File 'app/services/search_indexer.rb', line 356
#
# Reduces an HTML fragment to a single whitespace-normalised string made of
# its text content plus the attribute values the SAX callbacks choose to keep.
#
# Steps, in order:
#   1. Parse the fragment (wrapped in a <div>) as HTML5.
#   2. Inside lightbox wrapper <div>s keep only <a> and <img> nodes, and
#      strip every attribute named in ATTRIBUTES from the <a> nodes.
#   3. Drop the `alt` attribute of emoji images.
#   4. Drop `href` from links that would only repeat noise (see below).
#   5. Feed the cleaned markup through a new HtmlScrubber SAX handler and
#      return its squished buffer.
#
# @param html [String, nil] HTML fragment to scrub
# @return [String] the scrubbed text; an empty string when +html+ is blank
#   or when parsing raises ArgumentError
def self.scrub(html)
  return +"" if html.blank?

  begin
    document = Nokogiri.HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
  rescue ArgumentError
    # Input the parser rejects is treated the same as empty input.
    return +""
  end

  # Iterating an empty node set is a no-op, so no emptiness check is needed.
  document
    .css("div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}")
    .each do |wrapper|
      wrapper.traverse do |child_node|
        next if child_node == wrapper

        if child_node.name == "a"
          ATTRIBUTES.each { |attribute| child_node.remove_attribute(attribute) }
        elsif child_node.name != "img"
          child_node.remove
        end
      end
    end

  document.css("img.emoji").each { |node| node.remove_attribute("alt") }

  document
    .css("a[href]")
    .each do |node|
      # Read both attributes once, before any mutation. The previous
      # two-step version re-read `href` after it might already have been
      # removed, so an anchor-class link whose text equals its href raised
      # NoMethodError on nil.
      href = node["href"]
      css_class = node["class"]

      redundant_href =
        href == node.text || MENTION_CLASSES.include?(css_class) ||
          (css_class == "anchor" && href.start_with?("#"))

      node.remove_attribute("href") if redundant_href
    end

  html_scrubber = new
  Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
  html_scrubber.scrubbed.squish
end
Instance Method Details
#characters(str) ⇒ Object
414 415 416 |
# File 'app/services/search_indexer.rb', line 414
#
# Appends +str+ to the scrubbed buffer with a space on each side, so that
# consecutive pieces never run together.
#
# @param str [#to_s] text to append
# @return [String] the buffer after appending
def characters(str)
  padded = " #{str} "
  scrubbed << padded
end
#start_element(_name, attributes = []) ⇒ Object
403 404 405 406 407 408 409 410 411 412 |
# File 'app/services/search_indexer.rb', line 403
#
# For an opening tag, appends the values of the attributes named in
# ATTRIBUTES to the scrubbed buffer. Blank values are skipped, and so is an
# `href` that UrlHelper.is_local reports as local.
#
# @param _name [String] element name (unused)
# @param attributes [Array] attribute names and values; flattened before
#   being turned into a Hash, so both flat and paired layouts are accepted
# @return [Object] the ATTRIBUTES collection (the result of +each+)
def start_element(_name, attributes = [])
  attribute_map = Hash[*attributes.flatten]

  ATTRIBUTES.each do |attribute_name|
    value = attribute_map[attribute_name]

    next unless value.present?
    next if attribute_name == "href" && UrlHelper.is_local(value)

    characters(value)
  end
end