Class: SearchIndexer::HtmlScrubber
- Inherits: Nokogiri::XML::SAX::Document
  - Object
  - Nokogiri::XML::SAX::Document
  - SearchIndexer::HtmlScrubber
- Defined in:
- app/services/search_indexer.rb
Constant Summary collapse
- MENTION_CLASSES =
%w[mention mention-group]
- ATTRIBUTES =
%w[alt title href data-video-title]
Instance Attribute Summary collapse
-
#scrubbed ⇒ Object
readonly
Returns the value of attribute scrubbed.
Class Method Summary collapse
Instance Method Summary collapse
- #characters(str) ⇒ Object
-
#initialize ⇒ HtmlScrubber
constructor
A new instance of HtmlScrubber.
- #start_element(_name, attributes = []) ⇒ Object
Constructor Details
#initialize ⇒ HtmlScrubber
Returns a new instance of HtmlScrubber.
360 361 362 |
# File 'app/services/search_indexer.rb', line 360

# Starts the scrubber with an empty, unfrozen buffer; the SAX callbacks
# append to this buffer in place.
#
# @return [HtmlScrubber] a new instance of HtmlScrubber
def initialize
  @scrubbed = +""
end
Instance Attribute Details
#scrubbed ⇒ Object (readonly)
Returns the value of attribute scrubbed.
358 359 360 |
# File 'app/services/search_indexer.rb', line 358

# Read-only access to the text accumulated so far.
#
# @return [Object] the value of the @scrubbed instance variable
def scrubbed
  @scrubbed
end
Class Method Details
.scrub(html) ⇒ Object
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 |
# File 'app/services/search_indexer.rb', line 364

# Reduces an HTML fragment to the text collected by an HtmlScrubber.
#
# The fragment is wrapped in a <div> and parsed with Nokogiri.HTML5. Some
# nodes and attributes are then stripped from the document before it is
# serialised again and streamed through a fresh HtmlScrubber SAX handler.
#
# @param html [String, nil] HTML fragment; blank input yields an empty string
# @return [String] the collected text after `squish`; an empty, unfrozen
#   string when +html+ is blank or parsing raises ArgumentError
def self.scrub(html)
  return +"" if html.blank?

  begin
    document = Nokogiri.HTML5("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
  rescue ArgumentError
    return +""
  end

  # Inside lightbox wrappers, remove every descendant whose node name is not
  # `a` or `img`, and strip the indexed attributes from the `a` elements.
  document
    .css("div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}")
    .each do |wrapper|
      wrapper.traverse do |child_node|
        next if child_node == wrapper

        if %w[a img].exclude?(child_node.name)
          child_node.remove
        elsif child_node.name == "a"
          ATTRIBUTES.each { |attribute| child_node.remove_attribute(attribute) }
        end
      end
    end

  document.css("img.emoji").each { |node| node.remove_attribute("alt") }

  document
    .css("a[href]")
    .each do |node|
      # Read both attributes once, before anything is removed. The previous
      # two-step version re-read `href` after it could already have been
      # removed and then called a String method on nil.
      href = node["href"]
      css_class = node["class"]

      drop_href =
        href == node.text || MENTION_CLASSES.include?(css_class) ||
          (css_class == "anchor" && href.start_with?("#"))

      node.remove_attribute("href") if drop_href
    end

  html_scrubber = new
  Nokogiri::HTML::SAX::Parser.new(html_scrubber).parse(document.to_html)
  html_scrubber.scrubbed.squish
end
Instance Method Details
#characters(str) ⇒ Object
422 423 424 |
# File 'app/services/search_indexer.rb', line 422

# Appends +str+ to the scrubbed buffer with one space on each side.
# NOTE(review): the padding presumably keeps text from adjacent nodes from
# running together; `.scrub` squishes the final result.
#
# @param str [String] text to append
# @return [Object] the result of appending to the buffer
def characters(str)
  padded = " #{str} "
  scrubbed.concat(padded)
end
#start_element(_name, attributes = []) ⇒ Object
411 412 413 414 415 416 417 418 419 420 |
# File 'app/services/search_indexer.rb', line 411

# Adds the values of the attributes listed in ATTRIBUTES to the scrubbed
# text. An `href` is skipped when UrlHelper.is_local reports it as local.
#
# @param _name [String] element name (unused)
# @param attributes [Array] attribute names and values; assumed to be
#   name/value pairs (nested or flat) as supplied by the SAX parser
# @return [Array<String>] ATTRIBUTES, as returned by `each`
def start_element(_name, attributes = [])
  # Flatten first so both nested pairs and an already-flat list are accepted.
  values_by_name = attributes.flatten.each_slice(2).to_h

  ATTRIBUTES.each do |attribute_name|
    value = values_by_name[attribute_name]
    next if value.blank?
    next if attribute_name == "href" && UrlHelper.is_local(value)

    characters(value)
  end
end