Module: Solrizer::HTML::Extractor

Defined in:
lib/solrizer/html/extractor.rb

Instance Method Summary

Instance Method Details

#html_to_solr(ds, solr_doc = Solr::Document.new) ⇒ Object

Strips the HTML tags out of the datastream's content and returns the Solr document with the content to be indexed added to it.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/solrizer/html/extractor.rb', line 11

# Adds Solr fields derived from the HTML held in a datastream.
#
# The datastream content is passed through CGI.unescapeHTML and parsed with
# Nokogiri::HTML. For each node matched by '//story', the XML of its children
# is added as a :story_display field. The content of every node matched by
# "//text()" is concatenated and added as a single :story_t field.
#
# @param ds [#content] datastream whose content is read
# @param solr_doc [Solr::Document] document the fields are appended to;
#   a new one is created when omitted
# @return [Solr::Document] the solr_doc that was passed in (or created)
def html_to_solr(ds, solr_doc = Solr::Document.new)
  parsed = Nokogiri::HTML(CGI.unescapeHTML(ds.content))

  # One :story_display field per <story> node, holding its inner markup.
  parsed.xpath('//story').each do |story_node|
    solr_doc << Solr::Field.new(:story_display => story_node.children.to_xml)
  end

  # Join the content of all text nodes, in the order xpath yields them,
  # into one string for the :story_t field.
  plain_text = parsed.xpath("//text()").each_with_object(String.new) do |node, buffer|
    buffer << node.content
  end
  solr_doc << Solr::Field.new(:story_t => plain_text)

  solr_doc
end