Class: Shelver::Extractor

Inherits:

Object

Object
Shelver::Extractor

show all

Defined in: lib/shelver/extractor.rb

Instance Method Summary collapse

#extract_rels_ext(text, solr_doc = Solr::Document.new) ⇒ Object

Extracts content-model and hydra-type from RELS-EXT datastream.
#extract_tag(doc, type) ⇒ Object
#extract_tags(text) ⇒ Object
#html_content_to_solr(ds, solr_doc = Solr::Document.new) ⇒ Object

This method strips html tags out and returns content to be indexed in solr.
#xml_to_solr(text, solr_doc = Solr::Document.new) ⇒ Object

This method extracts solr fields from simple xml.

Instance Method Details

#extract_rels_ext(text, solr_doc = Solr::Document.new) ⇒ `Object`

Extracts content-model and hydra-type from RELS-EXT datastream.

# File 'lib/shelver/extractor.rb', line 25

# Extracts content-model and hydra-type fields from a RELS-EXT datastream.
#
# Each +hasModel+ element (namespace info:fedora/fedora-system:def/model#)
# adds a +cmodel_t+ field holding its +resource+ attribute. When that value
# is a key in config/hydra_types.yml, a +hydra_type_t+ field with the mapped
# value is added as well.
#
# @param text [String] RELS-EXT XML to parse
# @param solr_doc [Solr::Document] document the fields are appended to
# @return [Solr::Document] the same solr_doc that was passed in
def extract_rels_ext( text, solr_doc=Solr::Document.new )
  # TODO: only read in this file once
  config_path =
    if defined?(RAILS_ROOT)
      File.join(RAILS_ROOT, "config")
    else
      File.join(File.dirname(__FILE__), "..", "..", "config")
    end

  # The block form of File.open guarantees the handle is closed after parsing.
  # An empty YAML file parses to false, so fall back to an empty mapping.
  map = File.open(File.join(config_path, "hydra_types.yml")) { |file| YAML.load(file) } || {}

  doc = Nokogiri::XML(text)
  doc.xpath( '//foo:hasModel', 'foo' => 'info:fedora/fedora-system:def/model#' ).each do |element|
    cmodel = element.attributes['resource'].to_s
    solr_doc << Solr::Field.new( :cmodel_t => cmodel )

    # Only content models listed in the mapping get a hydra type.
    solr_doc << Solr::Field.new( :hydra_type_t => map[cmodel] ) if map.has_key?(cmodel)
  end

  solr_doc
end

#extract_tag(doc, type) ⇒ `Object`

# File 'lib/shelver/extractor.rb', line 15

# Reads the comma-separated tag list stored under /fields/<type>.
#
# @param doc [REXML::Document] parsed document to search
# @param type [String] element name below /fields (also the key of the result)
# @return [Hash] {} when the element is absent, otherwise
#   { type => [tag, ...] } with surrounding whitespace stripped from each tag;
#   an element without text yields an empty list
def extract_tag(doc, type)
  tags = doc.elements["/fields/#{type}"]
  return {} unless tags

  # Element#text is nil for an empty element, so coerce before splitting.
  { type => tags.text.to_s.split(',').map(&:strip) }
end

#extract_tags(text) ⇒ `Object`

# File 'lib/shelver/extractor.rb', line 10

# Parses +text+ as XML and gathers the archivist and donor tag lists.
#
# @param text [String] XML handed to REXML::Document.new
# @return [Hash] the extract_tag results for 'archivist_tags' and
#   'donor_tags' merged into a single hash
def extract_tags(text)
  parsed = REXML::Document.new(text)
  %w[archivist_tags donor_tags].inject({}) do |collected, tag_type|
    collected.merge(extract_tag(parsed, tag_type))
  end
end

#html_content_to_solr(ds, solr_doc = Solr::Document.new) ⇒ `Object`

This method strips html tags out and returns content to be indexed in solr.

# File 'lib/shelver/extractor.rb', line 63

# Converts HTML datastream content into solr fields.
#
# The content is HTML-unescaped and parsed. Each <story> element adds a
# +story_display+ field holding the XML of its children, and the content of
# every text node is concatenated into a single +story_t+ field.
#
# @param ds [#content] datastream whose content is the escaped HTML
# @param solr_doc [Solr::Document] document the fields are appended to
# @return [Solr::Document] the same solr_doc that was passed in
def html_content_to_solr( ds, solr_doc=Solr::Document.new )
  parsed = Nokogiri::HTML(CGI.unescapeHTML(ds.content))

  # One story_display field per <story> element, markup preserved.
  parsed.xpath('//story').each do |story_node|
    solr_doc << Solr::Field.new(:story_display => story_node.children.to_xml)
  end

  # Tag-free text: append the content of every text node, in document order.
  plain_text = parsed.xpath("//text()").inject(String.new) do |buffer, node|
    buffer << node.content
  end
  solr_doc << Solr::Field.new(:story_t => plain_text)

  solr_doc
end

#xml_to_solr(text, solr_doc = Solr::Document.new) ⇒ `Object`

This method extracts solr fields from simple xml.

# File 'lib/shelver/extractor.rb', line 51

# Extracts solr fields from simple XML.
#
# Each child element of the root adds one field named "<element name>_t"
# whose value is the element's text (an empty string when it has none).
# Input without a root element (e.g. an empty string) adds nothing.
#
# @param text [String] XML handed to REXML::Document.new
# @param solr_doc [Solr::Document] document the fields are appended to
# @return [Solr::Document] the same solr_doc that was passed in
def xml_to_solr( text, solr_doc=Solr::Document.new )
  root = REXML::Document.new( text ).root
  # REXML yields a nil root for empty input; there is nothing to index then.
  return solr_doc unless root

  root.elements.each do |element|
    solr_doc << Solr::Field.new( :"#{element.name}_t" => element.text.to_s )
  end

  solr_doc
end

Class: Shelver::Extractor

Instance Method Summary collapse

Instance Method Details

#extract_rels_ext(text, solr_doc = Solr::Document.new) ⇒ Object

#extract_tag(doc, type) ⇒ Object

#extract_tags(text) ⇒ Object

#html_content_to_solr(ds, solr_doc = Solr::Document.new) ⇒ Object

#xml_to_solr(text, solr_doc = Solr::Document.new) ⇒ Object

#extract_rels_ext(text, solr_doc = Solr::Document.new) ⇒ `Object`

#extract_tag(doc, type) ⇒ `Object`

#extract_tags(text) ⇒ `Object`

#html_content_to_solr(ds, solr_doc = Solr::Document.new) ⇒ `Object`

#xml_to_solr(text, solr_doc = Solr::Document.new) ⇒ `Object`