Class: Biblionet::Extractors::PublisherDataExtractor

Inherits:

Object

Object
Biblionet::Extractors::PublisherDataExtractor

show all

Defined in: lib/bookshark/extractors/publisher_extractor.rb

Instance Attribute Summary collapse

#nodeset ⇒ Object readonly

Returns the value of attribute nodeset.

Instance Method Summary collapse

#bookstores ⇒ Object
#headquarters ⇒ Object
#initialize(document) ⇒ PublisherDataExtractor constructor

A new instance of PublisherDataExtractor.
#name ⇒ Object
#owner ⇒ Object

Constructor Details

#initialize(document) ⇒ `PublisherDataExtractor`

Returns a new instance of PublisherDataExtractor.

# File 'lib/bookshark/extractors/publisher_extractor.rb', line 48

def initialize(document)
  # No need to operate on whole page. Just on part containing the content.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
  if (content_re.match(document)).nil?
    puts document
  end
  content = content_re.match(document)[0] unless (content_re.match(document)).nil?

  # If content is nil, there is something wrong with the html, so return nil
  if content.nil?
    @nodeset = nil
  else
    @nodeset = Nokogiri::HTML(content)
  end                 
end

Instance Attribute Details

#nodeset ⇒ `Object` (readonly)

Returns the value of attribute nodeset.



46
47
48

# File 'lib/bookshark/extractors/publisher_extractor.rb', line 46

# Reader for the parsed content fragment held by this extractor.
#
# @return [Object, nil] whatever is currently stored in @nodeset
def nodeset
  instance_variable_get(:@nodeset)
end

Instance Method Details

#bookstores ⇒ `Object`

# File 'lib/bookshark/extractors/publisher_extractor.rb', line 107

def bookstores
  bookstores_hash = Hash.new { |h,k| h[k] = {} }
  address_array   = []
  tel_array       = []

  # Defaunt key in case there is none.
  key = 'Βιβλιοπωλείο'

  @nodeset.css('//p[align="justify"]').inner_html.split('<br><br>').map(&:strip).reject(&:empty?).each do |item_group|
    if item_group.end_with?(":")
      key           = item_group[0..-2]
      address_array = []
      tel_array     = []
    else        
      if bookstores_hash[key].any?          
        key[-1].to_i
        key += ((key[-1].to_i > 0) ? (' '+(key[-1].to_i+1).to_s) : ' 2')
        address_array = []
        tel_array     = []         
      end        
      item_group.split('<br>').each do |item|          
        regex_tel   = /\d{3,5} \d{5,7}/
        regex_tk    = /\d{3} \d{2}/
        regex_email = /([\w+\-].?)+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+/i
        regex_url   = /((http(?:s)?\:\/\/)?[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)*\.[a-zA-Z]{2,6}(?:\/?|(?:\/[\w\-]+)*)(?:\/?|\/\w+\.[a-zA-Z]{2,4}(?:\?[\w]+\=[\w\-]+)?)?(?:\&[\w]+\=[\w\-]+)*)/ix
        
        if item.end_with?(":")                   
          key           = item[0..-2]
          address_array = []
          tel_array     = []
        elsif (item.start_with?("Fax") or item.start_with?("fax")) and item =~ regex_tel            
          bookstores_hash[key][:fax]        = item.gsub(/[^\d{3} \d{2}]/, '').strip            
        elsif item =~ regex_tel
          tel_array << item.gsub(/[^\d{3} \d{2}]/, '').strip            
          bookstores_hash[key][:telephone]  = tel_array            
        elsif item =~ regex_tk
          address_array << item.gsub(/,$/, '').strip                       
          bookstores_hash[key][:address]    = address_array            
        elsif item =~ regex_email            
          bookstores_hash[key][:email]      = (regex_email.match(item))[0]                        
        elsif item =~ regex_url            
          bookstores_hash[key][:website]    = item[regex_url,1]          
        else     
          address_array << item.gsub(/,$/, '').strip            
          bookstores_hash[key][:address]   = address_array                            
        end
      end                
    end      
  end
  bookstores_hash.delete_if { |k, v| v.empty? }
  return bookstores_hash
end

#headquarters ⇒ `Object`

# File 'lib/bookshark/extractors/publisher_extractor.rb', line 72

# Extracts the headquarters details from the rows of the 'book_details' table.
#
# For every row the first child's text is the label and the second child's
# text is the value. A label ending with ':' starts a new entry; any other row
# continues the previous entry, whose value then becomes an array of lines.
# Rows whose label and value are both empty are ignored.
#
# Labels are translated to the same symbols used by #bookstores. Labels
# missing from the mapping all end up under the nil key.
#
# @return [Hash] may contain :address (always an Array, empty when the page
#   has none), :telephone (Array), :fax, :email and :website (Array when the
#   value is a comma separated list)
# @raise [NoMethodError] when the nodeset is nil
def headquarters
  headquarters_hash = {}
  temp_array        = []
  last_key          = nil

  @nodeset.xpath("//table[@class='book_details'][1]//tr").each do |item|
    key         = item.children[0].text.strip
    current_key = key.end_with?(':') ? key[0..-2] : last_key
    value       = item.children[1].text.strip

    unless key.empty? && value.empty?
      if current_key == last_key
        # Continuation row: turn the stored single value into a list first.
        existing = headquarters_hash[current_key]
        temp_array << existing unless existing.is_a?(Array)
        temp_array << value.gsub(/,$/, '').strip unless value.empty?
        headquarters_hash[current_key] = temp_array
      else
        temp_array                     = []
        headquarters_hash[current_key] = value.gsub(/,$/, '').strip
      end
    end

    last_key = current_key
  end

  # Change keys. Use the same as in bookstores.
  mappings          = {"Διεύθυνση" => :address, "Τηλ" => :telephone, "FAX" => :fax, "E-mail" => :email, "Web site" => :website}
  headquarters_hash = Hash[headquarters_hash.map { |k, v| [mappings[k], v] }]

  telephone = headquarters_hash[:telephone]
  headquarters_hash[:telephone] = [telephone] unless telephone.nil? || telephone.is_a?(Array)

  website = headquarters_hash[:website]
  headquarters_hash[:website] = website.split(',').map(&:strip) if website && website.include?(',')

  # Array() wraps a single line, keeps a list as is and turns a missing
  # address into [] rather than [nil].
  headquarters_hash[:address] = Array(headquarters_hash[:address])

  headquarters_hash
end

#name ⇒ `Object`



64
65
66

# File 'lib/bookshark/extractors/publisher_extractor.rb', line 64

# Publisher name, read from the 'h1.page_title' heading(s) of the content.
#
# @return [String] the heading text without surrounding whitespace
def name
  title_nodes = @nodeset.css('h1.page_title')
  title_text  = title_nodes.text
  title_text.strip
end

#owner ⇒ `Object`



68
69
70

# File 'lib/bookshark/extractors/publisher_extractor.rb', line 68

# Owner text: the text nodes selected by both XPath queries below, i.e. those
# following the 'page_title' heading and preceding the 'book_details' table.
#
# @return [String] the combined text of those nodes, stripped
def owner
  after_title  = @nodeset.xpath("//h1[@class='page_title'][1]/following::text()")
  before_table = @nodeset.xpath("//table[@class='book_details'][1]/preceding::text()")

  # Intersection of both sets keeps only what lies between heading and table.
  between = after_title & before_table
  between.text.strip
end

Class: Biblionet::Extractors::PublisherDataExtractor

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ PublisherDataExtractor

Instance Attribute Details

#nodeset ⇒ Object (readonly)

Instance Method Details

#bookstores ⇒ Object

#headquarters ⇒ Object

#name ⇒ Object

#owner ⇒ Object

#initialize(document) ⇒ `PublisherDataExtractor`

#nodeset ⇒ `Object` (readonly)

#bookstores ⇒ `Object`

#headquarters ⇒ `Object`

#name ⇒ `Object`

#owner ⇒ `Object`