Class: Biblionet::Extractors::PublisherDataExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/bookshark/extractors/publisher_extractor.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ PublisherDataExtractor

Returns a new instance of PublisherDataExtractor.



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/bookshark/extractors/publisher_extractor.rb', line 48

def initialize(document)
  # No need to operate on whole page. Just on part containing the content.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
  if (content_re.match(document)).nil?
    puts document
  end
  content = content_re.match(document)[0] unless (content_re.match(document)).nil?

  # If content is nil, there is something wrong with the html, so return nil
  if content.nil?
    @nodeset = nil
  else
    @nodeset = Nokogiri::HTML(content)
  end                 
end

Instance Attribute Details

#nodeset ⇒ Object (readonly)

Returns the value of attribute nodeset.



46
47
48
# File 'lib/bookshark/extractors/publisher_extractor.rb', line 46

# Reader for the parsed content fragment assigned by the constructor.
# The value is nil when the constructor could not locate the content section
# of the page.
def nodeset
  @nodeset
end

Instance Method Details

#bookstores ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/bookshark/extractors/publisher_extractor.rb', line 107

# Extracts the publisher's bookstores from the justified paragraph of the page.
#
# The paragraph's HTML is split on "<br><br>" into groups and every group on
# "<br>" into items. A group or item ending in ":" is a heading naming the
# bookstore described next; details found before any heading are filed under a
# default name. Further groups under the same heading are numbered
# ("X", "X 2", "X 3", ...).
#
# @return [Hash{String => Hash}] bookstore name => details. Details may hold
#   :address and :telephone (arrays of strings) and :fax, :email, :website
#   (strings). Bookstores for which nothing was found are dropped.
def bookstores
  regex_tel   = /\d{3,5} \d{5,7}/
  regex_tk    = /\d{3} \d{2}/
  regex_email = /([\w+\-].?)+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+/i
  regex_url   = /((http(?:s)?\:\/\/)?[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)*\.[a-zA-Z]{2,6}(?:\/?|(?:\/[\w\-]+)*)(?:\/?|\/\w+\.[a-zA-Z]{2,4}(?:\?[\w]+\=[\w\-]+)?)?(?:\&[\w]+\=[\w\-]+)*)/ix
  # Everything except digits and spaces is stripped from phone and fax numbers.
  regex_not_number = /[^\d ]/

  bookstores_hash = Hash.new { |h, k| h[k] = {} }
  address_array   = []
  tel_array       = []

  # Default key in case there is none.
  base_key = 'Βιβλιοπωλείο'
  key      = base_key
  # Position of the current group among the groups sharing base_key.
  sequence = 1

  groups = @nodeset.css('//p[align="justify"]').inner_html.split('<br><br>').map(&:strip).reject(&:empty?)

  groups.each do |item_group|
    if item_group.end_with?(':')
      # The whole group is a heading: it names the groups that follow.
      base_key      = item_group[0..-2]
      key           = base_key
      sequence      = 1
      address_array = []
      tel_array     = []
      next
    end

    if bookstores_hash[key].any?
      # The current name already holds details, so this group is another
      # bookstore under the same heading. Number it from the heading itself so
      # suffixes do not pile up ("X 3" rather than "X 2 3").
      sequence     += 1
      key           = "#{base_key} #{sequence}"
      address_array = []
      tel_array     = []
    end

    item_group.split('<br>').each do |item|
      if item.end_with?(':')
        base_key      = item[0..-2]
        key           = base_key
        sequence      = 1
        address_array = []
        tel_array     = []
      elsif item.start_with?('Fax', 'fax') && item =~ regex_tel
        bookstores_hash[key][:fax] = item.gsub(regex_not_number, '').strip
      elsif item =~ regex_tel
        tel_array << item.gsub(regex_not_number, '').strip
        bookstores_hash[key][:telephone] = tel_array
      elsif item =~ regex_tk
        # A postal code marks the item as part of the address.
        address_array << item.gsub(/,$/, '').strip
        bookstores_hash[key][:address] = address_array
      elsif item =~ regex_email
        bookstores_hash[key][:email] = regex_email.match(item)[0]
      elsif item =~ regex_url
        bookstores_hash[key][:website] = item[regex_url, 1]
      else
        # Anything unrecognised is treated as an address line.
        address_array << item.gsub(/,$/, '').strip
        bookstores_hash[key][:address] = address_array
      end
    end
  end

  # Looking a key up creates an empty entry; discard the ones never filled.
  bookstores_hash.delete_if { |_name, details| details.empty? }
  bookstores_hash
end

#headquarters ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/bookshark/extractors/publisher_extractor.rb', line 72

# Extracts the headquarters details from the rows of the first
# "book_details" table.
#
# The first child of a row is read as the label and the second as the value.
# A label ending in ":" starts a new entry; a row without such a label
# continues the previous entry, whose values are then collected in an array.
# Known labels are renamed to the keys used by #bookstores; any other label is
# kept under its original text.
#
# @return [Hash] details keyed by :address (always an Array), :telephone
#   (Array when present), :fax, :email, :website (String, or Array when
#   several values were found), plus any unrecognised labels as String keys
def headquarters
  headquarters_hash = {}
  last_key          = nil

  @nodeset.xpath("//table[@class='book_details'][1]//tr").each do |item|
    cells = item.children
    # A row may have fewer than two children; treat what is missing as empty.
    key   = cells[0] ? cells[0].text.strip : ''
    value = cells[1] ? cells[1].text.strip : ''

    current_key = key.end_with?(':') ? key[0..-2] : last_key

    unless key.empty? && value.empty?
      cleaned = value.gsub(/,$/, '').strip

      if current_key == last_key
        # Continuation of the previous entry: turn it into a list on first use.
        existing = headquarters_hash[current_key]
        values   = existing.is_a?(Array) ? existing : [existing]
        values << cleaned unless value.empty?
        headquarters_hash[current_key] = values
      else
        headquarters_hash[current_key] = cleaned
      end
    end

    last_key = current_key
  end

  # Change keys. Use the same as in bookstores. Labels without a mapping keep
  # their own text so that they cannot overwrite each other under a nil key.
  mappings          = {"Διεύθυνση" => :address, "Τηλ" => :telephone, "FAX" => :fax, "E-mail" => :email, "Web site" => :website}
  headquarters_hash = Hash[headquarters_hash.map { |label, details| [mappings.fetch(label, label), details] }]

  telephone = headquarters_hash[:telephone]
  headquarters_hash[:telephone] = [telephone] unless telephone.nil? || telephone.is_a?(Array)

  website = headquarters_hash[:website]
  headquarters_hash[:website] = website.split(',').map(&:strip) if website.is_a?(String) && website.include?(',')

  # The address is always an array; a missing address gives [] rather than [nil].
  address = headquarters_hash[:address]
  headquarters_hash[:address] = address.is_a?(Array) ? address : [address].compact

  headquarters_hash
end

#name ⇒ Object



64
65
66
# File 'lib/bookshark/extractors/publisher_extractor.rb', line 64

# Publisher name: the text of the "h1.page_title" heading(s) with surrounding
# whitespace removed.
def name
  title_nodes = @nodeset.css('h1.page_title')
  title_text  = title_nodes.text
  title_text.strip
end

#owner ⇒ Object



68
69
70
# File 'lib/bookshark/extractors/publisher_extractor.rb', line 68

# Text found between the first "page_title" heading and the first
# "book_details" table, stripped of surrounding whitespace. The two XPath
# results (text after the heading, text before the table) are intersected.
def owner
  after_title  = @nodeset.xpath("//h1[@class='page_title'][1]/following::text()")
  before_table = @nodeset.xpath("//table[@class='book_details'][1]/preceding::text()")
  between      = after_title & before_table
  between.text.strip
end