Class: Biblionet::Extractors::BookExtractor

Inherits:
Base
  • Object
show all
Defined in:
lib/bookshark/extractors/book_extractor.rb

Direct Known Subclasses

Search

Instance Attribute Summary collapse

Attributes inherited from Base

#biblionet_id, #filepath, #page, #url

Instance Method Summary collapse

Methods inherited from Base

#decode_text, decode_text, #load_page, #load_page_from_file, #load_page_from_url, #present?, #save_page

Methods included from FileManager

#list_directories, #list_files, #save_to

Constructor Details

#initialize(uri = nil) ⇒ BookExtractor

Returns a new instance of BookExtractor.



14
15
16
17
# File 'lib/bookshark/extractors/book_extractor.rb', line 14

# Creates the extractor, forwarding +uri+ to the Base constructor.
#
# When a uri was given and @page is set once the Base constructor returns,
# the book is extracted right away.
#
# @param uri [String, nil] location handed to the Base constructor
def initialize(uri = nil)
  super(uri)
  return if uri.nil? || @page.nil?

  extract_book
end

Instance Attribute Details

#book ⇒ Object (readonly)

Returns the value of attribute book.



12
13
14
# File 'lib/bookshark/extractors/book_extractor.rb', line 12

# Read-only accessor for @book, the hash that extract_book assigns
# when an extraction completes (nil until then).
def book
  @book
end

Instance Method Details

#extract_book(biblionet_id = @biblionet_id, book_page = @page) ⇒ Object



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# File 'lib/bookshark/extractors/book_extractor.rb', line 117

# Builds the book hash from a loaded book page and stores it in @book.
#
# Data comes from two places: the book page itself (read through
# BookDataExtractor) and the bibliographical record of the same book (read
# through BibliographicalBookExtractor). Missing image, author or details are
# reported with pp and the logger, and extraction carries on with whatever
# is available.
#
# @param biblionet_id [Object] id of the book, used in messages, in the
#   resulting hash (:b_id) and to build the bibliographical record location
# @param book_page [Object] the loaded page handed to BookDataExtractor
# @return [Hash, nil] the extracted book (also assigned to @book), or nil
#   when BookDataExtractor could not create a nodeset
def extract_book(biblionet_id=@biblionet_id, book_page=@page)
  log = Logger.new(STDOUT)

  page = BookDataExtractor.new(book_page)

  # End extraction if BookDataExtractor couldn't create a nodeset.
  return nil if page.nodeset.nil?

  book_hash = Hash.new

  # A missing image is only worth a warning; img simply stays nil.
  begin
    img = page.image
    raise NoImageError.new(biblionet_id) if img.nil?
  rescue NoImageError => e
    pp e
    log.warn(e.message)
  rescue StandardError => e
    pp err_msg = "Error #{e} at book: #{biblionet_id}"
    log.error(err_msg)
  end

  book_hash[:title] = page.title
  book_hash[:subtitle] = page.subtitle
  book_hash[:image] = img

  # Authors are reported under their own key, everyone else under :contributors.
  contributors = proccess_contributors(page.contributors)
  author = contributors.delete(:author)

  # If author is empty, maybe it's a collective work.
  if author.nil? || author.empty?
    if page.collective_work?
      author = ['Συλλογικό έργο']
    else
      pp err_msg = "No author has been found at book: #{biblionet_id}"
      log.warn(err_msg)
      author = []
    end
  end

  book_hash[:author]       = author
  book_hash[:contributors] = contributors
  book_hash[:publisher]    = page.publisher

  details = page.details
  if details.nil?
    pp err_msg = "No details at book: #{biblionet_id}"
    log.error(err_msg)
  end

  # Missing details are reported above; pass an empty list so processing
  # yields an empty hash instead of failing on nil.
  details_hash = proccess_details(details || [])

  book_hash[:isbn] = details_hash[:isbn]

  # Without an explicit ISBN-13, reuse the plain ISBN when it has 13
  # characters once surrounding whitespace and dashes are removed.
  if details_hash[:isbn_13].nil?
    if present?(details_hash[:isbn]) && (details_hash[:isbn].strip.gsub('-','').length == 13)
      book_hash[:isbn_13] = book_hash[:isbn]
    else
      book_hash[:isbn_13] = nil
    end
  else
    book_hash[:isbn_13] = details_hash[:isbn_13]
  end

  book_hash[:award]       = page.awards
  book_hash[:description] = page.description

  ddcs = page.ddcs.map do |ddc_link|
    # Extract from href the ddc id used by biblionet. --- DDC url http://biblionet.gr/index/id ---
    ddc_biblionet_id = ddc_link[:href].split(/\//).last

    # Extract DDC id and DDC text, then attach the biblionet id.
    proccess_ddc(ddc_link.text).merge!(b_id: ddc_biblionet_id)
  end

  book_hash[:category] = ddcs
  book_hash[:b_id]     = biblionet_id

  # The bibliographical record is fetched from the web when this page came
  # from a url, or read from a sibling file when it came from disk.
  uri = nil

  if @url
    uri =  "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
  elsif @filepath
    uri = File.dirname(@filepath) + "/" + "bg_record_#{biblionet_id}.html"
  end

  bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
  bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)

  if bibliographical_details.nil?
    # No record could be extracted (e.g. no uri could be built). Keep the
    # publisher found on the book page and leave the remaining fields nil
    # instead of failing on nil[].
    pp err_msg = "No bibliographical details at book: #{biblionet_id}"
    log.warn(err_msg)
    bibliographical_details = {}
  else
    book_hash[:publisher] = bibliographical_details[:publisher]
  end

  book_hash[:publication]       = bibliographical_details[:publication]

  book_hash[:format]            = bibliographical_details[:format]

  book_hash[:original_language] = bibliographical_details[:original_language]
  book_hash[:original_title]    = bibliographical_details[:original_title]

  book_hash[:price]             = bibliographical_details[:price]
  book_hash[:availability]      = bibliographical_details[:availability]
  book_hash[:last_update]       = bibliographical_details[:last_update]

  book_hash[:series]            = bibliographical_details[:series]

  physical_description_hash = {}
  physical_description_hash[:pages]      = details_hash[:pages]
  physical_description_hash[:size]       = bibliographical_details[:physical_size]
  physical_description_hash[:cover_type] = bibliographical_details[:cover_type]

  book_hash[:physical_description] = physical_description_hash

  return @book = book_hash
end

#load_and_extract_book(uri = nil) ⇒ Object



19
20
21
22
# File 'lib/bookshark/extractors/book_extractor.rb', line 19

# Loads the page found at +uri+ and extracts the book from it.
#
# @param uri [String, nil] location passed to load_page
# @return [Object, nil] whatever extract_book returns, or nil when no uri
#   was given or no page is available after loading
def load_and_extract_book(uri = nil)
  load_page(uri)
  return if uri.nil? || @page.nil?

  extract_book
end

#proccess_contributors(raw_contributors) ⇒ Object

Converts the parsed contributors into a hash. The input must have been processed into the following form: job1: contributor1, contributor2 job2: contributor3. The returned hash has the form: {job1 => ["contributor1", "contributor2"], job2 => ["contributor3"]}



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/bookshark/extractors/book_extractor.rb', line 28

# Groups a flat list of contributors by the job heading that precedes them.
#
# A String entry ending in ":" starts a new job (the colon is dropped and the
# remaining text becomes the key). Every other entry is added to the job
# currently in effect. Entries seen before any heading are filed under the
# :author symbol. A job only appears in the result once it has at least one
# contributor, and a repeated heading replaces the earlier list as soon as a
# new contributor follows it.
#
# @param raw_contributors [Array, nil] job headings and contributors, in order
# @return [Hash] job => array of contributors; empty for nil or empty input
def proccess_contributors(raw_contributors)
  by_job = {}
  return by_job if raw_contributors.nil? || raw_contributors.empty?

  current_job = :author
  members = []

  raw_contributors.each do |entry|
    if entry.is_a?(String) && entry.end_with?(":")
      # New heading: switch job and start collecting into a fresh list.
      current_job = entry[0..-2]
      members = []
    else
      members << entry
      by_job[current_job] = members
    end
  end

  by_job
end

#proccess_ddc(ddc, extract_parents = false) ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/bookshark/extractors/book_extractor.rb', line 96

# Splits a DDC label into its classification id and its plain-text name.
#
# The id is taken from every "[DDC: digits]" group in the label (digits may
# be separated by dots, pipes or spaces); the brackets, the "DDC:" prefix and
# spaces are stripped from it. The name is the label with every bracketed or
# parenthesised part removed and surrounding whitespace trimmed.
#
# @param ddc [String] label text, e.g. "Some category [DDC: 889.3]"
# @param extract_parents [Boolean] accepted but not used by this method
# @return [Hash] { ddc: id string ("" when no DDC group is present), name: text }
def proccess_ddc(ddc, extract_parents = false)
  # The whole "[DDC: ...]" group, brackets included.
  bracketed_id = /(\[DDC\:\s\d*(?:[\.|\s]\d*)*\])/

  # Anything in [...] or (...), together with the whitespace around it.
  annotations = /\s*(\[.*\]|\(.*\))\s*/

  {
    ddc: ddc.scan(bracketed_id).join.gsub(/[\[\]DDC: ]/, ''),
    name: ddc.gsub(annotations, '').strip
  }
end

#proccess_details(details) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/bookshark/extractors/book_extractor.rb', line 45

# Converts the raw detail entries of a book page into a hash.
#
# Each entry is passed through decode_text and then classified by its shape:
# a four digit year, a page count ending in "σελ.", an ISBN-13, an ISBN (or
# the "ISMN" typo variant), a bracketed status, a price starting with "Τιμή",
# or an award image tag. Entries that match none of these are reported with
# pp as a NoIdeaWhatThisIsError and skipped.
#
# @param details [#each, nil] raw detail entries (assumed to yield strings
#   once decoded); nil is treated as "no details"
# @return [Hash] the recognised values under :publication_year, :pages,
#   :isbn_13, :isbn, :status, :price and :awards (only keys that were found)
def proccess_details(details)
  details_hash = Hash.new

  # A page may have no details section at all; return an empty hash rather
  # than raising NoMethodError on nil.each.
  return details_hash if details.nil?

  # Loop-invariant patterns, built once instead of once per entry.
  date_regex = /(^\d{4}$)/
  status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/

  details.each do |detail|
    detail = decode_text(detail)

    begin
      if detail =~ date_regex
        details_hash[:publication_year] = detail
      elsif detail.end_with? "σελ."
        # Keep only the digits of the page count.
        details_hash[:pages] = detail.gsub(/[^\d]/, '')
      elsif detail.start_with? "ISBN-13"
        # Must be tested before the plain "ISBN" prefix below.
        details_hash[:isbn_13] = detail.gsub(/ISBN-13 /, "")
      elsif detail.start_with? "ISBN"
        details_hash[:isbn] = detail.gsub(/ISBN /, "")
      elsif detail =~ status_regex
        details_hash[:status] = detail.gsub(/\[|\]/, '')
      elsif detail.start_with? "Τιμή"
        # Keep only digits and the decimal comma.
        details_hash[:price] = detail.gsub(/[^\d,\d]/, '')
      elsif detail.start_with? '<img src="/images/award.jpg" border="0" title="Βραβείο">'
        details_hash[:awards] ||= []
        details_hash[:awards] << Sanitize.clean(detail).strip
      elsif detail.start_with? "ISMN" # Special typo case
        details_hash[:isbn] = detail.gsub(/ISMN /, "")
      else
        raise NoIdeaWhatThisIsError.new(@biblionet_id, detail)
      end
    rescue NoIdeaWhatThisIsError => e
      # Unknown entries are reported and skipped, not fatal.
      pp e
    end
  end

  return details_hash
end