Class: Biblionet::Extractors::Base

Inherits:
Object
  • Object
show all
Includes:
FileManager
Defined in:
lib/bookshark/extractors/base.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from FileManager

#list_directories, #list_files, #save_to

Constructor Details

#initialize(uri = nil) ⇒ Base

Initializes the Base class. Without arguments nothing happens. Otherwise loads a page by url or file.

Attributes

  • uri - It can be a url or a path/to/file.ext on local storage.



31
32
33
# File 'lib/bookshark/extractors/base.rb', line 31

# Builds the extractor and immediately hands +uri+ to load_page.
# With the default (nil) load_page is still invoked, receiving nil.
#
# ==== Attributes
#
# * +uri+ - A url or a path/to/file.ext on local storage (optional).
def initialize(uri = nil)
  load_page(uri)
end

Instance Attribute Details

#biblionet_id ⇒ Object (readonly)

Returns the value of attribute biblionet_id.



23
24
25
# File 'lib/bookshark/extractors/base.rb', line 23

# Reader for @biblionet_id.
# In this file the variable is assigned by load_page_from_file and
# load_page_from_url, which store the last run of digits found in the
# given path or url.
def biblionet_id
  @biblionet_id
end

#filepath ⇒ Object

Returns the value of attribute filepath.



23
24
25
# File 'lib/bookshark/extractors/base.rb', line 23

# Reader for @filepath.
# In this file the variable is assigned by load_page_from_file, which
# stores the path it was asked to read.
def filepath
  @filepath
end

#page ⇒ Object (readonly)

Returns the value of attribute page.



23
24
25
# File 'lib/bookshark/extractors/base.rb', line 23

# Reader for @page, the content of the most recently loaded page.
# load_page_from_file stores the file content as read; load_page_from_url
# stores the response body with every whitespace run collapsed to a single
# space, or nil when an EmptyPageError was rescued.
def page
  @page
end

#url ⇒ Object

Returns the value of attribute url.



23
24
25
# File 'lib/bookshark/extractors/base.rb', line 23

# Reader for @url.
# In this file the variable is assigned by load_page_from_url, which stores
# the url it was asked to download.
def url
  @url
end

Class Method Details

.decode_text(encoded_text) ⇒ Object



147
148
149
150
151
# File 'lib/bookshark/extractors/base.rb', line 147

# Decodes text with escaped html entities and returns the decoded text.
# A fresh HTMLEntities coder is created on every call.
#
# Params:
# +encoded_text+:: the text which contains encoded entities
def self.decode_text(encoded_text)
  HTMLEntities.new.decode(encoded_text)
end

Instance Method Details

#decode_text(encoded_text) ⇒ Object

Decodes text with escaped html entities and returns the decoded text.

Params:

encoded_text

the text which contains encoded entities



143
144
145
# File 'lib/bookshark/extractors/base.rb', line 143

# Decodes text with escaped html entities and returns the decoded text.
# Instance-level convenience: forwards to the class method of the same name
# on the receiver's class and returns its result unchanged.
#
# Params:
# +encoded_text+:: the text which contains encoded entities
def decode_text(encoded_text)
  self.class.decode_text(encoded_text)
end

#load_page(uri = nil) ⇒ Object

Loads a page from the web or from local file storage depending on passed argument.

Attributes

  • uri - It can be a url(starting with http/https) or a path/to/file.ext on local storage.



41
42
43
44
45
46
47
# File 'lib/bookshark/extractors/base.rb', line 41

# Loads a page from the web or from local file storage depending on the
# passed argument.
#
# A string that is, in its entirety, an http/https url is passed to
# load_page_from_url; anything else is passed to load_page_from_file.
# Returns the result of whichever loader ran, or nil when +uri+ is nil.
#
# ==== Attributes
#
# * +uri+ - A url (starting with http/https) or a path/to/file.ext on
#   local storage. When nil (the default) nothing is loaded.
def load_page(uri = nil)
  return if uri.nil?

  # URI.regexp is obsolete (it warns and forwards to this call), so build the
  # pattern through the default parser directly.
  http_url = /\A#{URI::DEFAULT_PARSER.make_regexp(%w[http https])}\z/

  if uri.match(http_url)
    load_page_from_url(uri)
  else
    load_page_from_file(uri)
  end
end

#load_page_from_file(filepath) ⇒ Object

Reads a page from the local file system.

Attributes

  • filepath - The path to target file which will be read.



96
97
98
99
100
101
102
103
104
# File 'lib/bookshark/extractors/base.rb', line 96

# Reads a page from the local file system.
#
# Stores the path in @filepath, the last run of digits found in the path in
# @biblionet_id, and the file content in @page. Returns the content.
# Any StandardError raised along the way is printed with puts and swallowed;
# in that case nil is returned and @page keeps whatever value it had before.
#
# ==== Attributes
#
# * +filepath+ - The path to target file which will be read.
def load_page_from_file(filepath)
  @filepath = filepath
  @biblionet_id = filepath[/\d+(?!.*\d+)/] unless filepath.nil?
  # File.read closes the handle once the content is read and, unlike
  # Kernel#open, never treats a path starting with "|" as a command to run.
  @page = File.read(filepath)
rescue StandardError => e
  puts e
end

#load_page_from_url(url) ⇒ Object

Downloads a page from the web.

Attributes

  • url - The url of webpage to download.



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/bookshark/extractors/base.rb', line 55

# Downloads a page from the web.
#
# Stores the url in @url, the last run of digits found in the url in
# @biblionet_id, and the response body (every whitespace run collapsed to a
# single space) in @page. A response whose reported content length is 1024
# bytes or less raises EmptyPageError internally, which is logged and leaves
# @page set to nil. Errno::ENOENT and OpenURI::HTTPError are logged and
# swallowed. Any other StandardError is logged and the download is retried
# after a 2 minute pause.
#
# ==== Attributes
#
# * +url+ - The url of webpage to download.
# * +max_retries+ - How many times a generic failure may be retried before
#   the error is re-raised to the caller. nil (the default) retries without
#   limit, which is the historical behaviour.
def load_page_from_url(url, max_retries: nil)
  failures = 0

  begin
    @url = url
    @biblionet_id = url[/\d+(?!.*\d+)/] unless url.nil? # id is expected to be the last number.

    # Called by open-uri with the Content-Length header value (nil when the
    # server did not send one) before the body is read.
    reject_tiny_pages = lambda do |content_length|
      raise EmptyPageError.new(url, content_length) unless content_length.nil? || content_length > 1024
    end

    # Kernel#open stopped fetching urls in Ruby 3.0; URI.open is the supported
    # entry point. Fall back to Kernel#open only where URI.open is not public
    # (older rubies), which keeps the previous behaviour there.
    opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)

    pp "Downloading page: #{url}"
    opener.call(url, :content_length_proc => reject_tiny_pages) do |f|
      @page = f.read.gsub(/\s+/, " ")
    end
  rescue Errno::ENOENT => e
    pp "Page: #{url} NOT FOUND."
    pp e
  rescue EmptyPageError => e
    pp "Page: #{url} is EMPTY."
    pp e
    @page = nil
  rescue OpenURI::HTTPError => e
    pp e
    pp e.io.status
  rescue StandardError => e
    failures += 1
    # Only give up when the caller opted into a retry limit.
    raise if max_retries && failures > max_retries

    pp "Generic error #{e.class}. Will wait for 2 minutes and then try again."
    pp e
    sleep(120)
    retry
  end
end

#present?(value) ⇒ Boolean

Returns:

  • (Boolean)


153
154
155
# File 'lib/bookshark/extractors/base.rb', line 153

# Tells whether +value+ holds something: true when it is neither nil nor
# empty, false otherwise. +value+ must respond to empty? unless it is nil.
#
# Returns:
# (Boolean)
def present?(value)
  !(value.nil? || value.empty?)
end

#save_page(path) ⇒ Object

Saves page to file.

Attributes

  • path - The path to file(including filename) where content will be saved.



132
133
134
135
# File 'lib/bookshark/extractors/base.rb', line 132

# Saves page to file.
#
# Hands +path+ and the current @page to save_to (listed on this page among
# the methods included from FileManager), then prints a "Saving page: ..."
# line with pp. Note the message is printed after save_to has returned.
#
# ==== Attributes
#
# * +path+ - The path to file (including filename) where content will be saved.
def save_page(path)
  save_to(path, @page)
  pp "Saving page: #{path}"
end