Top Level Namespace

Defined Under Namespace

Modules: Biblionet, Bookshark, FileManager, Nlg

Constant Summary collapse

DEFAULTS =
{
  folder: 'storage/html_author_pages',
  base_url: 'http://www.biblionet.gr/author/',
  extension: '.html',
  first_id: 1,
  last_id: 112000,
  step: 1000
}
FOLDER =

puts page

'html_dcc_pages'
BASE_URL =
'http://www.biblionet.gr/index/'
EXTENSION =
'.html'

Instance Method Summary collapse

Instance Method Details

#crawl_and_save(options = {}) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/bookshark/crawlers/author_crawler.rb', line 17

# Downloads a range of author pages and stores them on disk, grouped into
# numbered subfolders that each hold one batch of +:step+ consecutive ids.
#
# The given options are merged over DEFAULTS, so any key may be omitted.
# Ids are processed in batches starting at +:first_id+; the final batch is
# truncated at +:last_id+ so that a range which is not an exact multiple of
# +:step+ is still crawled completely.
#
# @param options [Hash] overrides for DEFAULTS
# @option options [String]  :folder    root directory for the saved pages
# @option options [String]  :base_url  URL prefix; the id and a trailing
#   slash are appended to it
# @option options [String]  :extension file extension of the saved pages
# @option options [Integer] :first_id  first id to download (inclusive)
# @option options [Integer] :last_id   last id to download (inclusive)
# @option options [Integer] :step      number of ids per subfolder
# @raise [ArgumentError] if +:step+ is not a positive number
# @return [Object] not meaningful to callers
def crawl_and_save(options = {})
  options  = DEFAULTS.merge(options)

  first_id = options[:first_id]
  last_id  = options[:last_id]
  step     = options[:step]

  # A zero or negative batch size cannot describe a forward crawl.
  raise ArgumentError, 'step must be a positive integer' unless step.positive?

  first_id.step(last_id, step) do |first|
    # Nominal end of this batch; it names the subfolder even when the
    # batch itself is cut short by last_id, keeping the folder numbering
    # identical to that of full batches.
    batch_end = first + step - 1
    subfolder = (batch_end / step - 1).to_s
    path      = "#{options[:folder]}/#{subfolder}/"

    # Create a new directory (does nothing if directory exists)
    FileUtils.mkdir_p(path)

    # Never run past last_id, so a trailing partial batch is still fetched.
    first.upto([batch_end, last_id].min) do |id|
      file_to_save    = "#{path}author_#{id}#{options[:extension]}"
      url_to_download = "#{options[:base_url]}#{id}/"

      downloader = Biblionet::Core::Base.new(url_to_download)
      # NOTE(review): a nil page presumably means the download failed or the
      # id does not exist -- confirm against Biblionet::Core::Base.
      downloader.save_page(file_to_save) unless downloader.page.nil?
    end
  end
end