Top Level Namespace
Defined Under Namespace
Modules: Biblionet, Bookshark, FileManager, Nlg
Constant Summary collapse
- DEFAULTS =
{ folder: 'storage/html_author_pages', base_url: 'http://www.biblionet.gr/author/', extension: '.html', first_id: 1, last_id: 112000, step: 1000 }
- FOLDER =
puts page
'html_dcc_pages'
- BASE_URL =
'http://www.biblionet.gr/index/'
- EXTENSION =
'.html'
Instance Method Summary collapse
Instance Method Details
#crawl_and_save(options = {}) ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/bookshark/crawlers/author_crawler.rb', line 17

# Downloads author pages one id at a time and saves each one to disk,
# grouping the saved files into one subfolder per batch of +:step+ ids.
#
# Caller-supplied options are merged over DEFAULTS, so any key that is
# omitted falls back to its default value.
#
# @param options [Hash] overrides for DEFAULTS
# @option options [String]  :folder    root folder the subfolders are created in
# @option options [String]  :base_url  URL prefix; the id and a trailing "/" are appended
# @option options [String]  :extension suffix appended to every saved file name
# @option options [Integer] :first_id  first id to download
# @option options [Integer] :last_id   upper bound for the last id of a batch
# @option options [Integer] :step      number of ids per batch / subfolder
# @return [Object] whatever the outer Integer#step call returns
def crawl_and_save(options = {})
  options = DEFAULTS.merge(options)

  # Each iteration is identified by the LAST id of its batch, so the first
  # batch ends at first_id + step - 1.
  start_id = options[:first_id] + options[:step] - 1
  last_id  = options[:last_id]
  step     = options[:step]

  start_id.step(last_id, step) do |last|
    first = last - step + 1
    # Integer division: with first_id == 1 this yields 0, 1, 2, ... per batch.
    subfolder = (last / step - 1).to_s
    path = "#{options[:folder]}/#{subfolder}/"

    # Create a new directory (does nothing if directory exists)
    FileUtils.mkdir_p path

    first.upto(last) do |id|
      file_to_save    = "#{path}author_#{id}#{options[:extension]}"
      url_to_download = "#{options[:base_url]}#{id}/"

      downloader = Biblionet::Core::Base.new(url_to_download)
      # Nothing is written when the downloader has no page for this id.
      downloader.save_page(file_to_save) unless downloader.page.nil?
    end
  end
end