Class: Downer::DownloadStrategy::WebsiteStrategy
- Inherits:
-
GenericStrategy
- Object
- GenericStrategy
- Downer::DownloadStrategy::WebsiteStrategy
- Defined in:
- lib/downer/strategies/website_strategy.rb
Instance Method Summary collapse
-
#absolutify_link(link) ⇒ Object
Converts non-absolute URLs to absolute ones.
-
#document_links ⇒ Object
Return all links stored within the document.
-
#download_page ⇒ Object
Read an HTML page into memory.
-
#get_urls ⇒ Object
Retrieve urls from an HTML page.
-
#image_urls ⇒ Object
Return all image urls from document.
-
#initialize(url_source, search_options = {}) ⇒ WebsiteStrategy
constructor
Create the downloading strategy, set any behavior flags in the options hash.
- #source_valid? ⇒ Boolean
Methods inherited from GenericStrategy
Constructor Details
#initialize(url_source, search_options = {}) ⇒ WebsiteStrategy
Create the downloading strategy, set any behavior flags in the options hash
7 8 9 10 11 |
# File 'lib/downer/strategies/website_strategy.rb', line 7
#
# Create the downloading strategy, set any behavior flags in the options hash.
#
# @param url_source [String] URL of the page to scan; it must parse to a URI
#   with both a scheme and a host, because both are used to build the prefix
#   that absolutify_link prepends to relative links
# @param search_options [Hash] behavior flags, forwarded unchanged to the
#   GenericStrategy constructor (get_urls reads :images_only)
# @raise [URI::InvalidURIError] when url_source cannot be parsed
def initialize(url_source, search_options = {})
  super(url_source, search_options)
  uri = URI.parse(url_source)
  @host_prefix = uri.scheme + "://" + uri.host
  # Keep an explicit non-default port so that relative links resolve against
  # the same server the page was fetched from.
  @host_prefix += ":#{uri.port}" if uri.port && uri.port != uri.default_port
end
Instance Method Details
#absolutify_link(link) ⇒ Object
Converts non-absolute URLs to absolute ones
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/downer/strategies/website_strategy.rb', line 53
#
# Converts non-absolute URLs to absolute ones.
#
# @param link [String, nil] an href/src value taken from the document
# @return [String, nil] link unchanged when it already starts with an http,
#   https or ftp scheme; otherwise link appended to @host_prefix as a
#   host-root path; nil when link is nil
def absolutify_link(link)
  return nil if link.nil?

  # Anchor the scheme test to the start of the string so that relative paths
  # which merely contain "http" or "ftp" are not mistaken for absolute URLs.
  return link if link =~ %r{\A(?:https?|ftp)://}i

  # Every other link, including './' and '../' style references, is treated as
  # a path below the host root and needs exactly one leading slash.
  path = link.start_with?('/') ? link : "/#{link}"
  @host_prefix + path
end
#document_links ⇒ Object
Return all links stored within the document
43 44 45 46 47 48 49 50 |
# File 'lib/downer/strategies/website_strategy.rb', line 43
#
# Return all links stored within the document.
#
# Reads the href attribute of every <a> element in @noko (set by get_urls)
# and passes it through absolutify_link.
#
# @return [Array] one absolutified link per <a> element that has an href
def document_links
  # Anchors without an href (e.g. named anchors) yield nil here; drop them
  # instead of handing nil to absolutify_link.
  hrefs = @noko.css('a').map { |anchor| anchor['href'] }.compact
  hrefs.map { |href| absolutify_link(href) }
end
#download_page ⇒ Object
Read an HTML page into memory
29 30 31 |
# File 'lib/downer/strategies/website_strategy.rb', line 29
#
# Read an HTML page into memory.
#
# The handle is fetched on the first call and memoized in @downloaded_page;
# later calls return the same object without another request.
#
# @return [Object] the IO-like handle that open-uri returns for @url_source
# @raise [URI::InvalidURIError] when @url_source cannot be parsed
def download_page
  # URI#open (mixed in by open-uri for http/https/ftp URIs) only ever performs
  # a URL fetch. Kernel#open would run a shell command for a source starting
  # with "|" and no longer opens URLs on Ruby 3.0+.
  @downloaded_page ||= URI.parse(@url_source).open
end
#get_urls ⇒ Object
Retrieve urls from an HTML page. Behavior is dependent upon options passed to constructor
15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/downer/strategies/website_strategy.rb', line 15
#
# Retrieve urls from an HTML page. Behavior is dependent upon options passed
# to the constructor: with :images_only set, only image urls are returned;
# otherwise document links come first, followed by image urls.
#
# Parses the downloaded page into @noko as a side effect.
#
# @return [Array] the collected urls with duplicates removed
def get_urls
  page = download_page
  # download_page memoizes its handle, so a previous call may have left it at
  # end-of-stream; rewind so that every call parses the whole page.
  page.rewind if page.respond_to?(:rewind)
  @noko = Nokogiri::HTML(page)

  urls = @search_options[:images_only] ? image_urls : document_links + image_urls
  urls.uniq
end
#image_urls ⇒ Object
Return all image urls from document
34 35 36 37 38 39 40 |
# File 'lib/downer/strategies/website_strategy.rb', line 34
#
# Return all image urls from document.
#
# Reads the src attribute of every <img> element in @noko (set by get_urls)
# and passes it through absolutify_link.
#
# @return [Array] one absolutified url per <img> element that has a src
def image_urls
  # Images without a src attribute yield nil here; drop them instead of
  # handing nil to absolutify_link.
  sources = @noko.css('img').map { |img| img['src'] }.compact
  sources.map { |src| absolutify_link(src) }
end
#source_valid? ⇒ Boolean
69 70 71 |
# File 'lib/downer/strategies/website_strategy.rb', line 69
#
# Report whether @url_source can be parsed as a URI.
#
# @return [Boolean] true when URI.parse accepts @url_source, false when it
#   raises URI::InvalidURIError
def source_valid?
  URI.parse(@url_source)
  true
rescue URI::InvalidURIError
  # A predicate should answer false for an unparsable source rather than raise.
  false
end