Class: Downer::DownloadStrategy::WebsiteStrategy

Inherits:
GenericStrategy show all
Defined in:
lib/downer/strategies/website_strategy.rb

Instance Method Summary collapse

Methods inherited from GenericStrategy

#options, #source_type

Constructor Details

#initialize(url_source, search_options = {}) ⇒ WebsiteStrategy

Create the downloading strategy, set any behavior flags in the options hash



7
8
9
10
11
# File 'lib/downer/strategies/website_strategy.rb', line 7

# Create the downloading strategy. Both arguments are handed unchanged to the
# parent strategy's constructor; this class additionally records the
# scheme/host(/port) prefix of the page for rebuilding site-relative links.
#
# @param url_source [String] URL of the page to scan; it is parsed with
#   URI.parse and must carry a scheme and a host
# @param search_options [Hash] behavior flags forwarded to the parent
#   strategy (get_urls reads :images_only from them)
# @raise [URI::InvalidURIError] if url_source cannot be parsed
def initialize(url_source, search_options = {})
  super(url_source, search_options)
  uri = URI.parse(url_source)
  # String concatenation is kept on purpose: a URL without scheme or host
  # still fails loudly here instead of producing a bogus "://" prefix.
  prefix = uri.scheme + '://' + uri.host
  # Keep an explicit non-default port (e.g. http://localhost:3000) so links
  # rebuilt from @host_prefix keep pointing at the same server.
  prefix += ":#{uri.port}" if uri.port && uri.port != uri.default_port
  @host_prefix = prefix
end

Instance Method Details

Converts non-absolute URLs to absolute ones



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/downer/strategies/website_strategy.rb', line 53

# Converts a link taken from the page into an absolute URL.
#
# Links that already start with an http://, https:// or ftp:// scheme are
# returned untouched. Every other link is rooted at @host_prefix: a leading
# '/' is added when missing, which also covers relative references such as
# './x' or '../x' (the dot segments themselves are left in place).
#
# @param link [String, nil] raw href/src attribute value
# @return [String, nil] the absolute URL, or nil when link is nil
def absolutify_link(link)
  # Elements without the attribute yield nil; there is nothing to resolve.
  return nil if link.nil?

  # Anchor the scheme test at the start so that paths which merely contain
  # "http" or "ftp" (e.g. "/tools/ftp-client.html") are not mistaken for
  # absolute URLs.
  return link if link =~ %r{\A(?:https?|ftp)://}i

  path = link[0, 1] == '/' ? link : "/#{link}"
  @host_prefix + path
end

Return all links stored within the document



43
44
45
46
47
48
49
50
# File 'lib/downer/strategies/website_strategy.rb', line 43

# Return all links stored within the document.
#
# Reads the href attribute of every <a> element in @noko and converts it
# with absolutify_link. Anchors without an href attribute (e.g. named
# anchors) are skipped, because they carry no link to resolve.
#
# @return [Array] one converted URL per <a> element that has an href
def document_links
  hrefs = @noko.css('a').map { |anchor| anchor['href'] }.compact
  hrefs.map { |href| absolutify_link(href) }
end

#download_page ⇒ Object

Read an HTML page into memory



29
30
31
# File 'lib/downer/strategies/website_strategy.rb', line 29

# Read the HTML page at @url_source into memory.
#
# The opened page is memoized in @downloaded_page, so repeated calls reuse
# the first result instead of fetching again.
#
# @return [Object] whatever the open call yields for @url_source
def download_page
  @downloaded_page ||=
    if URI.respond_to?(:open)
      # Kernel#open on a URL string was deprecated in Ruby 2.7 and removed in
      # 3.0, and it runs strings starting with "|" as shell commands.
      # open-uri's URI.open (Ruby 2.5+) has neither problem.
      URI.open(@url_source)
    else
      # Older Rubies: open-uri only patches Kernel#open.
      open(@url_source)
    end
end

#get_urls ⇒ Object

Retrieve urls from an HTML page. Behavior is dependent upon options passed to constructor



15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/downer/strategies/website_strategy.rb', line 15

# Retrieve the URLs found on the downloaded page.
#
# The page is parsed into @noko first. When the :images_only search option
# is set only image URLs are collected; otherwise document links come first,
# followed by image URLs. Duplicates are removed from the result.
def get_urls
  @noko = Nokogiri::HTML(download_page)

  collected =
    if @search_options[:images_only]
      image_urls
    else
      document_links + image_urls
    end

  collected.uniq
end

#image_urls ⇒ Object

Return all image urls from document



34
35
36
37
38
39
40
# File 'lib/downer/strategies/website_strategy.rb', line 34

# Return all image URLs from the document.
#
# Reads the src attribute of every <img> element in @noko and converts it
# with absolutify_link. Images without a src attribute are skipped, because
# there is no URL to resolve for them.
#
# @return [Array] one converted URL per <img> element that has a src
def image_urls
  sources = @noko.css('img').map { |img| img['src'] }.compact
  sources.map { |src| absolutify_link(src) }
end

#source_valid? ⇒ Boolean

Returns:

  • (Boolean)


69
70
71
# File 'lib/downer/strategies/website_strategy.rb', line 69

# Tell whether @url_source can be parsed as a URI.
#
# Answers with a real boolean, as a predicate should: a parse failure is
# reported as false rather than surfacing as URI::InvalidURIError.
#
# @return [Boolean] true when URI.parse accepts @url_source, false otherwise
def source_valid?
  URI.parse(@url_source)
  true
rescue URI::InvalidURIError
  false
end