Module: Spidr::Links
Instance Method Summary
-
#each_link {|link| ... } ⇒ Enumerator
Enumerates over every link in the page.
-
#each_meta_redirect {|link| ... } ⇒ Enumerator
Enumerates over the meta-redirect links in the page.
-
#each_redirect {|link| ... } ⇒ Enumerator
Enumerates over every HTTP or meta-redirect link in the page.
-
#each_url {|url| ... } ⇒ Enumerator
(also: #each)
Enumerates over every absolute URL in the page.
-
#links ⇒ Array<String>
The links from within the page.
-
#meta_redirect? ⇒ Boolean
Returns a boolean indicating whether or not page-level meta redirects are present in this page.
-
#meta_redirects ⇒ Array<String>
The meta-redirect links of the page.
-
#redirects_to ⇒ Array<String>
URLs that this document redirects to.
-
#to_absolute(link) ⇒ URI::HTTP
Normalizes and expands a given link into a proper URI.
-
#urls ⇒ Array<URI::HTTP>
Absolute URIs from within the page.
Instance Method Details
#each_link {|link| ... } ⇒ Enumerator
Enumerates over every link in the page.
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# File 'lib/spidr/links.rb', line 117
#
# Enumerates over every link in the page.
#
# Links are yielded in this order: the redirect links (only when
# `is_redirect?` is true), then, for HTML pages that have a parsed `doc`,
# the values of `a[href]`, `frame[src]`, `iframe[src]`, `link[href]` and
# `script[src]`. Values that are `nil` or empty are skipped.
#
# @yield [link] each non-blank link found in the page
# @yieldparam link [String] the raw (possibly relative) link
# @return [Enumerator] when no block is given
def each_link
  return enum_for(:each_link) unless block_given?

  # Forwards a link to the caller's block, dropping nil/empty values.
  emit = lambda { |link| yield link unless link.nil? || link.empty? }

  each_redirect(&emit) if is_redirect?

  return unless html? && doc

  # XPath query paired with the attribute that carries the link,
  # in the order the links are reported.
  link_sources = [
    ['//a[@href]',      'href'],
    ['//frame[@src]',   'src'],
    ['//iframe[@src]',  'src'],
    ['//link[@href]',   'href'],
    ['//script[@src]',  'src']
  ]

  link_sources.each do |xpath, attribute|
    doc.search(xpath).each do |node|
      emit.call(node.get_attribute(attribute))
    end
  end
end
#each_meta_redirect {|link| ... } ⇒ Enumerator
Enumerates over the meta-redirect links in the page.
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/spidr/links.rb', line 23
#
# Enumerates over the meta-redirect links in the page.
#
# Looks at every `<meta>` element that has both `http-equiv` and `content`
# attributes, keeps those whose `http-equiv` matches "refresh"
# (case-insensitively), and yields the target that follows `url=` at the
# end of the `content` value. The `url=` keyword is matched
# case-insensitively so that the common `URL=` spelling is recognised too.
#
# Nothing is yielded unless the page is HTML and has a parsed `doc`.
#
# @yield [link] each meta-redirect target
# @yieldparam link [String] the redirect target taken from `content`
# @return [Enumerator] when no block is given
def each_meta_redirect
  return enum_for(:each_meta_redirect) unless block_given?
  return unless html? && doc

  # NOTE(review): `search` is called without a receiver, so it is expected
  # to be provided by the including class - confirm it queries `doc`.
  search('//meta[@http-equiv and @content]').each do |node|
    next unless node.get_attribute('http-equiv') =~ /refresh/i

    content = node.get_attribute('content')

    if (redirect = content.match(/url=(\S+)$/i))
      yield redirect[1]
    end
  end
end
#each_redirect {|link| ... } ⇒ Enumerator
Enumerates over every HTTP or meta-redirect link in the page.
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/spidr/links.rb', line 76
#
# Enumerates over every HTTP or meta-redirect link in the page.
#
# The `location` header decides what is yielded:
# * no header   - falls back to the page-level meta redirects;
# * an Array    - every element is yielded;
# * otherwise   - the single header value is yielded.
#
# @yield [link] each redirect target
# @yieldparam link [String] a redirect link
# @return [Enumerator] when no block is given
def each_redirect(&block)
  return enum_for(:each_redirect) unless block

  location = headers['location']

  case location
  when nil
    # check page-level meta redirects if there isn't a location header
    each_meta_redirect(&block)
  when Array
    location.each(&block)
  else
    # usually the location header contains a single String
    yield location
  end
end
#each_url {|url| ... } ⇒ Enumerator Also known as: each
Enumerates over every absolute URL in the page.
174 175 176 177 178 179 180 181 182 |
# File 'lib/spidr/links.rb', line 174
#
# Enumerates over every absolute URL in the page.
#
# Each link from `each_link` is passed through `to_absolute`; links for
# which `to_absolute` returns a falsy value are skipped. The generated
# documentation lists `each` as an alias of this method.
#
# @yield [url] each absolute URL
# @yieldparam url [URI::HTTP] an absolute URL built from a page link
# @return [Enumerator] when no block is given
def each_url
  return enum_for(:each_url) unless block_given?

  each_link do |link|
    # named `absolute` so it does not shadow the page's own `url`
    absolute = to_absolute(link)
    yield absolute if absolute
  end
end
#links ⇒ Array<String>
The links from within the page.
156 157 158 |
# File 'lib/spidr/links.rb', line 156
#
# The links from within the page.
#
# @return [Array<String>] everything `each_link` enumerates, as an Array
def links
  each_link.to_a
end
#meta_redirect? ⇒ Boolean
Returns a boolean indicating whether or not page-level meta redirects are present in this page.
46 47 48 |
# File 'lib/spidr/links.rb', line 46
#
# Returns a boolean indicating whether or not page-level meta redirects
# are present in this page.
#
# @return [Boolean] `true` when `each_meta_redirect` has at least one
#   non-nil first element, `false` otherwise
def meta_redirect?
  !each_meta_redirect.first.nil?
end
#meta_redirects ⇒ Array<String>
The meta-redirect links of the page.
58 59 60 |
# File 'lib/spidr/links.rb', line 58
#
# The meta-redirect links of the page.
#
# @return [Array<String>] everything `each_meta_redirect` enumerates
def meta_redirects
  each_meta_redirect.to_a
end
#redirects_to ⇒ Array<String>
URLs that this document redirects to.
99 100 101 |
# File 'lib/spidr/links.rb', line 99
#
# URLs that this document redirects to.
#
# @return [Array<String>] everything `each_redirect` enumerates
def redirects_to
  each_redirect.to_a
end
#to_absolute(link) ⇒ URI::HTTP
Normalizes and expands a given link into a proper URI.
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
# File 'lib/spidr/links.rb', line 205
#
# Normalizes and expands a given link into a proper URI.
#
# The link is converted with `to_s` and merged onto the page's `url`.
# If merging raises a StandardError (e.g. the link cannot be parsed),
# `nil` is returned. Only StandardError is rescued, so signals and
# exit requests are no longer swallowed.
#
# @param link [#to_s] the link to normalize and expand
# @return [URI::HTTP, nil] the absolute URI, or `nil` when the link could
#   not be merged
def to_absolute(link)
  new_url = begin
    url.merge(link.to_s)
  rescue StandardError
    return nil
  end

  path = new_url.path
  return new_url unless path

  # ensure that paths begin with a leading '/' for URI::FTP
  path = "/#{path}" if new_url.scheme == 'ftp' && !path.start_with?('/')

  # make sure the path does not contain any .. or . directories,
  # since URI::Generic#merge cannot normalize paths such as
  # "/stuff/../"
  # NOTE(review): `URI.expand_path` is not part of Ruby's standard URI
  # library; it is presumably supplied by the project's URI extensions -
  # confirm it is loaded before this module is used.
  new_url.path = URI.expand_path(path)

  new_url
end
#urls ⇒ Array<URI::HTTP>
Absolute URIs from within the page.
192 193 194 |
# File 'lib/spidr/links.rb', line 192
#
# Absolute URIs from within the page.
#
# @return [Array<URI::HTTP>] everything `each_url` enumerates, as an Array
def urls
  each_url.to_a
end