Module: Spidr::Links

Includes:
Enumerable
Included in:
Page
Defined in:
lib/spidr/links.rb

Instance Method Summary

Instance Method Details

#each_link {|link| ... } ⇒ Enumerator

Enumerates over every link in the page.

Yields:

  • (link)

    The given block will be passed every non-empty link in the page.

Yield Parameters:

  • link (String)

    A link in the page.

Returns:

  • (Enumerator)

    If no block is given, an enumerator object will be returned.

Since:

  • 0.3.0



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/spidr/links.rb', line 117

def each_link
  return enum_for(:each_link) unless block_given?

  filter = lambda { |url|
    yield url unless (url.nil? || url.empty?)
  }

  each_redirect(&filter) if is_redirect?

  if (html? && doc)
    doc.search('//a[@href]').each do |a|
      filter.call(a.get_attribute('href'))
    end

    doc.search('//frame[@src]').each do |iframe|
      filter.call(iframe.get_attribute('src'))
    end

    doc.search('//iframe[@src]').each do |iframe|
      filter.call(iframe.get_attribute('src'))
    end

    doc.search('//link[@href]').each do |link|
      filter.call(link.get_attribute('href'))
    end

    doc.search('//script[@src]').each do |script|
      filter.call(script.get_attribute('src'))
    end
  end
end

#each_meta_redirect {|link| ... } ⇒ Enumerator

Enumerates over the meta-redirect links in the page.

Yields:

  • (link)

    If a block is given, it will be passed every meta-redirect link from the page.

Yield Parameters:

  • link (String)

    A meta-redirect link from the page.

Returns:

  • (Enumerator)

    If no block is given, an enumerator object will be returned.

Since:

  • 0.3.0



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/spidr/links.rb', line 23

def each_meta_redirect
  return enum_for(:each_meta_redirect) unless block_given?

  if (html? && doc)
    search('//meta[@http-equiv and @content]').each do |node|
      if node.get_attribute('http-equiv') =~ /refresh/i
        content = node.get_attribute('content')

        if (redirect = content.match(/url=(\S+)$/))
          yield redirect[1]
        end
      end
    end
  end
end

#each_redirect {|link| ... } ⇒ Enumerator

Enumerates over every HTTP or meta-redirect link in the page.

Yields:

  • (link)

    The given block will be passed every redirection link from the page.

Yield Parameters:

  • link (String)

    An HTTP or meta-redirect link from the page.

Returns:

  • (Enumerator)

    If no block is given, an enumerator object will be returned.

Since:

  • 0.3.0



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/spidr/links.rb', line 76

# Enumerates over every HTTP or meta-redirect link in the page.
#
# The `location` header takes precedence: meta redirects are only
# consulted when that header is absent.
#
# @yield [link]
#   The given block will be passed every redirection link from the page.
#
# @yieldparam [String] link
#   An HTTP or meta-redirect link from the page.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.3.0
#
def each_redirect(&block)
  return enum_for(:each_redirect) unless block

  target = headers['location']

  case target
  when nil
    # no location header: fall back to page-level meta redirects
    each_meta_redirect(&block)
  when Array
    target.each(&block)
  else
    # the common case: the location header holds a single String
    yield target
  end
end

#each_url {|url| ... } ⇒ Enumerator Also known as: each

Enumerates over every absolute URL in the page.

Yields:

  • (url)

    The given block will be passed every URL in the page.

Yield Parameters:

  • url (URI::HTTP)

    An absolute URL in the page.

Returns:

  • (Enumerator)

    If no block is given, an enumerator object will be returned.

Since:

  • 0.3.0



174
175
176
177
178
179
180
181
182
# File 'lib/spidr/links.rb', line 174

# Enumerates over every absolute URL in the page.
#
# Every link from `each_link` is passed through `to_absolute`; links for
# which that returns a falsy value are silently skipped.
#
# @yield [url]
#   The given block will be passed every URL in the page.
#
# @yieldparam [URI::HTTP] url
#   An absolute URL in the page.
#
# @return [Enumerator]
#   If no block is given, an enumerator object will be returned.
#
# @since 0.3.0
#
def each_url
  return enum_for(:each_url) unless block_given?

  each_link do |link|
    absolute = to_absolute(link)

    yield absolute if absolute
  end
end

#links ⇒ Array<String>

The links from within the page.

Returns:

  • (Array<String>)

    All links within the HTML page, frame/iframe source URLs and any links in the Location header.



156
157
158
# File 'lib/spidr/links.rb', line 156

# The links from within the page.
#
# @return [Array<String>]
#   Everything `each_link` enumerates, collected into an Array.
#
def links
  each_link.entries
end

#meta_redirect? ⇒ Boolean

Returns a boolean indicating whether or not page-level meta redirects are present in this page.

Returns:

  • (Boolean)

    Specifies whether the page includes page-level redirects.



46
47
48
# File 'lib/spidr/links.rb', line 46

# Indicates whether page-level meta redirects are present in this page.
#
# @return [Boolean]
#   `true` when the first value enumerated by `each_meta_redirect` is
#   not `nil`, `false` otherwise.
#
def meta_redirect?
  first_redirect = each_meta_redirect.first

  !first_redirect.nil?
end

#meta_redirects ⇒ Array<String>

The meta-redirect links of the page.

Returns:

  • (Array<String>)

    All meta-redirect links in the page.

Since:

  • 0.3.0



58
59
60
# File 'lib/spidr/links.rb', line 58

# The meta-redirect links of the page.
#
# @return [Array<String>]
#   Everything `each_meta_redirect` enumerates, collected into an Array.
#
# @since 0.3.0
#
def meta_redirects
  each_meta_redirect.entries
end

#redirects_to ⇒ Array<String>

URLs that this document redirects to.

Returns:

  • (Array<String>)

    The links that this page redirects to (usually found in a location header or by way of a page-level meta redirect).



99
100
101
# File 'lib/spidr/links.rb', line 99

# URLs that this document redirects to.
#
# @return [Array<String>]
#   Everything `each_redirect` enumerates, collected into an Array.
#
def redirects_to
  each_redirect.entries
end

#to_absolute(link) ⇒ URI::HTTP

Normalizes and expands a given link into a proper URI.

Parameters:

  • link (String)

    The link to normalize and expand.

Returns:

  • (URI::HTTP)

    The normalized URI, or nil if the link could not be merged with the page's URL.



205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/spidr/links.rb', line 205

# Normalizes and expands a given link into a proper URI.
#
# The link is merged with the page's `url`. Ordinary failures while
# merging (for example a malformed link) produce `nil`; only
# `StandardError` is rescued, so interrupts and exit requests still
# propagate to the caller.
#
# @param [String] link
#   The link to normalize and expand.
#
# @return [URI::HTTP, nil]
#   The normalized URI, or `nil` if the link could not be merged with
#   the page's URL.
#
def to_absolute(link)
  begin
    new_url = url.merge(link.to_s)
  rescue StandardError
    # unmergeable link: report "no URL" instead of raising
    return nil
  end

  path = new_url.path

  if path
    # ensure that paths begin with a leading '/' for URI::FTP
    path = "/#{path}" if new_url.scheme == 'ftp' && path[0, 1] != '/'

    # make sure the path does not contain any .. or . directories,
    # since URI::Generic#merge cannot normalize paths such as
    # "/stuff/../"
    new_url.path = URI.expand_path(path)
  end

  new_url
end

#urls ⇒ Array<URI::HTTP>

Absolute URIs from within the page.

Returns:

  • (Array<URI::HTTP>)

    The links from within the page, converted to absolute URIs.



192
193
194
# File 'lib/spidr/links.rb', line 192

# Absolute URIs from within the page.
#
# @return [Array<URI::HTTP>]
#   Everything `each_url` enumerates, collected into an Array.
#
def urls
  each_url.entries
end