Module: RedirectedTo

Includes:
UnescapeHtmlHelper
Included in:
BrilliantWebScraper
Defined in:
lib/parsers/redirected_to.rb

Overview

Fetches the latest URL of the given website, taken from the page's canonical link or og:url meta tag.

Instance Method Summary collapse

Instance Method Details

#grep_redirected_to_url(response) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/parsers/redirected_to.rb', line 7

# Looks for the URL a page declares for itself, via a canonical <link> tag or
# an og:url <meta> tag, and returns it after HTML-unescaping.
#
# The markup is scanned with four patterns in priority order:
#   1. <link> with rel="canonical" appearing before href
#   2. <link> with href appearing before rel="canonical"
#   3. <meta> with property="og:url" appearing before content
#   4. <meta> with content appearing before property="og:url"
# The captures of each pattern are handed to `parser`; the first pattern for
# which `parser` yields a non-nil value wins and later patterns are not tried.
#
# @param response [String, nil] markup to search
# @return [Object, nil] nil when response is nil or empty; otherwise whatever
#   `unescape_html` returns for the selected value (it is called with nil when
#   no pattern produced one)
# NOTE(review): `parser` and `unescape_html` are defined outside this method;
# their exact contracts should be confirmed against their definitions.
def grep_redirected_to_url(response)
  return if response.nil? || response.empty?

  link_rel_then_href = %r{(?im)<link\s+[\s\w="'-]*rel\s*=\s*(?:"|')canonical(?:"|')[\s\w='"-]*?\s+href\s*=\s*(?:"|')([^"']*)(?:"|')[\s\w='"-]*?(?:>|\/>)}
  link_href_then_rel = %r{(?im)<link\s+[\s\w='"-]*href\s*=\s*(?:"|')([^'"]*)(?:"|')[\s\w='"-]*?rel\s*=\s*(?:"|')\s*canonical\s*(?:"|')[\s\w='"-]*(?:>|\/>)}
  meta_property_then_content = %r{(?im)<meta\s+[\s\w="'-]*property=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w="'-]*content=\s*(?:'|")([^'"]*)(?:'|")[\s\w="'-]*(?:>|\/>)}
  meta_content_then_property = %r{(?im)<meta\s+[\s\w"'=-]*content\s*=\s*(?:'|")([^'"]*)(?:'|")[\s\w"'=-]*property\s*=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w"'=-]*(?:>|\/>)}

  ordered_regexes = [
    link_rel_then_href,
    link_href_then_rel,
    meta_property_then_content,
    meta_content_then_property
  ]

  # Lazy evaluation keeps the short-circuit: scanning stops at the first
  # pattern whose parsed result is non-nil.
  selected = ordered_regexes
             .lazy
             .map { |regex| parser(response.scan(regex).flatten) }
             .find { |candidate| !candidate.nil? }

  unescape_html(selected)
end