Module: RedirectedTo
- Includes:
- UnescapeHtmlHelper
- Included in:
- BrilliantWebScraper
- Defined in:
- lib/parsers/redirected_to.rb
Overview
Fetches the latest URL of the given website, as declared by the page's canonical link or og:url meta tag.
Instance Method Summary
Instance Method Details
#grep_redirected_to_url(response) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/parsers/redirected_to.rb', line 7
#
# Searches an HTML document for the URL the page declares for itself.
#
# Four case-insensitive patterns are tried in a fixed order:
#   1. <link rel="canonical" ... href="...">   (rel before href)
#   2. <link href="..." ... rel="canonical">   (href before rel)
#   3. <meta property="og:url" content="...">  (property before content)
#   4. <meta content="..." property="og:url">  (content before property)
# Every capture of one pattern is handed to `parser` (defined outside this
# method); the first pattern for which `parser` answers with a non-nil value
# decides the result, and later patterns are not evaluated.
#
# @param response [String, nil] markup to search; presumably the fetched
#   HTML body -- only `nil?`, `empty?` and `scan` are called on it here
# @return [nil] when +response+ is nil or empty
# @return [Object] otherwise whatever `unescape_html` returns for the chosen
#   URL; note that it is invoked with nil when no pattern yielded a URL
def grep_redirected_to_url(response)
  return if response.nil? || response.empty?

  canonical_rel_then_href = %r{(?im)<link\s+[\s\w="'-]*rel\s*=\s*(?:"|')canonical(?:"|')[\s\w='"-]*?\s+href\s*=\s*(?:"|')([^"']*)(?:"|')[\s\w='"-]*?(?:>|\/>)}
  canonical_href_then_rel = %r{(?im)<link\s+[\s\w='"-]*href\s*=\s*(?:"|')([^'"]*)(?:"|')[\s\w='"-]*?rel\s*=\s*(?:"|')\s*canonical\s*(?:"|')[\s\w='"-]*(?:>|\/>)}
  og_property_then_content = %r{(?im)<meta\s+[\s\w="'-]*property=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w="'-]*content=\s*(?:'|")([^'"]*)(?:'|")[\s\w="'-]*(?:>|\/>)}
  og_content_then_property = %r{(?im)<meta\s+[\s\w"'=-]*content\s*=\s*(?:'|")([^'"]*)(?:'|")[\s\w"'=-]*property\s*=\s*(?:'|")\s*og:url\s*(?:'|")[\s\w"'=-]*(?:>|\/>)}

  [
    canonical_rel_then_href,
    canonical_href_then_rel,
    og_property_then_content,
    og_content_then_property
  ].each do |matcher|
    # scan yields one single-element array per match; flatten to a plain list
    captured = response.scan(matcher).flatten
    candidate = parser(captured)
    return unescape_html(candidate) unless candidate.nil?
  end

  # No pattern produced a URL: the helper still receives nil, as before.
  unescape_html(nil)
end