Module: NewsCrawler::URLHelper

Included in: LinkSelector::SameDomainSelector, LinkSelector::SameDomainSelector, Processing::StructureAnalysis

Defined in: lib/news_crawler/url_helper.rb

Overview

Contains various methods for processing URLs.

Instance Method Summary collapse

#get_url_path(url) ⇒ Object

Splits a URL into 3 parts: scheme, domain and path. Returns a [ Hash ] containing the parts.
#same_domain?(url1, url2) ⇒ Boolean

Returns true if the 2 URLs belong to the same domain, or if either URL starts with ‘/’.

Instance Method Details

#get_url_path(url) ⇒ `Object`

Splits a URL into 3 parts: scheme, domain and path. Returns a [ Hash ] containing the parts.

Parameters:

url (String)

# File 'lib/news_crawler/url_helper.rb', line 46

# Split a URL into its scheme, domain and path components.
#
# Only the "http" and "https" schemes are recognised. The scheme is matched
# case-insensitively and returned in lowercase, so "HTTP://host/x" is parsed
# the same way as "http://host/x".
#
# The domain is everything between the optional scheme and the first "/";
# a port or user-info part, if present, is left inside the domain string.
# The path is everything from that first "/" to the end of the line.
#
# @param url [String] the URL to split
# @return [Hash] a hash with the keys :scheme, :domain and :path; the value
#   of any component that is absent from +url+ is nil
def get_url_path(url)
  # Every group is optional, so the pattern always matches a String at
  # offset 0 and +md+ is never nil here.
  pattern = %r{(?:(?<scheme>https?)://)?(?<domain>[^/]+)?(?<path>/.*)?}i
  md = pattern.match(url)
  scheme = md[:scheme]
  { :scheme => scheme && scheme.downcase,
    :domain => md[:domain],
    :path => md[:path] }
end

#same_domain?(url1, url2) ⇒ `Boolean`

Returns true if the 2 URLs belong to the same domain, or if either URL starts with ‘/’.

Parameters:

url1 (String) —

Url 1
url2 (String) —

Url 2

Returns:

(Boolean) —

true if both URLs belong to the same domain

# File 'lib/news_crawler/url_helper.rb', line 30

# Tell whether two URLs belong to the same domain.
#
# A URL that starts with "/" is relative to the current host, so the result
# is true as soon as either argument is such a URL. Otherwise the host names
# are compared label by label from the right (TLD first) over the labels
# both hosts have; a host is therefore considered to be in the same domain
# as any of its parent domains ("news.example.com" vs "example.com").
#
# Host names are compared case-insensitively and a trailing ":port" is
# ignored. A URL without any host part (for example an empty string) has no
# labels to contradict the other URL and is treated as same-domain.
#
# @param url1 [String] first URL
# @param url2 [String] second URL
# @return [Boolean] true if both URLs belong to the same domain
def same_domain?(url1, url2)
  return true if url1.start_with?('/') || url2.start_with?('/')

  labels1, labels2 = [url1, url2].map do |url|
    # +to_s+ guards against a missing host (nil domain); the port is dropped
    # because it is not part of the domain name.
    host = get_url_path(url)[:domain].to_s.downcase.sub(/:\d+\z/, '')
    host.split('.').reverse
  end

  # +zip+ pads the shorter list with nil, which matches any label.
  labels1.zip(labels2).all? { |left, right| right.nil? || left == right }
end