Module: NewsCrawler::URLHelper

Included in:
LinkSelector::SameDomainSelector, LinkSelector::SameDomainSelector, Processing::StructureAnalysis
Defined in:
lib/news_crawler/url_helper.rb

Overview

Contains various methods for processing URLs.

Instance Method Summary collapse

Instance Method Details

#get_url_path(url) ⇒ Object

Splits a URL into 3 parts: scheme, domain, and path. Returns a Hash containing those parts.

Parameters:

  • url (String)


46
47
48
49
50
51
52
# File 'lib/news_crawler/url_helper.rb', line 46

# Split a URL into its scheme, domain and path.
#
# Only +http+ and +https+ are recognised as a scheme, matched
# case-insensitively (so "HTTP://host" parses the same way as "http://host").
# For any other input, the text before the first '/' is reported as the
# domain and everything from that '/' onwards as the path.
#
# @param url [String] the URL to split
# @return [Hash] with keys +:scheme+, +:domain+ and +:path+; a part that is
#   absent from +url+ is +nil+. Captured text is returned as written
#   (no case normalisation).
def get_url_path(url)
  # Every group is optional, so the match always begins at the start of the
  # string; \A only makes that explicit.
  pattern = %r{\A(?:(?<scheme>https?)://)?(?<domain>[^/]+)?(?<path>/.*)?}i
  parts = pattern.match(url)
  {
    scheme: parts[:scheme],
    domain: parts[:domain],
    path: parts[:path]
  }
end

#same_domain?(url1, url2) ⇒ Boolean

Returns true if the 2 URLs belong to the same domain, or if either URL starts with ‘/’.

Parameters:

  • url1 (String)

    Url 1

  • url2 (String)

    Url 2

Returns:

  • (Boolean)

    true if both URLs belong to the same domain



30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/news_crawler/url_helper.rb', line 30

# Tell whether two URLs belong to the same domain.
#
# Domains are compared label by label starting from the right-most label
# (the TLD), ignoring letter case. When one domain has more labels than the
# other, the extra leading labels are ignored, so "news.example.com" and
# "example.com" count as the same domain.
#
# @param url1 [String] first URL
# @param url2 [String] second URL
# @return [Boolean] +true+ if either URL starts with '/', or if both domains
#   agree on every label they share; +false+ otherwise, including when
#   either URL has no domain part (e.g. "" or "http://")
def same_domain?(url1, url2)
  # A URL starting with '/' is treated as relative to the current site.
  # NOTE(review): protocol-relative URLs ("//host/path") also start with '/'
  # and are therefore reported as same-domain -- confirm this is intended.
  return true if url1.start_with?('/') || url2.start_with?('/')

  domain1 = get_url_path(url1)[:domain]
  domain2 = get_url_path(url2)[:domain]
  # Without a domain on both sides there is nothing to compare; answer
  # false rather than failing on nil.split.
  return false if domain1.nil? || domain2.nil?

  # Host names are case-insensitive, so normalise before comparing.
  labels1 = domain1.downcase.split('.').reverse
  labels2 = domain2.downcase.split('.').reverse

  # zip pads the second list with nil when it is shorter; a nil means that
  # domain has run out of labels, which counts as a match.
  labels1.zip(labels2).all? { |left, right| right.nil? || left == right }
end