Class: NewsCrawler::LinkSelector::SameDomainSelector

Inherits:
Object
Extended by:
URLHelper
Includes:
Celluloid, CrawlerModule, URLHelper
Defined in:
lib/news_crawler/link_selector/same_domain_selector.rb

Overview

Selects all links from the same domain. The domain is taken from the database.

Class Method Summary

Instance Method Summary

Methods included from URLHelper

get_url_path, same_domain?

Methods included from CrawlerModule

#find_all, #find_one, #find_unprocessed, #load_yaml, #mark_all_as_unprocessed, #mark_processed, #mark_unprocessed, #next_unprocessed, #save_yaml

Constructor Details

#initialize(max_depth = -1, start_on_create = true) ⇒ SameDomainSelector

Creates a new selector. Selected URLs are put back into the URL queue.

Parameters:

  • max_depth (Fixnum) (defaults to: -1)

    maximum depth to crawl

  • start_on_create (Boolean) (defaults to: true)

    whether to start the selector immediately



# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 47

def initialize(max_depth = -1, start_on_create = true)
  @max_depth = max_depth
  @wait_time = 1
  @status    = :running
  @stopping  = false
  run if start_on_create
end
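
A minimal usage sketch (hypothetical; assumes the crawler's queue and database backends are available):

require 'news_crawler/link_selector/same_domain_selector'

# Crawl at most three levels deep, but defer the run loop.
selector = NewsCrawler::LinkSelector::SameDomainSelector.new(3, false)
selector.run # start processing when ready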

Class Method Details

.exclude?(url) ⇒ Boolean

Tests whether the given URL is excluded.

Parameters:

  • url (String)

Returns:

  • (Boolean)

    true if url is excluded, false otherwise



# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 112

def self.exclude?(url)
  config       = SimpleConfig.for :same_domain_selector
  exclude_list = []
  url_domain   = get_url_path(url)[:domain]
  begin
    exclude_group = config.exclude
  rescue NoMethodError # no exclude rules configured
    return false
  end

  unless exclude_group.nil?
    exclude_group.to_hash.keys.each do | url_e |
      if url_domain.to_s.end_with? url_e.to_s
        exclude_list = config.exclude.get(url_e)
        break
      end
    end
  end

  exclude_list = exclude_list.map do | elt |
    if /^\/.*\/$/ =~ elt
      Regexp.new(elt[1..-2])             # slash-delimited: already a regexp
    else
      Regexp.new("^(.*/)?#{elt}(/.*)?$") # plain string: match a whole path segment
    end
  end

  return false if exclude_list.empty?

  exclude_list.any? { | exclude_rule | exclude_rule =~ url }
end
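
For illustration, a hypothetical :same_domain_selector configuration and the rules exclude? derives from it (the domain key, patterns, and URL below are invented):

require 'simple_config'

SimpleConfig.for :same_domain_selector do
  group :exclude do
    # Keys are matched against the end of the URL's domain.
    set :'example.com', ['/\?page=\d+/', # slash-delimited entries are used as regexps
                         'tag']          # plain entries match one whole path segment
  end
end

NewsCrawler::LinkSelector::SameDomainSelector.exclude?('http://example.com/tag/ruby')
# => true ('tag' matches a full path segment)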

Instance Method Details

#extract_url(url) ⇒ Object

Extracts links from the fetched page and queues those on the same domain.



# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 56

def extract_url(url)
  doc      = RawData.find_by_url(url)
  html_doc = Nokogiri::HTML(doc)
  results  = []

  inner_url = html_doc.xpath('//a').collect { | a_el |
    temp_url = (a_el.attribute 'href').to_s
    # resolve root-relative links against the current page
    temp_url = URI.join(url, temp_url).to_s if temp_url[0] == '/'
    temp_url
  }

  # drop empty links, fragment-only links and javascript pseudo-links
  inner_url.delete_if { | u |
    u.empty? || u == '#' || u == 'javascript:;'
  }

  # queue links from the same domain, skipping excluded ones
  inner_url.each { | o_url |
    next unless same_domain?(o_url, url)
    if SameDomainSelector.exclude?(o_url)
      # TODO: log the excluded URL
      next
    end
    begin
      URLQueue.add(o_url, url)
      results << [o_url, url]
    rescue URLQueue::DuplicateURLError
      # already queued, ignore
    end
  }

  results
end
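
For reference, URI.join from the standard library is what resolves the root-relative links above (URLs are illustrative):

require 'uri'

URI.join('http://example.com/news/index.html', '/sports/today').to_s
# => "http://example.com/sports/today"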

#graceful_terminate ⇒ Object

Gracefully terminates this selector, blocking until the run loop stops.



# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 158

def graceful_terminate
  @stopping = true
  # block until the run loop finishes its current URL
  while @status == :running
    sleep(1)
  end
end
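
A hypothetical shutdown sequence (illustrative only):

selector = NewsCrawler::LinkSelector::SameDomainSelector.new
# ... the selector processes URLs ...
selector.graceful_terminate # returns once the current URL is finished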

#run ⇒ Object

Runs the selector loop: fetches the next unprocessed URL, extracts its links, and marks it processed, until graceful_terminate is called.


# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 90

def run
  @status = :running
  if @stopping || @max_depth == 0
    @status = :stopped
    return
  end
  until @stopping
    url = next_unprocessed(@max_depth - 1)
    while url.nil?
      wait_for_url
      url = next_unprocessed(@max_depth - 1)
    end
    NCLogger.get_logger.info "Processing #{url}"
    extract_url(url)
    mark_processed(url)
  end
  @status = :stopped # let graceful_terminate return
end