Class: NewsCrawler::LinkSelector::SameDomainSelector
- Inherits:
-
Object
- Object
- NewsCrawler::LinkSelector::SameDomainSelector
- Extended by:
- URLHelper
- Includes:
- Celluloid, CrawlerModule, URLHelper
- Defined in:
- lib/news_crawler/link_selector/same_domain_selector.rb
Overview
Selects all links from the same domain. The domain is obtained from the database.
Class Method Summary collapse
-
.exclude?(url) ⇒ Boolean
Test whether url is excluded.
Instance Method Summary collapse
-
#extract_url(url) ⇒ Object
Extracts URLs from the page.
-
#graceful_terminate ⇒ Object
Graceful terminate this selector.
-
#initialize(max_depth = -1, start_on_create = true) ⇒ SameDomainSelector
constructor
Creates a new selector. URLs it selects are put back into the queue.
- #run ⇒ Object
Methods included from URLHelper
Methods included from CrawlerModule
#find_all, #find_one, #find_unprocessed, #load_yaml, #mark_all_as_unprocessed, #mark_processed, #mark_unprocessed, #next_unprocessed, #save_yaml
Constructor Details
#initialize(max_depth = -1, start_on_create = true) ⇒ SameDomainSelector
Creates a new selector. URLs it selects are put back into the queue
47 48 49 50 51 52 53 |
# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 47

# Creates a new selector. URLs it selects are put back into the queue.
#
# @param max_depth       [Integer] maximum crawl depth; -1 means unlimited
# @param start_on_create [Boolean] when true, begin processing immediately
def initialize(max_depth = -1, start_on_create = true)
  @max_depth = max_depth
  @wait_time = 1
  @status    = :running
  @stoping   = false
  run if start_on_create
end
Class Method Details
.exclude?(url) ⇒ Boolean
Test whether url is excluded
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 112

# Tests whether the given URL is excluded by the :same_domain_selector
# configuration. The config's exclude section maps domain suffixes to
# lists of patterns; a pattern written as +/.../+ is used as a raw
# regexp, anything else must match a whole path segment of the URL.
#
# @param url [String] URL to test
# @return [Boolean] true if the URL matches an exclude rule
def self.exclude?(url)
  config = SimpleConfig.for :same_domain_selector
  url_domain = get_url_path(url)[:domain]

  begin
    exclude_group = config.exclude
  rescue NoMethodError
    # No exclude section configured: nothing can be excluded.
    return false
  end

  # Pick the pattern list whose key is a suffix of the URL's domain.
  exclude_list = []
  unless exclude_group.nil?
    exclude_group.to_hash.keys.each do |url_e|
      if url_domain.to_s.end_with? url_e.to_s
        exclude_list = config.exclude.get(url_e)
        break
      end
    end
  end

  exclude_list = exclude_list.map do |elt|
    if /^\/.*\/$/ =~ elt
      Regexp.new(elt[1..-2]) # already a regexp literal: strip the slashes
    else
      # Match elt as a complete path segment anywhere in the URL.
      Regexp.new("^(.*/)?#{elt}(/.*)?$")
    end
  end

  return false if exclude_list.empty?
  exclude_list.any? { |exclude_rule| exclude_rule =~ url }
end
Instance Method Details
#extract_url(url) ⇒ Object
Extract url from page
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 56

# Extracts links from the stored page for +url+, queues every
# same-domain, non-excluded link for crawling.
#
# @param url [String] URL whose fetched document is scanned for links
def extract_url(url)
  doc = RawData.find_by_url(url)
  html_doc = Nokogiri::HTML(doc)
  results = []

  inner_url = html_doc.xpath('//a').collect do |a_el|
    temp_url = (a_el.attribute 'href').to_s
    if (!temp_url.nil?) && (temp_url[0] == '/')
      # Relative link: resolve against the page URL.
      temp_url = URI.join(url, temp_url).to_s
    end
    temp_url
  end

  # Drop empty / anchor-only / javascript pseudo-links.
  # (Block parameter renamed: it used to shadow the method's +url+ argument.)
  inner_url.delete_if do |i_url|
    i_url.nil? || i_url.size == 0 || i_url == '#' || i_url == 'javascript:;'
  end

  # select url from same domain
  inner_url.select do |o_url|
    if same_domain?(o_url, url)
      if !SameDomainSelector.exclude?(o_url)
        begin
          URLQueue.add(o_url, url)
          results << [o_url, url]
        rescue URLQueue::DuplicateURLError
          # Already queued — best-effort insert, ignore duplicates.
        end
      else
        # TODO: log excluded URL here
      end
    end
  end
end
#graceful_terminate ⇒ Object
Graceful terminate this selector
158 159 160 161 162 163 |
# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 158

# Gracefully terminates this selector: requests a stop, then blocks
# (polling once per second) until the worker loop stops running.
def graceful_terminate
  @stoping = true
  sleep(1) while @status == :running
end
#run ⇒ Object
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/news_crawler/link_selector/same_domain_selector.rb', line 90

# Main worker loop: repeatedly takes the next unprocessed URL, extracts
# its links, and marks it processed, until a stop is requested.
def run
  @status = :running
  if @stoping
    # Stop was requested before we started; report :stopped so
    # graceful_terminate does not wait forever.
    @status = :stopped
    return
  end
  if @max_depth == 0
    @status = :stopped
    return
  end
  while !@stoping
    url = next_unprocessed(@max_depth - 1)
    # Wait for work, but stay responsive to a stop request.
    while url.nil? && !@stoping
      wait_for_url
      url = next_unprocessed(@max_depth - 1)
    end
    break if url.nil?
    NCLogger.get_logger.info "Processing #{url}"
    extract_url(url)
    mark_processed(url)
  end
  # BUG FIX: previously @status was left :running on loop exit, which
  # made graceful_terminate spin forever.
  @status = :stopped
end