Class: RailsSpider::Fetcher
- Inherits:
-
Object
- Object
- RailsSpider::Fetcher
- Defined in:
- lib/rails_spider/fetchers/base.rb
Direct Known Subclasses
Instance Method Summary collapse
- #change_another_proxy(proxy_hash = nil, header_hash = nil) ⇒ Object
- #create_event(event_hash) ⇒ Object
- #event_class ⇒ Object
- #grab_update ⇒ Object
-
#initialize ⇒ Fetcher
constructor
A new instance of Fetcher.
- #is_existed?(event_hash) ⇒ Boolean
- #is_grab?(url) ⇒ Boolean
-
#keep_on? ⇒ Boolean
Returns whether the spider should keep on grabbing.
- #page_by_url(url, proxy_hash = nil, header_hash = nil, repeat = 5) ⇒ Object
- #run ⇒ Object
- #save_page(page) ⇒ Object
Constructor Details
#initialize ⇒ Fetcher
Returns a new instance of Fetcher.
# File 'lib/rails_spider/fetchers/base.rb', line 4
# Returns a new instance of Fetcher with an empty page buffer.
def initialize
  @page = ''
end
Instance Method Details
#change_another_proxy(proxy_hash = nil, header_hash = nil) ⇒ Object
# File 'lib/rails_spider/fetchers/base.rb', line 59
# Points @mechanize at a proxy. Uses the proxy given in +proxy_hash+ when it
# contains both :ip and :port; otherwise picks a random entry from @proxy.
# When +header_hash+ is given, it replaces the Mechanize request headers.
def change_another_proxy(proxy_hash = nil, header_hash = nil)
  if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
    ip = proxy_hash[:ip]
    port = proxy_hash[:port]
  else
    chosen = @proxy[rand(@proxy.size)]
    ip = chosen[:ip]
    port = chosen[:port]
  end
  @mechanize.set_proxy ip, port
  @mechanize.request_headers = header_hash unless header_hash.nil?
end
#create_event(event_hash) ⇒ Object
# File 'lib/rails_spider/fetchers/base.rb', line 99
# Builds and persists an Event from the scraped attribute hash.
# Skips blank hashes and hashes that #is_existed? reports as duplicates.
# Logs the outcome (validation messages on failure, success otherwise).
def create_event(event_hash)
  if event_hash.blank?
    logger.warn "Cann't create event by blank data"
    return
  end
  if is_existed?(event_hash)
    logger.warn "Paramter:#{event_hash} has been existed cann't to create"
    return
  end

  event = Event.new(event_hash)
  # An event without a venue is created in a disabled state.
  event.status = -1 if event_hash[:place].blank?
  event.kind_id = Kind.find_or_create_by(name: event_hash[:kind]).id unless event_hash[:kind].blank?
  event.subkind_id = set_subkind_id(event_hash[:subkind]) unless event_hash[:subkind].blank?
  if event_hash[:tags]
    # NOTE(review): +event+ has not been saved yet, so event.id is nil for a
    # new record and these EventTag rows get a nil event_id.
    # TODO: move tag creation after event.save.
    event_hash[:tags].each do |t|
      EventTag.create(event_id: event.id, tag_id: Tag.find_or_create_by(name: t).id)
    end
  end
  # Hand-rolled auto-increment; query the max once instead of twice.
  max_int_id = Event.max(:int_id)
  event.int_id = max_int_id.blank? ? 1 : max_int_id + 1
  event.save
  if event.errors.blank?
    logger.info 'Save event success'
  else
    # Original had a syntax slip here (`event.errors..join(' / ')`);
    # log the human-readable validation messages instead.
    logger.info event.errors.full_messages.join(' / ')
  end
end
#event_class ⇒ Object
# File 'lib/rails_spider/fetchers/base.rb', line 8
# The model class events are persisted as, resolved from the spider
# configuration. Memoized: the original assigned @event_class on every call
# and re-ran +constantize+ each time, which the ivar suggests was not intended.
def event_class
  @event_class ||= EventSpider.config.event_class.constantize
end
#grab_update ⇒ Object
# File 'lib/rails_spider/fetchers/base.rb', line 88
# Incremental crawl: walks @newlinks, remembering each link's city (when
# present) in @city, and grabs its listing page.
def grab_update
  logger.info "Start #{self.class} Spider grab_update."
  @newlinks.each do |entry|
    city = entry['city']
    @city = city if city.present?
    grab_list_link(entry['url'])
  end
  logger.info "End of #{self.class} Spider grab_update."
end
#is_existed?(event_hash) ⇒ Boolean
# File 'lib/rails_spider/fetchers/base.rb', line 128
# Whether an event matching +event_hash+ already exists, judged by URL only.
# (Checks by :event_id and by :title were disabled in the original;
#  TODO remains: treat same title + city as a duplicate.)
def is_existed?(event_hash)
  url = event_hash[:url]
  return false unless url
  event = event_class.where(url: url).first
  return false unless event
  logger.warn "#{event_hash[:url]} has been exist in #{event.id}"
  true
end
#is_grab?(url) ⇒ Boolean
# File 'lib/rails_spider/fetchers/base.rb', line 73
# Whether an event with this URL is already stored.
# NOTE(review): the original inline comment said "表示没有抓取"
# ("means not grabbed"), which contradicts the predicate returning true
# when a record exists — confirm the intended polarity with callers.
def is_grab?(url)
  event_class.where(url: url).exists?
end
#keep_on? ⇒ Boolean
keep on grab?
# File 'lib/rails_spider/fetchers/base.rb', line 143
# keep on grab? Base implementation always answers true.
def keep_on?
  true
end
#page_by_url(url, proxy_hash = nil, header_hash = nil, repeat = 5) ⇒ Object
# File 'lib/rails_spider/fetchers/base.rb', line 12
# Fetches +url+ through Mechanize, rotating to a fresh proxy first.
# On error it retries up to +repeat+ times; after that it records the
# failure as a FailUrl (tagged with a source guessed from the URL) and
# returns nil. Returns the Mechanize page on success.
def page_by_url(url, proxy_hash=nil, header_hash=nil, repeat=5)
  logger.info "Grab the page #{url}"
  # Retry counter lives outside begin so `retry` does not reset it
  # (original used `i ||= 0` inside the rescue for the same effect).
  retries = 0
  begin
    change_another_proxy(proxy_hash, header_hash)
    logger.info "Changed to a new proxy: #{@mechanize.proxy_addr}:#{@mechanize.proxy_port} for #{url}"
    page = @mechanize.get(url)
    logger.info "Has been get the page #{url}"
    page
  rescue => e
    # Original line was truncated (`logger.error e.`); log the message.
    logger.error e.message
    e.backtrace.each do |msg|
      error_log.error msg
    end
    error_log.error "\n"
    if retries < repeat
      logger.info "Retry to get page for #{retries} times"
      retries += 1
      retry
    else
      # Tag the failed URL with its site of origin for later triage.
      source =
        case url
        when /douban/            then 'douban'
        when /weibo/             then 'weibo'
        when /rockbundartmuseum/ then 'waitan'
        when /citymoments/       then 'citymoment'
        else                          'else'
        end
      FailUrl.create(url: url, source: source, flag: "spider")
      logger.warn "Cann't grab url #{url}"
      return
    end
  end
end
#run ⇒ Object
# File 'lib/rails_spider/fetchers/base.rb', line 77
# Full crawl: grabs the listing page behind each entry in @links.
# (Each entry is a one-pair hash; its key is the URL. Setting @city from
#  the value was disabled in the original.)
def run
  logger.info "Start #{self.class} Spider..."
  @links.each do |entry|
    grab_list_link(entry.keys.first)
  end
  logger.info "End of #{self.class} Spider..."
end
#save_page(page) ⇒ Object
# File 'lib/rails_spider/fetchers/base.rb', line 50
# Persists the fetched page to disk under html/<today>/<path-derived-name>.
# Best-effort: on failure it logs and moves on rather than raising.
def save_page(page)
  page.save_as("html/#{Date.today.to_s}/#{page.uri.to_s.split('http://').last.chomp('/')}")
rescue => e
  # Original line was truncated (`logger.error e.`); log the message.
  logger.error e.message
  logger.warn "cann't save page #{page.uri}"
end