Class: RailsSpider::Fetcher

Inherits:
Object
Defined in:
lib/rails_spider/fetchers/base.rb

Direct Known Subclasses

Mechanize

Instance Method Summary

Constructor Details

#initialize ⇒ Fetcher

Returns a new instance of Fetcher.



# File 'lib/rails_spider/fetchers/base.rb', line 4

def initialize
  @page = ''
end

Instance Method Details

#change_another_proxy(proxy_hash = nil, header_hash = nil) ⇒ Object



# File 'lib/rails_spider/fetchers/base.rb', line 59

def change_another_proxy(proxy_hash=nil, header_hash=nil)
  if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
    ip = proxy_hash[:ip]
    port = proxy_hash[:port]
  else
    index = rand(@proxy.size)
    ip = @proxy[index][:ip]
    port = @proxy[index][:port]
  end
  @mechanize.set_proxy ip, port

  @mechanize.request_headers = header_hash unless header_hash.nil?
end
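
A minimal usage sketch. The constant path RailsSpider::Mechanize and the zero-argument constructor are assumptions; the :ip and :port keys follow the method body above, and the proxy and header values are made up:

fetcher = RailsSpider::Mechanize.new   # hypothetical: the Mechanize subclass listed above
# Pass an explicit proxy plus optional request headers:
fetcher.change_another_proxy(
  { ip: '127.0.0.1', port: 8080 },
  { 'User-Agent' => 'Mozilla/5.0 (compatible)' }
)
# With no arguments a random entry from the internal @proxy list is picked:
fetcher.change_another_proxy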

#create_event(event_hash) ⇒ Object



# File 'lib/rails_spider/fetchers/base.rb', line 99

def create_event(event_hash)
  if event_hash.blank?
    logger.warn "Cann't create event by blank data"
    return
  end
  if is_existed?(event_hash)
    logger.warn "Paramter:#{event_hash} has been existed cann't to create"
    return
  end
  event = Event.new(event_hash)
  if event_hash[:place].blank?
    event.status = -1
  end
  event.kind_id = Kind.find_or_create_by(name: event_hash[:kind]).id unless event_hash[:kind].blank?
  event.subkind_id = set_subkind_id(event_hash[:subkind]) unless event_hash[:subkind].blank?
  if event_hash[:tags]
    event_hash[:tags].each do |t|
      EventTag.create(event_id: event.id, tag_id: Tag.find_or_create_by(name: t).id)
    end
  end
  # assign the next sequential integer id
  event.int_id = Event.max(:int_id).blank? ? 1 : Event.max(:int_id) + 1
  event.save
  if event.errors.blank?
    logger.info 'Saved event successfully'
  else
    logger.info event.errors.full_messages.join(' / ')
  end
end
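
For illustration, an event_hash built by a concrete spider might look like the following. The :place, :kind, :subkind, :tags and :url keys mirror the ones read above; :title is an assumed attribute of the Event model, all values are made up, and fetcher is the instance from the earlier sketch:

event_hash = {
  title:   'Gallery Opening',
  url:     'http://example.com/events/1',
  place:   'Shanghai',
  kind:    'Art',
  subkind: 'Exhibition',
  tags:    ['gallery', 'opening']
}
fetcher.create_event(event_hash)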

#event_class ⇒ Object



# File 'lib/rails_spider/fetchers/base.rb', line 8

def event_class
  @event_class = EventSpider.config.event_class.constantize
end
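
event_class resolves the configured model class at runtime. Whatever backs EventSpider.config, event_class is expected to return a class name string so that .constantize yields the model (an assumption based on the call above; the value shown is illustrative):

EventSpider.config.event_class              # => "Event"  (illustrative value)
EventSpider.config.event_class.constantize  # => Event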

#grab_update ⇒ Object



# File 'lib/rails_spider/fetchers/base.rb', line 88

def grab_update
  logger.info "Start #{self.class} Spider grab_update."

  @newlinks.each do |link|
    @city = link['city'] unless link['city'].blank?
    grab_list_link(link['url'])
  end

  logger.info "End of #{self.class} Spider grab_update."
end
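
grab_update walks @newlinks; judging from the link['url'] and link['city'] lookups, it is expected to be an array of string-keyed hashes. A hypothetical example with made-up urls:

@newlinks = [
  { 'url' => 'http://example.com/shanghai/events?page=1', 'city' => 'Shanghai' },
  { 'url' => 'http://example.com/shanghai/events?page=2' }  # @city keeps its previous value
]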

#is_existed?(event_hash) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/rails_spider/fetchers/base.rb', line 128

def is_existed?(event_hash)
  #if event_hash[:event_id] && event_class.where(event_id: event_hash[:event_id]).first
  #  return true
  #end
  # TODO title and city are the same
  #if event_hash[:title] && event_class.where(title: event_hash[:title]).first
  #  return true
  #end
  if event_hash[:url] && (event = event_class.where(url: event_hash[:url]).first)
    logger.warn "#{event_hash[:url]} already exists as event #{event.id}"
    return true
  end
  return false
end

#is_grab?(url) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/rails_spider/fetchers/base.rb', line 73

def is_grab?(url)
  event_class.where(url: url).exists? # true if the url has already been grabbed
end

#keep_on?Boolean

Whether to keep on grabbing.

Returns:

  • (Boolean)


# File 'lib/rails_spider/fetchers/base.rb', line 143

def keep_on?; return true end
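
Subclasses can override keep_on? to stop a crawl early. A hypothetical override; the @pages_seen counter and the threshold are made up:

def keep_on?
  @pages_seen = @pages_seen.to_i + 1
  @pages_seen <= 100   # illustrative threshold: stop after roughly 100 checks
end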

#page_by_url(url, proxy_hash = nil, header_hash = nil, repeat = 5) ⇒ Object



# File 'lib/rails_spider/fetchers/base.rb', line 12

def page_by_url(url, proxy_hash=nil, header_hash=nil, repeat=5)
  logger.info "Grab the page #{url}"
  begin
    change_another_proxy(proxy_hash, header_hash)
    logger.info "Changed to a new proxy: #{@mechanize.proxy_addr}:#{@mechanize.proxy_port} for #{url}"
    page = @mechanize.get(url)
    logger.info "Has been get the page #{url}"
    page
  rescue => e
    logger.error e.message
    e.backtrace.each do |msg|
      error_log.error msg
    end
    error_log.error "\n"
    i ||= 0 # retry counter; keeps its value across `retry`
    if i < repeat
      logger.info "Retrying to get page (attempt #{i + 1} of #{repeat})"
      i += 1
      retry
    else
      if url.include?('douban')
        source = 'douban'
      elsif url.include?('weibo')
        source = 'weibo'
      elsif url.include?('rockbundartmuseum')
        source = 'waitan'
      elsif url.include?('citymoments')
        source = 'citymoment'
      else
        source = 'else'
      end
      FailUrl.create(url: url, source: source, flag: "spider")
      logger.warn "Cann't grab url #{url}"
      return
    end
  end
end
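
A hedged usage sketch; the url, proxy, headers and retry count are illustrative, and page is nil when every retry failed (a FailUrl record is written instead):

page = fetcher.page_by_url(
  'http://example.com/events',
  { ip: '127.0.0.1', port: 8080 },
  { 'User-Agent' => 'Mozilla/5.0 (compatible)' },
  3                                  # retry up to 3 times
)
fetcher.save_page(page) if page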

#run ⇒ Object



# File 'lib/rails_spider/fetchers/base.rb', line 77

def run
  logger.info "Start #{self.class} Spider..."

  @links.each do |link|
    #@city = link.values.first
    grab_list_link(link.keys.first)
  end

  logger.info "End of #{self.class} Spider..."
end
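
run iterates @links; given the link.keys.first call (and the commented-out link.values.first), it appears to be an array of single-entry hashes mapping a listing url to its city. An illustrative shape with made-up urls:

@links = [
  { 'http://example.com/shanghai/events' => 'Shanghai' },
  { 'http://example.com/beijing/events'  => 'Beijing' }
]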

#save_page(page) ⇒ Object



# File 'lib/rails_spider/fetchers/base.rb', line 50

def save_page(page)
  begin
    page.save_as("html/#{Date.today.to_s}/#{page.uri.to_s.split('http://').last.chomp('/')}")
  rescue => e
    logger.error e.message
    logger.warn "cann't save page #{page.uri}"
  end
end