Class: RailsSpider::Mechanize

Inherits:
Fetcher
  • Object
show all
Defined in:
lib/rails_spider/fetchers/witar.rb,
lib/rails_spider/fetchers/mechanize.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Fetcher

#create_event, #event_class, #keep_on?, #page_by_url

Constructor Details

#initialize ⇒ Mechanize

Returns a new instance of Mechanize.



7
8
9
10
11
12
13
# File 'lib/rails_spider/fetchers/witar.rb', line 7

# Builds the fetcher: runs the parent initializer, then creates the Mechanize
# agent and a logger that writes to STDOUT.
#
# The agent gets an open timeout of 20 and its default pluggable parser is set
# to whatever parser is registered for 'text/html'.
def initialize
  super
  # The leading :: pins the lookup to the top-level Mechanize class (the same
  # form #links uses for ::Mechanize::UnsupportedSchemeError). A bare
  # `Mechanize` can resolve to RailsSpider::Mechanize when this class body is
  # nested inside `module RailsSpider`, which would recurse into this method.
  @mechanize = ::Mechanize.new
  @mechanize.open_timeout = 20
  @mechanize.pluggable_parser.default = @mechanize.pluggable_parser['text/html']
  @logger = Logger.new(STDOUT)
end

Instance Attribute Details

#logger ⇒ Object

Returns the value of attribute logger.



5
6
7
# File 'lib/rails_spider/fetchers/witar.rb', line 5

# Reader for the @logger instance variable.
#
# @return [Object, nil] the current value of @logger
def logger; @logger; end

#mechanize ⇒ Object

Returns the value of attribute mechanize.



5
6
7
# File 'lib/rails_spider/fetchers/witar.rb', line 5

# Reader for the @mechanize instance variable.
#
# @return [Object, nil] the current value of @mechanize
def mechanize; @mechanize; end

Instance Method Details

#body(url) ⇒ Object



20
21
22
# File 'lib/rails_spider/fetchers/mechanize.rb', line 20

# Fetches +url+ via #page and runs a 'body' search on the result.
#
# @param url [Object] forwarded unchanged to #page
# @return [Object] whatever the fetched page's #search('body') returns
def body(url)
  document = page(url)
  document.search('body')
end

#change_another_proxy(proxy_hash = nil, header_hash = nil) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/rails_spider/fetchers/witar.rb', line 24

# Points the agent in @mechanize at a proxy and optionally replaces its
# request headers.
#
# @param proxy_hash [Hash, nil] proxy to use; honoured only when both :ip and
#   :port are truthy, otherwise a random entry of @proxy is used instead
# @param header_hash [Object, nil] assigned to the agent's request_headers
#   unless nil
# @return [Object, nil] header_hash when it was assigned, otherwise nil
def change_another_proxy(proxy_hash = nil, header_hash = nil)
  chosen = proxy_hash if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
  # Array() turns a missing pool into [], and #sample returns nil for an empty
  # pool, so a nil/empty @proxy no longer raises NoMethodError.
  chosen ||= Array(@proxy).sample
  # With no proxy available the agent's current proxy settings are left alone.
  @mechanize.set_proxy(chosen[:ip], chosen[:port]) if chosen

  @mechanize.request_headers = header_hash unless header_hash.nil?
end

#grab_update ⇒ Object



53
54
55
56
57
58
59
60
61
62
# File 'lib/rails_spider/fetchers/witar.rb', line 53

# Walks every entry of @newlinks and hands its 'url' to #grab_list_link,
# logging before the first and after the last entry.
#
# @city is overwritten with an entry's 'city' whenever that value is not
# blank; otherwise the previous @city is kept for that entry.
#
# @return [Object] the result of the final logger.info call
def grab_update
  logger.info "Start #{self.class} Spider grab_update."

  @newlinks.each do |entry|
    city = entry['city']
    @city = city unless city.blank?
    grab_list_link(entry['url'])
  end

  logger.info "End of #{self.class} Spider grab_update."
end

#is_existed?(event_hash) ⇒ Boolean

Returns:

  • (Boolean)


64
65
66
67
68
69
70
# File 'lib/rails_spider/fetchers/witar.rb', line 64

# Tells whether an event with the same URL is already stored.
#
# Looks up event_class by event_hash[:url]; when a record is found a warning
# naming its id is logged. No lookup happens when the hash has no truthy :url.
#
# @param event_hash [Hash] event attributes; only :url is read
# @return [Boolean] true when a stored event with that URL was found
def is_existed?(event_hash)
  url = event_hash[:url]
  return false unless url

  existing = event_class.where(url: url).first
  return false unless existing

  logger.warn "#{url} has been exist in #{existing.id}"
  true
end

#is_grab?(url) ⇒ Boolean

Returns:

  • (Boolean)


38
39
40
# File 'lib/rails_spider/fetchers/witar.rb', line 38

# Asks event_class whether any record matches the given URL.
#
# @param url [Object] value matched against the :url field
# @return [Boolean] the result of exists? on the event_class.where(url: url) scope
def is_grab?(url)
  matching_events = event_class.where(url: url)
  matching_events.exists?
end


24
25
26
27
28
29
30
31
32
# File 'lib/rails_spider/fetchers/mechanize.rb', line 24

# Collects the resolved URI of every link on the page at +url+, as strings.
#
# A link whose resolution raises ::Mechanize::UnsupportedSchemeError is
# represented by an empty string, so the result has one entry per page link.
#
# @param url [Object] forwarded unchanged to #page
# @return [Array<String>] one string per link, in page order
def links(url)
  page(url).links.each_with_object([]) do |anchor, hrefs|
    href = begin
      anchor.resolved_uri.to_s
    rescue ::Mechanize::UnsupportedSchemeError
      ''
    end
    hrefs << href
  end
end

#page(url) ⇒ Object



16
17
18
# File 'lib/rails_spider/fetchers/mechanize.rb', line 16

# Fetches +url+ with the agent returned by #mechanize.
#
# @param url [Object] passed unchanged to the agent's #get
# @return [Object] whatever the agent's #get returns
def page(url)
  agent = mechanize
  agent.get(url)
end

#run ⇒ Object



42
43
44
45
46
47
48
49
50
51
# File 'lib/rails_spider/fetchers/witar.rb', line 42

# Walks every entry of @links and passes its first key to #grab_list_link,
# logging before the first and after the last entry.
#
# The entry's value is not used here; the @city assignment from it is
# disabled in the original code.
#
# @return [Object] the result of the final logger.info call
def run
  logger.info "Start #{self.class} Spider..."

  @links.each do |entry|
    list_url = entry.keys.first
    grab_list_link(list_url)
  end

  logger.info "End of #{self.class} Spider..."
end

#save_page(page) ⇒ Object



15
16
17
18
19
20
21
22
# File 'lib/rails_spider/fetchers/witar.rb', line 15

# Saves a fetched page under html/<today's date>/<page URL without scheme>.
#
# Best effort: any StandardError raised while building the path or saving is
# logged through #logger and swallowed.
#
# @param page [#uri, #save_as] the fetched page (presumably a Mechanize page —
#   confirm against callers)
# @return [Object] whatever page.save_as returns, or the result of the
#   logger.warn call when saving failed
def save_page(page)
  # Strip only a leading http:// or https:// plus one trailing slash. The
  # previous split('http://').last left "https://" inside the path and cut
  # URLs that merely contain another "http://" further along.
  relative_path = page.uri.to_s.sub(%r{\Ahttps?://}i, '').chomp('/')
  page.save_as("html/#{Date.today}/#{relative_path}")
rescue => e
  logger.error e.message
  logger.warn "cann't save page #{page.uri}"
end