Class: RailsSpider::Mechanize
- Inherits: Fetcher (Object > Fetcher > RailsSpider::Mechanize)
- Defined in: lib/rails_spider/fetchers/witar.rb, lib/rails_spider/fetchers/mechanize.rb
Instance Attribute Summary
Instance Method Summary
Methods inherited from Fetcher
#create_event, #event_class, #keep_on?, #page_by_url
Constructor Details
#initialize ⇒ Mechanize
Returns a new instance of Mechanize.
# File 'lib/rails_spider/fetchers/witar.rb', line 7
def initialize
  super
  @mechanize = Mechanize.new
  @mechanize.open_timeout = 20
  # Parse every response with the HTML parser, regardless of Content-Type.
  @mechanize.pluggable_parser.default = @mechanize.pluggable_parser['text/html']
  @logger = Logger.new STDOUT
end
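A minimal usage sketch; the require path and the log message are illustrative assumptions, not part of the documented API:
require 'rails_spider'   # assumed entry point for the gem

fetcher = RailsSpider::Mechanize.new
fetcher.mechanize.open_timeout   # => 20, set in the constructor above
fetcher.logger.info 'fetcher ready'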
Instance Attribute Details
#logger ⇒ Object
Returns the value of attribute logger.
# File 'lib/rails_spider/fetchers/witar.rb', line 5
def logger
  @logger
end
#mechanize ⇒ Object
Returns the value of attribute mechanize.
# File 'lib/rails_spider/fetchers/witar.rb', line 5
def mechanize
  @mechanize
end
Instance Method Details
#body(url) ⇒ Object
# File 'lib/rails_spider/fetchers/mechanize.rb', line 20
def body(url)
  page(url).search('body')
end
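A hedged sketch of pulling the text of a page's <body>; the URL is a placeholder. Mechanize::Page#search delegates to Nokogiri, so the result is a Nokogiri::XML::NodeSet:
fetcher = RailsSpider::Mechanize.new
nodes = fetcher.body('http://example.com/')   # Nokogiri::XML::NodeSet for <body>
puts nodes.text.strip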
#change_another_proxy(proxy_hash = nil, header_hash = nil) ⇒ Object
# File 'lib/rails_spider/fetchers/witar.rb', line 24
def change_another_proxy(proxy_hash = nil, header_hash = nil)
  if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
    ip = proxy_hash[:ip]
    port = proxy_hash[:port]
  else
    # No usable proxy given: pick a random entry from the internal @proxy pool.
    index = rand(@proxy.size)
    ip = @proxy[index][:ip]
    port = @proxy[index][:port]
  end
  @mechanize.set_proxy ip, port
  # Setter reconstructed: the extracted source dropped the identifiers on this line.
  @mechanize.request_headers = header_hash unless header_hash.nil?
end
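A usage sketch under the assumptions above: an explicit proxy hash needs both :ip and :port, otherwise the method samples the internal @proxy pool (which must have been populated elsewhere); the optional header hash is applied via the reconstructed setter shown above. The addresses and header values are placeholders:
fetcher = RailsSpider::Mechanize.new
fetcher.change_another_proxy(
  { ip: '127.0.0.1', port: 8080 },            # both keys required to bypass @proxy
  { 'User-Agent' => 'Mozilla/5.0 (example)' } # optional request headers
)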
#grab_update ⇒ Object
# File 'lib/rails_spider/fetchers/witar.rb', line 53
def grab_update
  logger.info "Start #{self.class} Spider grab_update."
  @newlinks.each do |link|
    @city = link['city'] unless link['city'].blank?
    grab_list_link(link['url'])
  end
  logger.info "End of #{self.class} Spider grab_update."
end
#is_existed?(event_hash) ⇒ Boolean
# File 'lib/rails_spider/fetchers/witar.rb', line 64
def is_existed?(event_hash)
  if event_hash[:url] && (event = event_class.where(url: event_hash[:url]).first)
    logger.warn "#{event_hash[:url]} already exists as event #{event.id}"
    return true
  end
  false
end
#is_grab?(url) ⇒ Boolean
# File 'lib/rails_spider/fetchers/witar.rb', line 38
def is_grab?(url)
  event_class.where(url: url).exists?
end
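A sketch assuming event_class (inherited from Fetcher) is an ActiveRecord-style model with a url column; the URL is a placeholder:
fetcher = RailsSpider::Mechanize.new
url = 'http://example.com/events/1'
fetcher.page(url) unless fetcher.is_grab?(url)   # skip URLs already grabbed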
#links(url) ⇒ Object
# File 'lib/rails_spider/fetchers/mechanize.rb', line 24
def links(url)
  page(url).links.map do |link|
    begin
      link.resolved_uri.to_s
    rescue ::Mechanize::UnsupportedSchemeError
      # Links with unsupported schemes (e.g. mailto:) resolve to an empty string.
      ''
    end
  end
end
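Unresolvable links come back as empty strings (see the rescue above), so callers typically filter them out. A sketch with a placeholder URL:
fetcher = RailsSpider::Mechanize.new
fetcher.links('http://example.com/').reject(&:empty?).uniq.each do |href|
  puts href
end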
#page(url) ⇒ Object
# File 'lib/rails_spider/fetchers/mechanize.rb', line 16
def page(url)
  mechanize.get(url)
end
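Because the constructor forces the HTML pluggable parser, every fetch yields a Mechanize::Page. A sketch with a placeholder URL:
page = RailsSpider::Mechanize.new.page('http://example.com/')
page.class    # => Mechanize::Page
page.title    # parsed with Nokogiri under the hood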
#run ⇒ Object
# File 'lib/rails_spider/fetchers/witar.rb', line 42
def run
  logger.info "Start #{self.class} Spider..."
  @links.each do |link|
    grab_list_link(link.keys.first)
  end
  logger.info "End of #{self.class} Spider..."
end
#save_page(page) ⇒ Object
# File 'lib/rails_spider/fetchers/witar.rb', line 15
def save_page(page)
  page.save_as("html/#{Date.today}/#{page.uri.to_s.split('http://').last.chomp('/')}")
rescue => e
  logger.error e.message
  logger.warn "can't save page #{page.uri}"
end
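A sketch with a placeholder URL: the file lands under html/<today's date>/<host and path>, and if the write fails (for instance because that directory tree does not exist yet) the rescue above only logs the error.
fetcher = RailsSpider::Mechanize.new
page = fetcher.page('http://example.com/news')
fetcher.save_page(page)   # writes to html/<Date.today>/example.com/news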