Class: HttpCrawler::Client

Inherits:
Object show all
Defined in:
lib/http_crawler/client.rb

Direct Known Subclasses

Proxy::Client, Web::Client

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parameter = {}) ⇒ Client

init_uri 如果未初始化@uri,则会报错



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/http_crawler/client.rb', line 34

# Builds a client: resolves the target uri, then sets up timeouts, the SSL
# context, client-specific settings, redirect following and proxy params.
#
# @param parameter [Hash] options; +:url+ or +:uri+ selects the target
#   address (+:url+ wins when both are present)
# @raise [RuntimeError] when an address is given but +uri+ is already set
def initialize(parameter = {})
  options = parameter.symbolize_keys
  target = options[:url] || options[:uri]

  if target
    # A custom address was supplied; refuse to overwrite an existing uri.
    raise "Client uri为重复初始化" if uri
    update_uri(target)
  else
    # No address supplied; run the default uri initialisation.
    init_uri
  end

  # Default timeouts.
  init_timeout

  # SSL context, only once a uri is known.
  init_ssl unless uri.blank?

  # Hook for client-specific settings.
  init_client

  self.redirect = true
  # Proxy parameters: the key is the class name with ":" replaced by "_".
  @proxy_params = { key: self.class.to_s.tr(":", "_") }
end

Instance Attribute Details

#all_timeout ⇒ Object

Returns the value of attribute all_timeout.



95
96
97
# File 'lib/http_crawler/client.rb', line 95

# Reader for the @all_timeout instance variable.
#
# @return [Object] the current value of @all_timeout
def all_timeout
  @all_timeout
end

#connect_time ⇒ Object

Returns the value of attribute connect_time.



95
96
97
# File 'lib/http_crawler/client.rb', line 95

# Reader for the @connect_time instance variable.
#
# @return [Object] the current value of @connect_time
def connect_time
  @connect_time
end

#cookies(parameter = {}) ⇒ Object

cookies相关方法



141
142
143
# File 'lib/http_crawler/client.rb', line 141

# Reader for the @cookies instance variable.
#
# @return [Object] the current value of @cookies
def cookies
  @cookies
end

#error_urls ⇒ Object

Returns the value of attribute error_urls.



238
239
240
# File 'lib/http_crawler/client.rb', line 238

# Reader for the @error_urls instance variable.
#
# @return [Object] the current value of @error_urls
def error_urls
  @error_urls
end

#header(parameter = {}) ⇒ Object

头文件相关方法



117
118
119
# File 'lib/http_crawler/client.rb', line 117

# Reader for the @header instance variable.
#
# @return [Object] the current value of @header
def header
  @header
end

#max_error_num ⇒ Object

最大错误重试次数



64
65
66
# File 'lib/http_crawler/client.rb', line 64

# Reader for the @max_error_num instance variable (described in the
# accompanying docs as the maximum number of error retries).
#
# @return [Object] the current value of @max_error_num
def max_error_num
  @max_error_num
end

#read_time ⇒ Object

Returns the value of attribute read_time.



95
96
97
# File 'lib/http_crawler/client.rb', line 95

# Reader for the @read_time instance variable.
#
# @return [Object] the current value of @read_time
def read_time
  @read_time
end

#redirect ⇒ Object

Returns the value of attribute redirect.



113
114
115
# File 'lib/http_crawler/client.rb', line 113

# Reader for the @redirect instance variable.
#
# @return [Object] the current value of @redirect
def redirect
  @redirect
end

#response ⇒ Object

请求的响应



322
323
324
# File 'lib/http_crawler/client.rb', line 322

# Reader for the @response instance variable (the response of a request).
#
# @return [Object] the current value of @response
def response
  @response
end

#uri ⇒ Object (readonly)

Returns the value of attribute uri.



68
69
70
# File 'lib/http_crawler/client.rb', line 68

# Reader for the @uri instance variable.
#
# @return [Object] the current value of @uri
def uri
  @uri
end

#write_time ⇒ Object

Returns the value of attribute write_time.



95
96
97
# File 'lib/http_crawler/client.rb', line 95

# Reader for the @write_time instance variable.
#
# @return [Object] the current value of @write_time
def write_time
  @write_time
end

Class Method Details

.for(web_name, args = {}) ⇒ Object

接收格式web_name = “biquge_duquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



12
13
14
# File 'lib/http_crawler/client.rb', line 12

# Instantiates the client class registered under HttpCrawler::Web for a
# snake_case site name, e.g. "biquge_duquanben" resolves to
# HttpCrawler::Web::BiqugeDuquanben::Client.
#
# @param web_name [String] site name; it is camelized to form the module name
# @param args [Hash] arguments forwarded to the client's constructor
# @return [Object] a new instance of the resolved client class
def for(web_name, args = {})
  client_class = "HttpCrawler::Web::#{web_name.camelize}::Client".constantize
  client_class.new(args)
end

.for_module(module_name, args = {}) ⇒ Object

接收格式module_name = “HttpCrawler::Web::BiqugeDuquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



21
22
23
# File 'lib/http_crawler/client.rb', line 21

# Instantiates the Client class nested in the given module name, e.g.
# "HttpCrawler::Web::BiqugeDuquanben" resolves to
# HttpCrawler::Web::BiqugeDuquanben::Client.
#
# @param module_name [String, #to_s] fully qualified module name
# @param args [Hash] arguments forwarded to the client's constructor
# @return [Object] a new instance of the resolved client class
def for_module(module_name, args = {})
  client_class = "#{module_name}::Client".constantize
  client_class.new(args)
end

.for_uri(path) ⇒ Object



25
26
27
# File 'lib/http_crawler/client.rb', line 25

# Builds a client of the receiving class for an explicit address.
#
# @param path [Object] value passed to the constructor under the +:uri+ key
# @return [Object] the newly constructed client
def for_uri(path)
  new(uri: path)
end

Instance Method Details

#add_error_url(url_string) ⇒ Object

添加错误的url地址,表示这里面的url都是异常地址,存的是正则



245
246
247
# File 'lib/http_crawler/client.rb', line 245

# Appends an entry to the collection of known-bad URLs. The accompanying
# docs describe the stored entries as regular expressions.
#
# @param url_string [Object] the entry to append
# @return [Object] the result of appending to +error_urls+
def add_error_url(url_string)
  known_bad = error_urls
  known_bad << url_string
end

#auto_proxy=(value) ⇒ Object

代理设置



170
171
172
173
174
# File 'lib/http_crawler/client.rb', line 170

# Proxy setting: stores the auto-proxy flag and, when it is exactly +true+
# and no proxy is configured yet, fetches one immediately.
#
# @param value [Object] the new flag value
def auto_proxy=(value)
  Rails.logger.debug "自动更新代理"
  @auto_proxy = value
  return unless value == true

  update_proxy if @proxy.blank?
end

#get(path, params = {}, limit = 3) ⇒ Object

发送 get 请求



297
298
299
300
301
302
303
304
305
306
307
# File 'lib/http_crawler/client.rb', line 297

# Sends a GET request for +path+ resolved against the client's uri and
# follows HTML <meta http-equiv="Refresh"> redirects found in the response.
#
# @param path [String] path or URL combined with +uri+ via +uri + path+
# @param params [Hash] query parameters passed to the HTTP client as :params
# @param limit [Integer] remaining number of meta-refresh hops to follow
# @return [Object] whatever the +request+ wrapper returns
# @raise [RuntimeError] when the client has no uri
def get(path, params = {}, limit = 3)
  raise "Client uri为空" unless self.uri
  request do
    r = http.get((self.uri + path).to_s, :params => params, :ssl_context => @ctx)
    # NOTE(review): this `return` exits #get itself, not just the block, so
    # anything +request+ does after yielding is skipped here — confirm intended.
    return r if limit < 0
    r.html.at_xpath("//meta[@http-equiv='Refresh']").jagger_blank do |objc|
      # Capture the refresh target up to the closing quote / tag end. The
      # previous pattern `(.*)[^";>]` always dropped the target's last
      # character and kept a leading single quote.
      target = objc.to_html[/url=["']?([^"'>\s]+)/i, 1]
      # Only follow when a target was actually found; nil cannot be joined.
      r = self.get(target, params, limit - 1) if target
    end
    r
  end
end

#get_proxy ⇒ Object

获取proxy 通过调用 api 获取代理或者通过自定义设置代理



210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/http_crawler/client.rb', line 210

# Fetches a proxy from the proxy API, retrying every 5 seconds until the API
# returns a non-blank result that contains both :p_addr and :p_port.
#
# A single loop covers both retry conditions. The earlier version recursed
# when the address or port was missing, which grew the stack without bound
# and logged the "switching proxy" line once per recursion level.
#
# @return [Hash] the proxy description with symbolized keys
def get_proxy
  proxy_ip = nil
  loop do
    Rails.logger.debug("开始获取代理IP")
    proxy_client = HttpCrawler::Proxy.for(proxy_api)
    proxy_r = proxy_client.get_proxy(proxy_params.symbolize_keys)
    results = proxy_r.results

    if results.blank?
      Rails.logger.warn "无最新代理等待5秒后重新获取:proxy 为空"
    else
      proxy_ip = results.symbolize_keys
      # Usable only when both address and port are present.
      break if proxy_ip[:p_addr] && proxy_ip[:p_port]
      Rails.logger.warn "无最新代理等待5秒后重新获取:p_addr 或 p_port 为空"
    end

    sleep(5)
  end

  Rails.logger.info("当前IP => #{@proxy},切换至代理 => #{proxy_ip}")
  proxy_ip
end

#get_uri ⇒ Object

直接发送uri的get请求



310
311
312
313
# File 'lib/http_crawler/client.rb', line 310

# Sends a GET request straight to the client's own uri (no path appended).
#
# @return [Object] whatever the +request+ wrapper returns
# @raise [RuntimeError] when the client has no uri
def get_uri
  raise "Client uri为空" unless self.uri

  request do
    http.get(self.uri.to_s, ssl_context: @ctx)
  end
end

#http ⇒ Object

初始化http请求前置条件



291
292
293
# File 'lib/http_crawler/client.rb', line 291

# Returns the request builder by delegating to #init_http on every call.
#
# @return [Object] the value returned by #init_http
def http
  init_http
end

#init_client ⇒ Object

初始化init_client参数



251
252
253
# File 'lib/http_crawler/client.rb', line 251

# Hook called from the constructor for client-specific settings.
# This implementation does nothing.
#
# @return [nil]
def init_client
  nil
end

#init_cookies(parameter = {}) ⇒ Object



146
147
148
149
# File 'lib/http_crawler/client.rb', line 146

# Resets the cookie store to an empty Hash.
#
# @param parameter [Hash] options; key-symbolized but otherwise unused here
# @return [Hash] the new, empty cookie store
def init_cookies(parameter = {})
  parameter.symbolize_keys
  @cookies = {}
end

#init_header(parameter = {}) ⇒ Object



122
123
124
125
126
127
128
129
130
131
132
# File 'lib/http_crawler/client.rb', line 122

# Resets the request headers to the default browser-like set.
#
# @param parameter [Hash] options; key-symbolized but otherwise unused here
# @return [Hash] the new header hash, keyed by symbols
def init_header(parameter = {})
  parameter.symbolize_keys
  defaults = {}
  defaults[:"Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
  defaults[:"Accept-Encoding"] = "gzip, br"
  defaults[:"Accept-Language"] = "zh-CN,zh;q=0.9"
  defaults[:"Connection"] = "keep-alive"
  defaults[:"Upgrade-Insecure-Requests"] = "1"
  defaults[:"User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
  @header = defaults
end

#init_http ⇒ Object

创建时间: 2019/9/11 17:11；更新时间: 2019/9/11；作者: Jagger；方法名称: init_http；方法说明: 初始化http请求前置条件；调用方式: #init_http

Returns:

  • HTTP



264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# File 'lib/http_crawler/client.rb', line 264

# Assembles the HTTP request builder from the client's current settings:
# redirect following, proxy, headers, cookies and timeouts, in that order.
#
# @return [Object] the configured HTTP chain
def init_http
  client = HTTP

  # Follow redirects automatically, at most 5 hops.
  client = client.follow(max_hops: 5) if self.redirect == true

  # Route through the proxy when one is configured.
  unless @proxy.blank?
    client = client.via(@proxy[:p_addr], @proxy[:p_port].to_i, @proxy[:p_user], @proxy[:p_pass])
  end

  # Attach headers and cookies when present.
  client = client.headers(header) if header
  client = client.cookies(cookies) if cookies

  # A global timeout, when set, takes precedence over the per-phase timeouts.
  if @all_timeout
    client.timeout(@all_timeout)
  else
    client.timeout(connect: @connect_time, write: @write_time, read: @read_time)
  end
end

#init_ssl ⇒ Object

初始化 ssl 协议



105
106
107
108
109
110
111
# File 'lib/http_crawler/client.rb', line 105

# Prepares the SSL context when the current uri uses the https scheme.
# Certificate verification is switched off (VERIFY_NONE).
#
# @return [Integer, nil] the verify mode that was set, or nil for non-https
def init_ssl
  return unless @uri.scheme == "https"

  @ctx = OpenSSL::SSL::SSLContext.new
  @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
end

#init_timeout ⇒ Object

初始化超时时间



97
98
99
100
101
102
# File 'lib/http_crawler/client.rb', line 97

# Resets the timeouts to their defaults: 5 for each of connect, write and
# read, and no global timeout.
#
# @return [nil]
def init_timeout
  @connect_time = @write_time = @read_time = 5
  @all_timeout = nil
end

#init_uri ⇒ Object

init_uri 如果未初始化@uri,则会报错



72
73
74
# File 'lib/http_crawler/client.rb', line 72

# Default uri initialisation: clears @uri.
# NOTE(review): presumably subclasses override this to set a real uri —
# the request methods raise when uri is nil; confirm against subclasses.
#
# @return [nil]
def init_uri
  @uri = nil
end

#post(path, params = {}, format = :form) ⇒ Object

发送 post 请求



316
317
318
319
# File 'lib/http_crawler/client.rb', line 316

# Sends a POST request for +path+ resolved against the client's uri.
#
# @param path [String] path or URL combined with +uri+ via +uri + path+
# @param params [Hash] request payload
# @param format [Symbol] option key the payload is passed under (default :form)
# @return [Object] whatever the +request+ wrapper returns
# @raise [RuntimeError] when the client has no uri
def post(path, params = {}, format = :form)
  raise "Client uri为空" unless self.uri

  request do
    client = http
    target = (self.uri + path).to_s
    client.post(target, format => params, :ssl_context => @ctx)
  end
end

#proxy_api ⇒ Object

代理使用的api方法名



177
178
179
# File 'lib/http_crawler/client.rb', line 177

# Name of the proxy API to use; defaults to "my" when none has been set.
#
# @return [Object] the stored proxy API name
def proxy_api
  return @proxy_api if @proxy_api

  @proxy_api = "my"
end

#proxy_params ⇒ Object

调用代理 api使用的参数



182
183
184
# File 'lib/http_crawler/client.rb', line 182

# Parameters sent to the proxy API; defaults to { key: "default" } when
# none have been set.
#
# @return [Object] the stored proxy parameters
def proxy_params
  return @proxy_params if @proxy_params

  @proxy_params = { key: "default" }
end

#str_to_cookies(str) ⇒ Object

字符串转换成cookies “abc=123; cd=412” => { “abc”: “123”, “cd”: “412”}



163
164
165
166
167
# File 'lib/http_crawler/client.rb', line 163

# Parses a cookie header string into the cookie store, e.g.
# "abc=123; cd=412" adds { abc: "123", cd: "412" }.
#
# Names and values are trimmed, so irregular separators such as ";  " or a
# missing space no longer produce keys with stray whitespace. Pairs with an
# empty name are skipped.
#
# @param str [String] cookie string of "name=value" pairs separated by ";"
# @return [String] +str+ itself (the result of String#scan with a block)
def str_to_cookies(str)
  str.scan(/([^=]*)=([^;]*);? ?/) do |name, value|
    name = name.strip
    next if name.empty?

    self.cookies[name.to_sym] = value.strip
  end
end

#update_cookies(parameter = {}) ⇒ Object



151
152
153
154
155
156
157
158
159
# File 'lib/http_crawler/client.rb', line 151

# Copies the cookies of the last response into the cookie store.
#
# The store created by #init_cookies is a plain Hash, which has no #add;
# the earlier unconditional +@cookies.add(cookie)+ therefore raised
# NoMethodError. Jar-like stores that respond to #add still receive the
# cookie object; Hash stores get a symbol-keyed name => value entry, the
# same shape #str_to_cookies writes.
#
# @param parameter [Hash] currently unused
# @return [nil]
def update_cookies(parameter = {})
  return nil if @response.blank?

  @cookies ||= {}
  @response.cookies.each do |cookie|
    if @cookies.respond_to?(:add)
      @cookies.add(cookie)
    else
      @cookies[cookie.name.to_sym] = cookie.value
    end
  end

  nil
end

#update_header(parameter = {}) ⇒ Object



134
135
136
137
# File 'lib/http_crawler/client.rb', line 134

# Resets the headers by re-running #init_header.
#
# @param parameter [Hash] options; key-symbolized but otherwise unused here
# @return [Object] the value returned by #init_header
def update_header(parameter = {})
  parameter.symbolize_keys
  @header = init_header
end

#update_proxy(proxy = {}) ⇒ Object



186
187
188
189
190
191
192
193
194
# File 'lib/http_crawler/client.rb', line 186

# Sets the active proxy: uses the given description when it is non-blank,
# otherwise fetches one through #get_proxy.
#
# @param proxy [Hash] proxy description; keys are symbolized
# @return [Object] the proxy now stored in @proxy
def update_proxy(proxy = {})
  supplied = proxy.symbolize_keys
  @proxy = supplied.blank? ? get_proxy : supplied
end

#update_proxy? ⇒ Boolean

如果自动更新代理 则更新代理返回 true,否则返回false

Returns:

  • (Boolean)


198
199
200
201
202
203
204
205
# File 'lib/http_crawler/client.rb', line 198

# Refreshes the proxy when auto-proxy is enabled.
#
# @return [Boolean] true when the proxy was refreshed, false otherwise
def update_proxy?
  return false unless @auto_proxy

  self.update_proxy
  true
end

#update_uri(uri_or_path) ⇒ Object

更新uri



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/http_crawler/client.rb', line 77

# Replaces the client's uri, then refreshes the SSL context.
#
# A URI object is stored as-is. A String beginning with "http://" or
# "https://" is parsed as an absolute URL; any other String is resolved
# relative to the current uri.
#
# @param uri_or_path [URI, String] absolute address or relative path
# @return [URI] the updated uri
# @raise [RuntimeError] when a relative path is given but no base uri is set
# @raise [ArgumentError] when the argument is neither a URI nor a String
def update_uri(uri_or_path)
  case uri_or_path
  when URI
    @uri = uri_or_path
  when String
    # \A anchors to the start of the string; ^ would match any line start.
    if uri_or_path.match?(%r{\Ahttps?://}i)
      @uri = URI(uri_or_path)
    else
      # A relative path needs a base; fail clearly rather than with a
      # NoMethodError from nil + String.
      raise "Client uri为空" unless @uri
      @uri = @uri + uri_or_path
    end
  else
    raise ArgumentError, "unsupported uri_or_path: #{uri_or_path.inspect}"
  end
  # Refresh the SSL context for the new uri.
  self.init_ssl
  self.uri
end

#validation_to_proxy?(r = response) ⇒ Boolean

如果出现验证码,切换代理

Returns:

  • (Boolean)


326
327
328
329
330
331
332
333
334
335
336
# File 'lib/http_crawler/client.rb', line 326

# Switches proxy when the response is a verification (captcha) page.
#
# @param r [Object] response to inspect; defaults to the last response
# @return [Boolean] true when a verification page was detected (a proxy
#   switch was attempted via #update_proxy?), false otherwise
def validation_to_proxy?(r = response)
  # Nothing to do unless a verification page showed up.
  return false unless r.validation_page?

  # Verification page hit: try to switch proxy, then report it as handled.
  self.update_proxy?
  true
end