Module: Global

Defined in:
lib/searcher/global.rb

Constant Summary

# Default User-Agent string; used as the default +user_agent+ argument of the
# HTTP helper methods in this module.
UserAgent =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
# NOTE(review): presumably a beanstalkd address list plus the tube (queue) name
# for crawler jobs — not referenced by the code visible here; confirm against callers.
Beanstalk_jobs =
[['localhost:11300'],'crawler-jobs']

Class Method Summary

Class Method Details

.get_final_url_from_response(url, user_agent = UserAgent, timeout = 20) ⇒ Object



57
58
59
60
# File 'lib/searcher/global.rb', line 57

# Follows HTTP redirects starting at +url+ and returns the URL that finally
# answers without a Location header.
#
# Each hop is fetched with get_whole_response. A relative Location value is
# resolved against the URL that produced it.
#
# @param url [String] URL to start from
# @param user_agent [String] forwarded to get_whole_response
# @param timeout [Integer] forwarded to get_whole_response
# @return [String] +url+ itself when the first response carries no Location
#   header, otherwise the last URL reached by following Location headers
# @raise [ArgumentError] when the chain is still redirecting after 10 requests
def self.get_final_url_from_response(url, user_agent = UserAgent, timeout = 20)
  max_requests = 10 # upper bound so a redirect loop cannot run forever
  current = url

  max_requests.times do
    res = get_whole_response(current, user_agent, timeout)
    location = res['location']
    return current unless location

    # Move on to the redirect target (the original retried the same URL forever).
    current = URI.join(current, location).to_s
  end

  raise ArgumentError, 'too many HTTP redirects'
end

.get_whole_html(res, user_agent = UserAgent, timeout = 20) ⇒ Object



47
48
49
50
51
52
53
# File 'lib/searcher/global.rb', line 47

# Returns the body of +res+ as a UTF-8 string.
#
# The source charset is read from the first <meta ... charset=...> tag found in
# the body; when no such tag exists GB18030 is assumed. Bodies declared as
# GB2312 or GBK are decoded as GB18030. Bytes that cannot be converted are
# replaced rather than aborting the whole conversion.
#
# @param res [#body] response-like object; only +body+ is read and it is not modified
# @param user_agent [String] unused; kept for signature compatibility
# @param timeout [Integer] unused; kept for signature compatibility
# @return [String] a new UTF-8 tagged string (empty when the body is nil)
# @raise [ArgumentError] when the declared charset name is unknown to Ruby
# @raise [Encoding::ConverterNotFoundError] when Ruby cannot convert from the
#   declared charset to UTF-8
def self.get_whole_html(res, user_agent = UserAgent, timeout = 20)
  # Work on a binary copy: scanning never trips over invalid bytes and
  # force_encoding below does not re-tag the caller's res.body.
  raw = res.body.to_s.b

  declared = raw[/<meta.+?charset=["'\s]*([\w-]+)/i, 1]
  charset = declared ? declared.upcase : 'GB18030'

  # Same bytes, but tagged as UTF-8 so the result mixes with other UTF-8 text.
  return raw.force_encoding('UTF-8') if charset == 'UTF-8'

  source = %w[GB2312 GBK].include?(charset) ? 'GB18030' : charset
  raw.force_encoding(source).encode('UTF-8', invalid: :replace, undef: :replace)
end

.get_whole_response(url, user_agent = UserAgent, timeout = 20) ⇒ Object



36
37
38
39
40
41
42
43
44
# File 'lib/searcher/global.rb', line 36

# Performs a single GET request for +url+ and returns the response object.
# Redirects are not followed.
#
# @param url [String] absolute http:// or https:// URL
# @param user_agent [String] value sent as the User-Agent header
# @param timeout [Integer] read timeout in seconds
# @return [Net::HTTPResponse] the response to the GET request
# @raise [ArgumentError] when +url+ is not an HTTP(S) URL
# @raise [URI::InvalidURIError] when +url+ cannot be parsed
def self.get_whole_response(url, user_agent = UserAgent, timeout = 20)
  uri = URI.parse(url)
  raise ArgumentError, "not an HTTP(S) URL: #{url}" unless uri.is_a?(URI::HTTP)

  # request_uri yields "/" for an empty path and adds "?query" only when a
  # query exists (the hand-built path produced "?" and "/x?").
  req = Net::HTTP::Get.new(uri.request_uri)
  # Assign instead of add_field: the request already carries a default
  # User-Agent and add_field would append to it rather than replace it.
  req['User-Agent'] = user_agent

  Net::HTTP.start(uri.host, uri.port,
                  use_ssl: uri.scheme == 'https', read_timeout: timeout) do |http|
    http.request(req)
  end
end

.html_get_web_url(url, user_agent = UserAgent, timeout = 20, redirect_limit = 3) ⇒ Object

Raises:

  • (ArgumentError)


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/searcher/global.rb', line 16

# Issues one GET request for +url+ and reports where it leads.
#
# The URL is percent-unescaped before it is parsed and requested. Redirects are
# not followed: for a redirect response the Location header value is returned.
#
# NOTE(review): +user_agent+ and +timeout+ are accepted but not applied to the
# request, and +redirect_limit+ is only checked against zero.
#
# @param url [String] URL to probe
# @param user_agent [String] unused; kept for signature compatibility
# @param timeout [Integer] unused; kept for signature compatibility
# @param redirect_limit [Integer] a value of 0 makes the call raise
# @return [String, nil] +url+ for a success response, the Location header for
#   a redirect response, nil for any other response; if the request itself
#   raises a StandardError, that error's message is returned instead
# @raise [ArgumentError] when +redirect_limit+ is 0
def self.html_get_web_url(url, user_agent = UserAgent, timeout = 20, redirect_limit = 3)
  raise ArgumentError, 'too many HTTP redirects' if redirect_limit == 0

  begin
    # URI.decode no longer exists in Ruby 3.0+; the RFC 2396 parser performs
    # the same percent-unescaping.
    decoded = URI::RFC2396_Parser.new.unescape(url)
    response = Net::HTTP.get_response(URI.parse(decoded))

    case response
    when Net::HTTPSuccess then url
    when Net::HTTPRedirection then response['location']
    else nil
    end
  rescue => e
    # Best-effort contract kept from the original: failures are reported as text.
    e.message
  end
end


64
65
66
67
# File 'lib/searcher/global.rb', line 64

# Hands +url+ to save_to_file, targeting the file "<info_type>.txt".
#
# @param url [Object] content passed on to save_to_file
# @param info_type [Object] base name of the target file; ".txt" is appended
# @param path [String] directory argument passed on to save_to_file
# @return [Object] whatever save_to_file returns
def self.save_link_info(url, info_type = 'baidu', path = '/link_infos')
  target_name = "#{info_type}.txt"
  save_to_file(url, target_name, path)
  # into DB ... some code ...
end

.save_to_file(content, file_name, path = '/link_infos') ⇒ Object



70
71
72
73
74
75
76
77
# File 'lib/searcher/global.rb', line 70

# Appends +content+ as a line to +file_name+ inside the directory ".<path>/"
# (relative to the current working directory), creating the directory and any
# missing parents first.
#
# @param content [Object] value written with IO#puts
# @param file_name [String] name of the file inside the directory
# @param path [String] directory, appended to "." to form a relative path
# @return [nil]
def self.save_to_file(content, file_name, path = '/link_infos')
  require 'fileutils' # required here so the method works on its own; repeated requires are no-ops

  dir = ".#{path}/"
  # mkdir_p copes with nested paths and with the directory appearing between a
  # check and the create, which the exist?/mkdir pair did not.
  FileUtils.mkdir_p(dir)

  # Block form closes the file even when the write raises.
  File.open(dir + file_name, 'a') { |file| file.puts(content) }
  nil
end