Class: Baiduserp::Parser
- Inherits:
- Object
- Inheritance chain: Object → Baiduserp::Parser
- Defined in:
- lib/parsers/ranks.rb,
lib/parsers/ads_top.rb,
lib/baiduserp/parser.rb,
lib/parsers/ads_right.rb,
lib/parsers/result_num.rb,
lib/parsers/right_hotel.rb,
lib/parsers/pinpaizhuanqu.rb,
lib/parsers/right_weather.rb,
lib/parsers/related_keywords.rb,
lib/parsers/right_personinfo.rb,
lib/parsers/right_relaperson.rb
Instance Method Summary
- #_parse_ads_right(file) ⇒ Object
- #_parse_ads_top(file) ⇒ Object
- #_parse_pinpaizhuanqu(file) ⇒ Object
- #_parse_ranks(file) ⇒ Object
- #_parse_related_keywords(file) ⇒ Object
- #_parse_result_num(file) ⇒ Object
- #_parse_right_hotel(file) ⇒ Object
- #_parse_right_personinfo(file) ⇒ Object
- #_parse_right_relaperson(file) ⇒ Object
- #_parse_right_weather(file) ⇒ Object
- #parse(html) ⇒ Object
- #parse_file(file_path) ⇒ Object
- #search(keyword) ⇒ Object
Instance Method Details
#_parse_ads_right(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/parsers/ads_right.rb', line 2
#
# Collects one entry per `div.EC_im` element found in the parsed page.
#
# @param file [Hash] page data; `file[:doc]` must respond to `search(selector)`
#   (presumably the Nokogiri document built by #parse -- confirm with caller)
# @return [Array<Hash>] hashes with :rank, :title, :content and :site keys;
#   :rank is nil when the div carries no id attribute
def _parse_ads_right(file)
  file[:doc].search('div.EC_im').map do |div|
    id = div['id']
    {
      # Strip the 'bdfs' prefix and add 1 (ids presumably start at 0).
      # A div without an id used to raise NoMethodError on nil; it now
      # yields a nil rank instead of aborting the whole parse.
      :rank => id && id.sub('bdfs', '').to_i + 1,
      :title => Baiduserp::Helper.get_content_safe(div.search('a.EC_t')),
      :content => Baiduserp::Helper.get_content_safe(div.search('a.EC_desc/font')),
      :site => Baiduserp::Helper.get_content_safe(div.search('font.EC_url'))
    }
  end
end
#_parse_ads_top(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/parsers/ads_top.rb', line 2
#
# Collects ad entries from `div.ec_pp_f` elements whose numeric id is at
# least 3000. When none qualify, falls back to `table.ec_pp_f` elements and
# ranks them by their position (starting at 1).
#
# @param file [Hash] page data; `file[:doc]` must respond to `search(selector)`
# @return [Array<Hash>] hashes with :rank, :title, :content and :site keys
def _parse_ads_top(file)
  ads = []

  file[:doc].search('div.ec_pp_f').each do |node|
    number = node['id'].to_i
    if number >= 3000
      ads << {
        rank: number,
        title: Baiduserp::Helper.get_content_safe(node.search('div.ec_title')),
        content: Baiduserp::Helper.get_content_safe(node.search('div.ec_desc')),
        site: Baiduserp::Helper.get_content_safe(node.search('span.ec_url'))
      }
    end
  end
  return ads unless ads.empty?

  # Fallback markup: the same class name on <table> elements.
  file[:doc].search('table.ec_pp_f').each_with_index do |node, index|
    ads << {
      rank: index + 1,
      title: Baiduserp::Helper.get_content_safe(node.search('td.EC_header/a')),
      content: Baiduserp::Helper.get_content_safe(node.search('a.EC_desc')),
      site: Baiduserp::Helper.get_content_safe(node.search('a.EC_url'))
    }
  end
  ads
end
#_parse_pinpaizhuanqu(file) ⇒ Object
2 3 4 |
# File 'lib/parsers/pinpaizhuanqu.rb', line 2
#
# Reports whether the raw page source contains the
# 'bs.baidu.com/adcoup-mat' marker string.
#
# @param file [Hash] page data; `file[:html]` is the raw HTML string
# @return [Boolean] true when the marker occurs anywhere in the HTML
def _parse_pinpaizhuanqu(file)
  marker = 'bs.baidu.com/adcoup-mat'
  page_source = file[:html]
  page_source.include?(marker)
end
#_parse_ranks(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/parsers/ranks.rb', line 2
#
# Builds one entry per <table> element whose id attribute converts to a
# positive integer; that integer becomes the entry's rank.
#
# @param file [Hash] page data; `file[:doc]` must respond to `search(selector)`
# @return [Array<Hash>] hashes with :rank, :url, :title, :content, :mu and
#   :baiduopen keys. :url is nil when the table has no `h3/a` anchor or the
#   anchor has no href.
# @note Performs one request through Baiduserp::Client per result whose href
#   contains 'http://www.baidu.com/link?', and stores the 'location' header
#   of that response as the url.
def _parse_ranks(file)
  results = []
  file[:doc].search('//table').each do |table|
    id = table['id'].to_i
    next unless id > 0

    anchor = table.search('h3/a').first
    url = anchor && anchor['href']
    # Guard on url: an anchor without href used to raise NoMethodError here.
    if url && url.include?('http://www.baidu.com/link?')
      url = Baiduserp::Client.get(url).headers['location']
    end

    results << {
      :rank => id,
      :url => url,
      :title => Baiduserp::Helper.get_content_safe(table.search('h3')),
      :content => Baiduserp::Helper.get_content_safe(table.search('div.c-abstract')),
      :mu => table['mu'],
      # True when any link inside the table points at open.baidu.com.
      :baiduopen => table.search('a').any? { |link| link['href'].to_s.include?('open.baidu.com') }
    }
  end
  results
end
#_parse_related_keywords(file) ⇒ Object
2 3 4 5 6 7 8 9 10 |
# File 'lib/parsers/related_keywords.rb', line 2
#
# Collects the text of every link inside `div` elements whose id is "rs".
#
# @param file [Hash] page data; `file[:doc]` must respond to `search(selector)`
#   and the returned nodes to `css(selector)`
# @return [Array] the `content` of each matched link, in document order;
#   empty when no such div exists
def _parse_related_keywords(file)
  file[:doc].search('div[@id="rs"]').flat_map do |rs|
    rs.css('a').map { |link| link.content }
  end
end
#_parse_result_num(file) ⇒ Object
4 5 6 7 8 9 10 11 12 13 14 15 16 |
# File 'lib/parsers/result_num.rb', line 4
#
# Extracts the result count that appears between '找到相关结果' and '个'
# in the raw HTML and converts it to an Integer.
#
# Handles an optional '约' ("about") prefix, thousands separators and the
# '万' (ten-thousand) unit, e.g. '约12万3000' => 123000.
#
# @param file [Hash] page data; `file[:html]` is the raw HTML string
# @return [Integer] the parsed count, or 0 when the phrase is not present
def _parse_result_num(file)
  # Non-greedy capture of the FIRST occurrence only. The previous greedy
  # `(.*)` ran up to the last '个' on the line, so unrelated trailing text
  # containing '万' flipped the parser into the ten-thousand branch, and
  # repeated matches were concatenated into one number.
  raw = file[:html][/找到相关结果(.*?)个/, 1].to_s
  digits = raw.delete('约,')
  return digits.to_i unless digits.include?('万')

  # '<a>万<b>' means a * 10_000 + b; either side may be empty (to_i => 0).
  wan, rest = digits.split('万', 2)
  wan.to_i * 10_000 + rest.to_i
end
#_parse_right_hotel(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 |
# File 'lib/parsers/right_hotel.rb', line 2
#
# Reads the title of the first `div` whose tpl attribute is "right_hotel".
#
# @param file [Hash] page data; `file[:doc]` must respond to `search(selector)`
# @return [Hash, nil] `{:title => ...}` for the first matching div, or nil
#   when no such div is found
def _parse_right_hotel(file)
  matches = file[:doc].search('div[@tpl="right_hotel"]')
  panel = matches.nil? ? nil : matches.first

  if panel.nil?
    nil
  else
    { :title => Baiduserp::Helper.get_content_safe(panel.search('div.opr-hotel-title')) }
  end
end
#_parse_right_personinfo(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 |
# File 'lib/parsers/right_personinfo.rb', line 2
#
# Reads title, info and source text from the `div` elements whose tpl
# attribute is "right_personinfo".
#
# @param file [Hash] page data; `file[:doc]` must respond to `search(selector)`
# @return [Hash, nil] hash with :title, :info and :source keys, or nil when
#   all three values come back nil
def _parse_right_personinfo(file)
  panel = file[:doc].search('div[@tpl="right_personinfo"]')
  return nil if panel.nil?

  # Result key => selector, looked up in this order.
  selectors = {
    :title => 'span.opr-personinfo-subtitle-large',
    :info => 'div.opr-personinfo-info',
    :source => 'div.opr-personinfo-source a'
  }

  details = {}
  selectors.each do |key, selector|
    details[key] = Baiduserp::Helper.get_content_safe(panel.search(selector))
  end

  details.values.all?(&:nil?) ? nil : details
end
#_parse_right_relaperson(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
# File 'lib/parsers/right_relaperson.rb', line 2
#
# Builds one entry per `div` whose tpl attribute is "right_relaperson",
# pairing the block's subtitle with the names listed inside it.
#
# @param file [Hash] page data; `file[:doc]` must respond to `search(selector)`
# @return [Array<Hash>, nil] hashes with :title and :names keys; :title is
#   nil when the block has no subtitle span. nil only if the search itself
#   returns nil.
def _parse_right_relaperson(file)
  relapersons = file[:doc].search('div[@tpl="right_relaperson"]')
  return nil if relapersons.nil?

  relapersons.map do |rr|
    # A block without the subtitle span used to raise NoMethodError on nil
    # and abort the whole parse; it now yields a nil title.
    tip = rr.search('span.opr-relaperson-subtitle-tip').first
    names = rr.search('p.opr-relaperson-name').map { |name_node| name_node.content }
    { :title => tip && tip.content, :names => names }
  end
end
#_parse_right_weather(file) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 |
# File 'lib/parsers/right_weather.rb', line 2
#
# Reads the title and the weekly-forecast link from the first `div` whose
# tpl attribute is "right_weather".
#
# @param file [Hash] page data; `file[:doc]` must respond to `search(selector)`
# @return [Hash, nil] hash with :title and :week (the href of the first
#   `a.opr-weather-week` link, nil when that link is absent), or nil when
#   no such div is found
def _parse_right_weather(file)
  matches = file[:doc].search('div[@tpl="right_weather"]')
  return nil if matches.nil?

  rw = matches.first
  return nil if rw.nil?

  title = Baiduserp::Helper.get_content_safe(rw.search('div.opr-weather-title'))
  # The week link is optional: indexing nil used to raise NoMethodError.
  week_link = rw.search('a.opr-weather-week').first
  { :title => title, :week => week_link && week_link['href'] }
end
#parse(html) ⇒ Object
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/baiduserp/parser.rb', line 11
#
# Parses an HTML string and runs every method whose name starts with
# `_parse_` against it.
#
# Stores the raw HTML and the Nokogiri document in @file, then calls each
# `_parse_*` method listed by `methods` with @file and records the result
# in @serp under the method name minus its `_parse_` prefix.
#
# @param html [String] page source to parse
# @return [Hash] @serp, mapping section symbols (e.g. :ranks) to whatever
#   the matching `_parse_*` method returned
def parse(html)
  @file = { :html => html }
  @serp = {}
  @file[:doc] = Nokogiri::HTML(html)

  methods.grep(/^_parse_/).each do |parser_name|
    section = parser_name.to_s.sub('_parse_', '').to_sym
    @serp[section] = send(parser_name, @file)
  end

  @serp
end
#parse_file(file_path) ⇒ Object
32 33 34 35 36 37 38 39 40 |
# File 'lib/baiduserp/parser.rb', line 32
#
# Loads a page from a local file or, when no such file exists, fetches it
# through Client, then hands the HTML to #parse.
#
# @param file_path [String] path of an existing file, or a URL to fetch
# @return [Hash] whatever #parse returns for the loaded HTML
def parse_file(file_path)
  html =
    if File.exist?(file_path) # File.exists? was removed in Ruby 3.2
      # File.read closes the handle; the former Kernel#open leaked it and
      # would run a command for a path starting with '|'.
      File.read(file_path)
    else
      # URI.escape was removed in Ruby 3.0; it delegated to this parser call.
      Client.get(URI::DEFAULT_PARSER.escape(file_path)).body
    end

  html = html.encode!('UTF-8', 'UTF-8', :invalid => :replace)
  parse html
end
#search(keyword) ⇒ Object
28 29 30 |
# File 'lib/baiduserp/parser.rb', line 28
#
# Builds the Baidu search URL for a keyword and parses the resulting page.
#
# @param keyword [#to_s] search term, interpolated verbatim into the `wd`
#   query parameter
# @return [Object] whatever #parse_file returns for that URL
# NOTE(review): the keyword is not query-encoded here, so characters such as
#   '&' or '#' presumably alter the query -- confirm how parse_file escapes.
def search(keyword)
  query_url = "http://www.baidu.com/s?wd=#{keyword}"
  parse_file(query_url)
end