Class: Baiduserp::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/parsers/ranks.rb,
lib/parsers/ads_top.rb,
lib/baiduserp/parser.rb,
lib/parsers/ads_right.rb,
lib/parsers/result_num.rb,
lib/parsers/right_hotel.rb,
lib/parsers/pinpaizhuanqu.rb,
lib/parsers/right_weather.rb,
lib/parsers/related_keywords.rb,
lib/parsers/right_personinfo.rb,
lib/parsers/right_relaperson.rb

Instance Method Summary collapse

Instance Method Details

#_parse_ads_right(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/parsers/ads_right.rb', line 2

# Builds one entry per `div.EC_im` element found in the parsed document.
#
# @param file [Hash] `file[:doc]` must respond to `search` (the parsed page)
# @return [Array<Hash>] hashes with the keys :rank, :title, :content and :site
def _parse_ads_right(file)
  file[:doc].search('div.EC_im').map do |ad|
    {
      # The first 'bdfs' in the id attribute is removed; the remaining number is shifted up by one.
      :rank    => ad['id'].sub('bdfs', '').to_i + 1,
      :title   => Baiduserp::Helper.get_content_safe(ad.search('a.EC_t')),
      :content => Baiduserp::Helper.get_content_safe(ad.search('a.EC_desc/font')),
      :site    => Baiduserp::Helper.get_content_safe(ad.search('font.EC_url'))
    }
  end
end

#_parse_ads_top(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/parsers/ads_top.rb', line 2

# Extracts ad entries from the parsed document using two alternative layouts.
#
# First every `div.ec_pp_f` whose numeric id is at least 3000 is collected,
# with that id used as the rank. Only when this yields nothing are the
# `table.ec_pp_f` elements used instead, ranked by their position (1-based).
#
# @param file [Hash] `file[:doc]` must respond to `search`
# @return [Array<Hash>] hashes with the keys :rank, :title, :content and :site
def _parse_ads_top(file)
  doc = file[:doc]

  ads = doc.search('div.ec_pp_f').each_with_object([]) do |node, found|
    rank = node['id'].to_i
    next if rank < 3000

    found << {
      rank: rank,
      title: Baiduserp::Helper.get_content_safe(node.search('div.ec_title')),
      content: Baiduserp::Helper.get_content_safe(node.search('div.ec_desc')),
      site: Baiduserp::Helper.get_content_safe(node.search('span.ec_url'))
    }
  end
  return ads unless ads.empty?

  # Fallback layout: table based markup, ranked by order of appearance.
  doc.search('table.ec_pp_f').each_with_index.map do |node, position|
    {
      rank: position + 1,
      title: Baiduserp::Helper.get_content_safe(node.search('td.EC_header/a')),
      content: Baiduserp::Helper.get_content_safe(node.search('a.EC_desc')),
      site: Baiduserp::Helper.get_content_safe(node.search('a.EC_url'))
    }
  end
end

#_parse_pinpaizhuanqu(file) ⇒ Object



2
3
4
# File 'lib/parsers/pinpaizhuanqu.rb', line 2

# Reports whether the raw page HTML contains the marker 'bs.baidu.com/adcoup-mat'.
#
# @param file [Hash] `file[:html]` holds the raw page source
# @return [Boolean] true when the marker substring is present
def _parse_pinpaizhuanqu(file)
  marker = 'bs.baidu.com/adcoup-mat'
  file[:html].include?(marker)
end

#_parse_ranks(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/parsers/ranks.rb', line 2

# Extracts the numbered result entries from every <table> in the document.
#
# A table is only considered when its id attribute converts to a positive
# integer; that integer becomes the rank.
#
# @param file [Hash] `file[:doc]` must respond to `search`
# @return [Array<Hash>] hashes with the keys :rank, :url, :title, :content,
#   :mu and :baiduopen
def _parse_ranks(file)
  file[:doc].search("//table").each_with_object([]) do |table, result|
    rank = table['id'].to_i
    next unless rank > 0

    # The anchor may be absent, or present without an href; both leave url nil
    # instead of raising on nil.
    anchor = table.search('h3/a').first
    url = anchor && anchor['href']
    if url && url.include?('http://www.baidu.com/link?')
      # Redirect-style links are requested so the 'location' response header
      # can be used as the URL.
      url = Baiduserp::Client.get(url).headers['location']
    end

    result << {
      :rank => rank,
      :url => url,
      :title => Baiduserp::Helper.get_content_safe(table.search('h3')),
      :content => Baiduserp::Helper.get_content_safe(table.search('div.c-abstract')),
      :mu => table['mu'],
      # True as soon as any link inside the table points at open.baidu.com.
      :baiduopen => table.search('a').any? { |link| link['href'].to_s.include?('open.baidu.com') }
    }
  end
end


2
3
4
5
6
7
8
9
10
# File 'lib/parsers/related_keywords.rb', line 2

# Gathers the text of every link located inside an element matched by
# `div[@id="rs"]`.
#
# @param file [Hash] `file[:doc]` must respond to `search`
# @return [Array] the `content` of each link, in document order
def _parse_related_keywords(file)
  file[:doc].search('div[@id="rs"]').flat_map do |container|
    container.css('a').map { |anchor| anchor.content }
  end
end

#_parse_result_num(file) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
# File 'lib/parsers/result_num.rb', line 4

# Reads the total result count out of the raw page HTML.
#
# The count is the text between '找到相关结果' and the first following '个'.
# A leading '约' ("approximately") and thousands separators are discarded.
# When the text contains '万' (ten thousand), the part before it is multiplied
# by 10_000 and the part after it is added.
#
# @param file [Hash] `file[:html]` holds the raw page source
# @return [Integer] the parsed count, or 0 when no count is found
def _parse_result_num(file)
  # Non-greedy capture so a later '个' on the same line is not swallowed.
  raw = file[:html][/找到相关结果(.*?)个/, 1].to_s
  digits = raw.gsub('约', '').gsub(',', '')

  if digits.include?('万')
    wan, rest = digits.split('万', 2)
    wan.to_i * 10_000 + rest.to_i
  else
    digits.to_i
  end
end

#_parse_right_hotel(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
# File 'lib/parsers/right_hotel.rb', line 2

# Looks up the first `div[@tpl="right_hotel"]` element and returns its title.
#
# @param file [Hash] `file[:doc]` must respond to `search`
# @return [Hash, nil] `{:title => ...}`, or nil when no such element exists
def _parse_right_hotel(file)
  boxes = file[:doc].search('div[@tpl="right_hotel"]')
  box = boxes.nil? ? nil : boxes.first
  return nil if box.nil?

  { :title => Baiduserp::Helper.get_content_safe(box.search('div.opr-hotel-title')) }
end

#_parse_right_personinfo(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
# File 'lib/parsers/right_personinfo.rb', line 2

# Reads title, info and source from the `div[@tpl="right_personinfo"]` match.
#
# @param file [Hash] `file[:doc]` must respond to `search`
# @return [Hash, nil] hash with :title, :info and :source, or nil when the
#   search yields nil or all three values are nil
def _parse_right_personinfo(file)
  panel = file[:doc].search('div[@tpl="right_personinfo"]')
  return nil if panel.nil?

  selectors = {
    :title => 'span.opr-personinfo-subtitle-large',
    :info => 'div.opr-personinfo-info',
    :source => 'div.opr-personinfo-source a'
  }

  details = {}
  selectors.each do |key, selector|
    details[key] = Baiduserp::Helper.get_content_safe(panel.search(selector))
  end

  details.values.all?(&:nil?) ? nil : details
end

#_parse_right_relaperson(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# File 'lib/parsers/right_relaperson.rb', line 2

# Collects every `div[@tpl="right_relaperson"]` group with its subtitle and
# the names listed inside it.
#
# @param file [Hash] `file[:doc]` must respond to `search`
# @return [Array<Hash>, nil] one `{:title, :names}` hash per group, or nil
#   when the search yields nil
def _parse_right_relaperson(file)
  relapersons = file[:doc].search('div[@tpl="right_relaperson"]')
  return nil if relapersons.nil?

  relapersons.map do |group|
    # A group without a subtitle element gets a nil title instead of raising.
    tip = group.search('span.opr-relaperson-subtitle-tip').first
    {
      :title => tip.nil? ? nil : tip.content,
      :names => group.search('p.opr-relaperson-name').map { |name| name.content }
    }
  end
end

#_parse_right_weather(file) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
# File 'lib/parsers/right_weather.rb', line 2

# Reads the title and the week link from the first
# `div[@tpl="right_weather"]` element.
#
# @param file [Hash] `file[:doc]` must respond to `search`
# @return [Hash, nil] `{:title, :week}` where :week is the href of the first
#   `a.opr-weather-week` link (nil when there is no such link), or nil when
#   the weather element itself is absent
def _parse_right_weather(file)
  boxes = file[:doc].search('div[@tpl="right_weather"]')
  return nil if boxes.nil?

  box = boxes.first
  return nil if box.nil?

  title = Baiduserp::Helper.get_content_safe(box.search('div.opr-weather-title'))

  # The week link is optional; without it :week is nil rather than an error.
  week_link = box.search('a.opr-weather-week').first
  week = week_link.nil? ? nil : week_link['href']

  { :title => title, :week => week }
end

#parse(html) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/baiduserp/parser.rb', line 11

# Parses a page and runs every `_parse_*` method against it.
#
# The raw HTML and its Nokogiri document are stored in @file. Each method
# listed by `methods` whose name starts with `_parse_` is called with @file,
# and its return value is stored in @serp under the method name with the
# `_parse_` prefix removed.
#
# @param html [String] raw page source
# @return [Hash] @serp, mapping section name (Symbol) to that parser's result
def parse(html)
  @file = {}
  @serp = {}

  @file[:html] = html
  @file[:doc] = Nokogiri::HTML(html)

  methods.grep(/^_parse_/).each do |name|
    section = name.to_s.sub('_parse_', '').to_sym
    @serp[section] = send(name, @file)
  end

  @serp
end

#parse_file(file_path) ⇒ Object



32
33
34
35
36
37
38
39
40
# File 'lib/baiduserp/parser.rb', line 32

# Loads a page from disk or over HTTP and hands it to #parse.
#
# When file_path names an existing file its contents are read; otherwise
# file_path is treated as a URL, escaped, and fetched through Client.get.
#
# @param file_path [String] local path or URL
# @return [Hash] whatever #parse returns for the loaded HTML
def parse_file(file_path)
  html =
    if File.exist?(file_path)
      # File.read opens and closes the file itself, so no handle is left open.
      File.read(file_path)
    else
      # URI.escape no longer exists in current Ruby; the parser's escape
      # method applies the same escaping rules.
      Client.get(URI::DEFAULT_PARSER.escape(file_path)).body
    end

  # Re-encode as UTF-8, replacing invalid byte sequences. The non-bang form
  # avoids mutating (or failing on) a frozen response body.
  html = html.encode('UTF-8', 'UTF-8', :invalid => :replace)
  parse html
end

#search(keyword) ⇒ Object



28
29
30
# File 'lib/baiduserp/parser.rb', line 28

# Runs a search for the keyword and parses the resulting page.
#
# The keyword is interpolated as-is into the `wd` query parameter and the
# resulting URL is passed to #parse_file.
#
# @param keyword [#to_s] search term
# @return [Object] whatever #parse_file returns
def search(keyword)
  url = "http://www.baidu.com/s?wd=#{keyword}"
  parse_file(url)
end