Class: TheScrap::ListObj

Inherits:
Scrap
  • Object
show all
Defined in:
lib/the_scrap/list_obj.rb

Instance Attribute Summary collapse

Attributes inherited from Scrap

#base_url, #data_proc, #debug, #detail_info, #encoding, #html_proc, #item_frag, #result_proc, #url, #verbose

Instance Method Summary collapse

Methods inherited from Scrap

#method_missing, #retryable

Constructor Details

#initialize ⇒ ListObj

Returns a new instance of ListObj.



11
12
13
14
# File 'lib/the_scrap/list_obj.rb', line 11

# Runs the parent constructor, then starts the instance with an empty
# list of item filters.
def initialize
  super()
  @item_filters = []
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method in the class TheScrap::Scrap

Instance Attribute Details

#get_next_url ⇒ Object

总页数模式时,下一页的URL生成方式,方法



9
10
11
# File 'lib/the_scrap/list_obj.rb', line 9

# Reader for @get_next_url: in total-pages mode, the callable that builds
# the URL of a following page. The pagination code invokes it as
# get_next_url.call(url, page_index).
def get_next_url
  @get_next_url
end

#get_page_count ⇒ Object

总页数模式时取总页数方法,不用CSS因为很可能需要重新处理数字。



8
9
10
# File 'lib/the_scrap/list_obj.rb', line 8

# Reader for @get_page_count: in total-pages mode, the callable that
# returns the total number of pages. It is invoked with the parsed first
# page (get_page_count.call(doc)); a callable is used instead of a CSS
# selector because the number will likely need extra processing.
def get_page_count
  @get_page_count
end

#has_many_pages ⇒ Object

是否多页



5
6
7
# File 'lib/the_scrap/list_obj.rb', line 5

# Reader for @has_many_pages: whether the list spans multiple pages.
# scrap_list stops after the first page unless this is truthy.
def has_many_pages
  @has_many_pages
end

#item_filters ⇒ Object

条目过滤



4
5
6
# File 'lib/the_scrap/list_obj.rb', line 4

# Reader for @item_filters: the item filters, initialised to an empty
# array by the constructor. While scraping, each filter is called with the
# item's info hash; an item is skipped as soon as one filter returns a
# falsy value.
def item_filters
  @item_filters
end

#next_page_css ⇒ Object

下一页模式时取下一页链接的 css selector



7
8
9
# File 'lib/the_scrap/list_obj.rb', line 7

# Reader for @next_page_css: in next-page mode, the CSS selector used to
# find the link to the next page (its first match's href is followed).
def next_page_css
  @next_page_css
end

#pager_method ⇒ Object

分页模式



6
7
8
# File 'lib/the_scrap/list_obj.rb', line 6

# Reader for @pager_method: the pagination mode. scrap_list checks it
# against :next_page, :total_pages and :total_records.
def pager_method
  @pager_method
end

Instance Method Details

#scrap(url) ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/the_scrap/list_obj.rb', line 16

# Fetches one list page, extracts its items and runs the configured hooks.
#
# Steps, in order:
# 1. open +url+ and pass the content through every +html_proc+ transform;
# 2. parse the result with Nokogiri using +encoding+;
# 3. for each node matched by +item_frag+: collect attributes via
#    +get_attrs+, drop the item if any +item_filters+ entry rejects it,
#    run the +detail_info+ scrapers, then the +data_proc+ hooks;
# 4. hand the collected items to every +result_proc+ hook.
#
# When +debug?+ is truthy the first kept item is pretty-printed and the
# item loop stops.
#
# @param url [String] address of the list page
# @return [Array] two elements: the parsed document and the array of item
#   info hashes that passed the filters
def scrap(url)
  # URI.open instead of a bare `open`: open-uri's Kernel#open override was
  # removed in Ruby 3.0, so `open(url)` no longer fetches URLs there.
  # URI.open is provided by open-uri (Ruby >= 2.5).
  source = URI.open(url)
  begin
    html = html_proc.inject(source) { |content, transform| transform.call(content) }
    doc = Nokogiri::HTML(html, nil, encoding)
  ensure
    # The document is fully parsed at this point, so the handle returned by
    # URI.open can be released instead of waiting for garbage collection.
    source.close if source.respond_to?(:close) && !source.closed?
  end

  items = []
  doc.css(item_frag).each do |item|
    item_info = {}
    get_attrs(url, item, item_info)

    # Filter items: all? stops at the first filter that rejects the item.
    next unless item_filters.all? { |filter| filter.call(item_info) }

    # Detail pages: each entry pairs a scraper (index 0) with the item_info
    # key (index 1) holding the detail page address.
    detail_info.each do |detail|
      detail[0].scrap(item_info[detail[1]], item_info)
    end

    # Per-item result processing.
    data_proc.each { |dp| dp.call(url, item_info) }

    items << item_info

    if debug?
      pp item_info
      break
    end
  end

  result_proc.each { |rp| rp.call(url, items) }

  [doc, items]
end

#scrap_list ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/the_scrap/list_obj.rb', line 63

# Scrapes the first page at +url+ and, when +has_many_pages+ is truthy,
# walks the remaining pages according to +pager_method+:
#
# * +:next_page+     - follow the link matched by +next_page_css+ on each page
# * +:total_pages+   - ask +get_page_count+ for the page total, then build
#                      the URLs of pages 2..total with +get_next_url+
# * +:total_records+ - not implemented yet
#
# Every page fetch goes through +retryable+ (3 tries on Timeout::Error).
# In both implemented modes paging stops early when a page yields no items
# or when +debug?+ is truthy.
#
# @return [nil]
def scrap_list
  first_doc, = scrap_with_retry(url)
  return unless has_many_pages

  case pager_method
  when :next_page then scrap_next_page_links(first_doc)
  when :total_pages then scrap_numbered_pages(first_doc)
  when :total_records
    # TODO: paging driven by a total record count. This can also be done
    # with :total_pages by computing the page count from the record count
    # beforehand.
  end

  nil
end

# Scrapes a single page, retrying up to 3 times on Timeout::Error.
# Returns whatever +scrap+ returns (the document and the items).
private def scrap_with_retry(page_url)
  retryable(:tries => 3, :on => Timeout::Error) do
    scrap(page_url)
  end
end

# Next-page mode: keeps following the first +next_page_css+ link of the
# current document until there is no link, the link has no usable href,
# or it leads to a URL that was already scraped.
private def scrap_next_page_links(doc)
  # Remember every scraped URL, not just the previous one, so that link
  # cycles longer than one page (A -> B -> A) also terminate.
  visited = { url => true }
  current_url = url

  while (node = doc.css(next_page_css).first)
    href = node['href']
    # A matched node without an href cannot be followed (URI.join would raise).
    break if href.nil? || href.strip.empty?

    next_page_url = URI.join(current_url, href).to_s
    break if visited.key?(next_page_url)

    puts "url: #{next_page_url}" if verbose?
    doc, items = scrap_with_retry(next_page_url)

    visited[next_page_url] = true
    current_url = next_page_url
    break if items.empty?
    break if debug?
  end
end

# Total-pages mode: page numbering starts at 1 and page 1 has already been
# scraped, so pages 2..total are requested here.
private def scrap_numbered_pages(first_doc)
  # to_i guards the range: a nil count would otherwise produce an endless
  # range and a String count would raise.
  page_cnt = get_page_count.call(first_doc).to_i

  (2..page_cnt).each do |idx|
    next_page_url = get_next_url.call(url, idx)
    puts next_page_url if verbose?
    _doc, items = scrap_with_retry(next_page_url)
    break if items.empty?
    break if debug?
  end
end