Class: TheScrap::ListObj
Instance Attribute Summary collapse
-
#get_next_url ⇒ Object
总页数模式时,用于生成下一页 URL 的方法。
-
#get_page_count ⇒ Object
总页数模式时取总页数方法,不用CSS因为很可能需要重新处理数字。
-
#has_many_pages ⇒ Object
是否多页.
-
#item_filters ⇒ Object
条目过滤.
-
#next_page_css ⇒ Object
下一页模式时取下一页链接的 css selector.
-
#pager_method ⇒ Object
分页模式.
Attributes inherited from Scrap
#base_url, #data_proc, #debug, #detail_info, #encoding, #html_proc, #item_frag, #result_proc, #url, #verbose
Instance Method Summary collapse
-
#initialize ⇒ ListObj
constructor
A new instance of ListObj.
- #scrap(url) ⇒ Object
- #scrap_list ⇒ Object
Methods inherited from Scrap
Constructor Details
#initialize ⇒ ListObj
Returns a new instance of ListObj.
11 12 13 14 |
# File 'lib/the_scrap/list_obj.rb', line 11 def initialize() super @item_filters = [] end |
Dynamic Method Handling
This class handles dynamic methods through the method_missing method in the class TheScrap::Scrap
Instance Attribute Details
#get_next_url ⇒ Object
总页数模式时,下一页的URL生成方式,方法
9 10 11 |
# File 'lib/the_scrap/list_obj.rb', line 9 def get_next_url @get_next_url end |
#get_page_count ⇒ Object
总页数模式时取总页数方法,不用CSS因为很可能需要重新处理数字。
8 9 10 |
# File 'lib/the_scrap/list_obj.rb', line 8 def get_page_count @get_page_count end |
#has_many_pages ⇒ Object
是否多页
5 6 7 |
# File 'lib/the_scrap/list_obj.rb', line 5 def has_many_pages @has_many_pages end |
#item_filters ⇒ Object
条目过滤
4 5 6 |
# File 'lib/the_scrap/list_obj.rb', line 4 def item_filters @item_filters end |
#next_page_css ⇒ Object
下一页模式时取下一页链接的 css selector
7 8 9 |
# File 'lib/the_scrap/list_obj.rb', line 7 def next_page_css @next_page_css end |
#pager_method ⇒ Object
分页模式
6 7 8 |
# File 'lib/the_scrap/list_obj.rb', line 6 def pager_method @pager_method end |
Instance Method Details
#scrap(url) ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/the_scrap/list_obj.rb', line 16
#
# Fetches one list page and extracts an info hash for every element that
# matches +item_frag+.
#
# Processing order for the page:
# 1. the raw content is passed through each +html_proc+ in turn (each proc's
#    return value replaces the content);
# 2. the result is parsed with Nokogiri using +encoding+;
# 3. for every matching element, +get_attrs+ is called with a fresh hash
#    (presumably it fills the hash in -- the helper is defined elsewhere);
# 4. items rejected by any +item_filters+ entry are skipped;
# 5. each +detail_info+ entry is used as [scraper, key]: the scraper's
#    +scrap+ is called with item_info[key] and the item hash;
# 6. each +data_proc+ is called with (url, item_info);
# 7. after the loop each +result_proc+ is called with (url, items).
#
# In debug mode the first kept item is pretty-printed and the loop stops.
#
# @param url [String] location of the list page
# @return [Array] two elements: the parsed document and the array of item hashes
def scrap(url)
  items = []

  # Kernel#open no longer opens URLs as of Ruby 3.0; prefer URI.open when
  # open-uri exposes it publicly and fall back to the historic call otherwise.
  html = URI.respond_to?(:open) ? URI.open(url) : open(url)
  html_proc.each { |dp| html = dp.call(html) }

  doc = Nokogiri::HTML(html, nil, encoding)
  doc.css(item_frag).each do |item|
    item_info = {}
    get_attrs(url, item, item_info)

    # Filter items: keep only those every filter accepts (stops at the first
    # rejecting filter, like the original flag-and-break loop).
    next unless item_filters.all? { |filter| filter.call(item_info) }

    # Follow detail pages, if any are configured.
    detail_info.each do |detail|
      detail[0].scrap(item_info[detail[1]], item_info)
    end

    # Per-item processing hooks.
    data_proc.each { |dp| dp.call(url, item_info) }

    items << item_info

    pp item_info if debug?
    break if debug?
  end

  # Per-page processing hooks.
  result_proc.each { |rp| rp.call(url, items) }

  [doc, items]
end
#scrap_list ⇒ Object
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/the_scrap/list_obj.rb', line 63
#
# Scrapes the first list page at +url+ and, when +has_many_pages+ is set,
# continues with the remaining pages according to +pager_method+:
#
# * :next_page     -- follow the link matched by +next_page_css+ until there
#                     is no link, the link has no href, the link resolves to
#                     the URL just visited, or a page yields no items;
# * :total_pages   -- ask +get_page_count+ for the number of pages (the first
#                     page is number 1) and build the URL of pages 2..count
#                     with +get_next_url+, stopping early on an empty page;
# * :total_records -- not implemented.
#
# Every page fetch is wrapped in +retryable+ (3 tries, on Timeout::Error).
# In debug mode at most one additional page is fetched.
def scrap_list
  # Single place for the retry policy shared by every page fetch.
  fetch_page = lambda do |page_url|
    retryable(tries: 3, on: Timeout::Error) { scrap(page_url) }
  end

  doc, items = fetch_page.call(url)
  return unless has_many_pages

  # TODO: refactor
  case pager_method
  when :next_page
    # Pages expose a "next page" link.
    next_page_url = nil
    prev_page_url = nil
    while (node = doc.css(next_page_css).first)
      # A matched node without an href cannot be followed; stop instead of
      # letting URI.join raise on nil.
      href = node['href']
      break unless href

      next_page_url = URI.join(next_page_url || url, href).to_s
      # The link points back at the page just visited: last page reached.
      break if prev_page_url == next_page_url

      puts "url: #{next_page_url}" if verbose?
      doc, items = fetch_page.call(next_page_url)
      prev_page_url = next_page_url
      break if items.empty?
      break if debug?
    end
  when :total_pages
    # The total page count can be read from the first page; numbering starts at 1.
    # The count may come back as scraped text (or nil); coerce it so the
    # range below is always a finite Integer range.
    page_cnt = get_page_count.call(doc).to_i
    (2..page_cnt).each do |idx|
      next_page_url = get_next_url.call(url, idx)
      puts next_page_url if verbose?
      doc, items = fetch_page.call(next_page_url)
      break if items.empty?
      break if debug?
    end
  when :total_records
    # TODO: the total record count is available. This could reuse the
    # :total_pages mode by computing the page count from the record count
    # outside this method first.
  end
end