Class: TheScrap::ListObj

Inherits:

Scrap

Object
Scrap
TheScrap::ListObj

show all

Defined in: lib/the_scrap/list_obj.rb

Instance Attribute Summary collapse

#get_next_url ⇒ Object

总页数模式时，用于生成下一页 URL 的方法。
#get_page_count ⇒ Object

总页数模式时取总页数的方法；不用 CSS，因为很可能需要重新处理数字。
#has_many_pages ⇒ Object

是否多页.
#item_filters ⇒ Object

条目过滤.
#next_page_css ⇒ Object

下一页模式时取下一页链接的 css selector.
#pager_method ⇒ Object

分页模式.

Attributes inherited from Scrap

#base_url, #data_proc, #debug, #detail_info, #encoding, #html_proc, #item_frag, #result_proc, #url, #verbose

Instance Method Summary collapse

#initialize ⇒ ListObj constructor

A new instance of ListObj.
#scrap(url) ⇒ Object
#scrap_list ⇒ Object

Methods inherited from Scrap

#method_missing, #retryable

Constructor Details

#initialize ⇒ `ListObj`

Returns a new instance of ListObj.

# File 'lib/the_scrap/list_obj.rb', line 11

# Builds a list scraper; the parent initializer runs first, then the
# item filter list starts out empty.
def initialize
  super
  # Filters are callables; #scrap drops an item when one returns a falsy value.
  @item_filters = []
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method in the class TheScrap::Scrap

Instance Attribute Details

#get_next_url ⇒ `Object`

总页数模式时，用于生成下一页 URL 的方法



9
10
11

# File 'lib/the_scrap/list_obj.rb', line 9

# Reader for the callable that builds the URL of a further page when paging
# by total page count; #scrap_list invokes it as call(url, page_index).
def get_next_url
  @get_next_url
end

#get_page_count ⇒ `Object`

总页数模式时取总页数方法,不用CSS因为很可能需要重新处理数字。



8
9
10

# File 'lib/the_scrap/list_obj.rb', line 8

# Reader for the callable that yields the total number of pages;
# #scrap_list invokes it with the parsed first page. A callable is used
# rather than a CSS selector because the number often needs post-processing.
def get_page_count
  @get_page_count
end

#has_many_pages ⇒ `Object`

是否多页



5
6
7

# File 'lib/the_scrap/list_obj.rb', line 5

# Whether the list spans several pages; when falsy, #scrap_list stops
# after scraping the first page.
def has_many_pages
  @has_many_pages
end

#item_filters ⇒ `Object`

条目过滤



4
5
6

# File 'lib/the_scrap/list_obj.rb', line 4

# Item filters: callables that #scrap calls with each item's info hash;
# an item is skipped as soon as one of them returns a falsy value.
def item_filters
  @item_filters
end

#next_page_css ⇒ `Object`

下一页模式时取下一页链接的 css selector



7
8
9

# File 'lib/the_scrap/list_obj.rb', line 7

# CSS selector that locates the next-page link; #scrap_list uses it in the
# :next_page pager mode and reads the 'href' of the first match.
def next_page_css
  @next_page_css
end

#pager_method ⇒ `Object`

分页模式



6
7
8

# File 'lib/the_scrap/list_obj.rb', line 6

# Paging mode. #scrap_list recognises :next_page, :total_pages and
# :total_records (the last one is still a TODO there).
def pager_method
  @pager_method
end

Instance Method Details

#scrap(url) ⇒ `Object`

# File 'lib/the_scrap/list_obj.rb', line 16

# Fetches one list page, extracts its items and runs the configured
# filter / detail / callback pipeline over them.
#
# Steps, in order:
# 1. open +url+ and pass the content through every +html_proc+ callable;
# 2. parse the result and select item nodes with +item_frag+;
# 3. for each node fill an info hash via +get_attrs+, drop it if any
#    +item_filters+ callable returns a falsy value, follow +detail_info+
#    scrapers, then call every +data_proc+ with (url, item_info);
# 4. call every +result_proc+ with (url, items).
#
# In debug mode only the first kept item is processed and it is printed.
#
# @param url [String] location of the list page
# @return [Array] two elements: the parsed document and the array of item
#   info hashes that passed all filters
def scrap(url)
  # Kernel#open no longer handles URLs since Ruby 3.0; URI.open (open-uri)
  # is the supported entry point and still falls back to local paths.
  source = URI.open(url)
  begin
    html = html_proc.inject(source) { |content, handler| handler.call(content) }
    doc = Nokogiri::HTML(html, nil, encoding)
  ensure
    # Parsing is finished (or failed), so the handle must not stay open.
    source.close if source.respond_to?(:close) && !source.closed?
  end

  items = []
  doc.css(item_frag).each do |item|
    item_info = {}
    get_attrs(url, item, item_info)

    # Keep the item only when every filter accepts it; all? stops at the
    # first rejecting filter.
    next unless item_filters.all? { |filter| filter.call(item_info) }

    # Detail pages: each entry is [detail scraper, key holding the detail URL].
    detail_info.each do |detail|
      detail[0].scrap(item_info[detail[1]], item_info)
    end

    # Per-item callbacks.
    data_proc.each { |dp| dp.call(url, item_info) }

    items << item_info

    if debug?
      pp item_info
      break
    end
  end

  # Per-page callbacks.
  result_proc.each { |rp| rp.call(url, items) }

  return doc, items
end

#scrap_list ⇒ `Object`

# File 'lib/the_scrap/list_obj.rb', line 63

# Scrapes the first list page at +url+ and, when +has_many_pages+ is set,
# the remaining pages according to +pager_method+.
#
# Pager modes:
# * +:next_page+     - follow the first node matched by +next_page_css+ on
#   each page, resolving its 'href' against the current page URL.
# * +:total_pages+   - ask +get_page_count+ for the page count (pages are
#   numbered from 1) and build every further URL with +get_next_url+.
# * +:total_records+ - not implemented; it can be expressed through
#   +:total_pages+ by computing the page count from the record count first.
#
# Paging stops early when a page yields no items or when debug mode is on.
#
# @return [Object] not meaningful; results are delivered through the
#   callbacks that #scrap invokes
def scrap_list
  # Every page goes through the same call: retryable is asked for three
  # tries on Timeout::Error around #scrap.
  fetch = lambda do |page_url|
    retryable(:tries => 3, :on => Timeout::Error) { scrap(page_url) }
  end

  doc, _items = fetch.call(url)
  return unless has_many_pages

  # TODO: Refactor it
  case pager_method
  when :next_page # pages are chained through a "next page" link
    current_url = url
    # Remember every visited page so that a cycle of links (A -> B -> A)
    # cannot keep the loop running forever.
    visited = { current_url.to_s => true }
    while (node = doc.css(next_page_css).first)
      href = node['href']
      break if href.nil? # a "next" node without a target ends the list

      current_url = URI.join(current_url, href).to_s
      break if visited[current_url]

      visited[current_url] = true

      puts "url: #{current_url}" if verbose?
      doc, items = fetch.call(current_url)

      break if items.count == 0
      break if debug?
    end
  when :total_pages # the total page count is known; pages start at 1
    # to_i keeps a nil result from turning 2..nil into an endless range and
    # accepts counts returned as numeric strings.
    page_cnt = get_page_count.call(doc).to_i
    (2..page_cnt).each do |idx|
      page_url = get_next_url.call(url, idx)
      puts page_url if verbose?
      _doc, items = fetch.call(page_url)
      break if items.count == 0
      break if debug?
    end
  when :total_records
    # TODO: mode based on the total record count. It can also be served by
    # the :total_pages mode if the caller derives the page count from the
    # record count beforehand.
  end
end

Class: TheScrap::ListObj

Instance Attribute Summary collapse

Attributes inherited from Scrap

Instance Method Summary collapse

Methods inherited from Scrap

Constructor Details

#initialize ⇒ ListObj

Dynamic Method Handling

Instance Attribute Details

#get_next_url ⇒ Object

#get_page_count ⇒ Object

#has_many_pages ⇒ Object

#item_filters ⇒ Object

#next_page_css ⇒ Object

#pager_method ⇒ Object