Class: DataMapper::YS::Scraper::Page

Inherits:
Base
  • Object
show all
Defined in:
lib/dm-ys/scraper.rb

Overview

Page Scraper

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Base

#base_uri, #count, #register_properties!, #uri

Methods included from CachedAccessor

included

Constructor Details

#initialize(model, uri = nil) ⇒ Page

Returns a new instance of Page.



89
90
91
92
93
94
# File 'lib/dm-ys/scraper.rb', line 89

# Builds a page scraper and eagerly downloads the page body.
#
# Both arguments are forwarded unchanged to Base#initialize by the bare
# +super+ call; +uri+ is then stored in @uri as well.
#
# @param model the model this scraper works for (passed on to Base)
# @param uri   optional location of the page; presumably a String or URI —
#              confirm against Base#uri, which supplies the value actually
#              opened below
#
# NOTE(review): calling +open+ on a URL relies on open-uri's Kernel#open
# patch, which newer Rubies no longer provide — switch to URI.open if
# support for old Rubies is dropped.
def initialize(model, uri = nil)
  super
  @uri  = uri
  # Block form: the handle returned by +open+ is closed as soon as the body
  # has been read, instead of lingering until garbage collection.
  raw   = open(self.uri) { |io| io.read }
  # '-w' asks NKF for UTF-8 output.
  @html = NKF.nkf('-w', raw)
  @invalid_name_count = 0
end

Instance Attribute Details

#html ⇒ Object (readonly)

Returns the value of attribute html.



87
88
89
# File 'lib/dm-ys/scraper.rb', line 87

# Reader for the page body held in @html.
#
# The constructor fills @html with the fetched document after passing it
# through NKF.nkf('-w', ...).
#
# @return the current value of @html
def html
  @html
end

Instance Method Details

#guess_table ⇒ Object



96
97
98
99
# File 'lib/dm-ys/scraper.rb', line 96

# Returns whatever +max_table+ yields, insisting that it found something.
#
# @return the truthy result of +max_table+
# @raise [TableNotFound] when +max_table+ returns nil or false
def guess_table
  candidate = max_table
  raise TableNotFound, "set 'table' or 'tbody' manually" unless candidate

  candidate
end

#inspect ⇒ Object



107
108
109
110
111
112
113
114
# File 'lib/dm-ys/scraper.rb', line 107

# Compact, human-readable summary of the page.
#
# Shows the size of +html+ (suffixed with "bytes"), the value of +names+
# and the value of +count+ (labelled "records"), each rendered as
# "@label=<value.inspect>" and separated by single spaces.
#
# @return [String] "#<ClassName @html=... @names=... @records=...>"
def inspect
  # An array of pairs (not a Hash) keeps the output order fixed.
  pairs = [
    [:html,    "#{html.size}bytes"],
    [:names,   names],
    [:records, count]
  ]
  rendered = pairs.map { |label, value| "@#{label}=#{value.inspect}" }.join(' ')
  "#<#{self.class.name} #{rendered}>"
end

#page_hash ⇒ Object



116
117
118
# File 'lib/dm-ys/scraper.rb', line 116

# Fingerprint of the page's table body.
#
# @return [String] hexadecimal SHA-1 digest of +tbody.inspect+
def page_hash
  fingerprint_source = tbody.inspect
  Digest::SHA1.hexdigest(fingerprint_source)
end


#pagination_links ⇒ Object



101
102
103
104
105
# File 'lib/dm-ys/scraper.rb', line 101

# Lists the links on this page that point below +base_uri+.
#
# Every <a> element of +doc+ is inspected. Hrefs that already carry an
# http:// or https:// scheme are used as they are; all others are resolved
# against +base_uri+. Duplicates are removed and only URLs that begin with
# the textual form of +base_uri+ are returned.
#
# Anchors without an href attribute (e.g. <a name="top">) are skipped.
#
# @return [Array<String>] unique URLs, in document order
def pagination_links
  base = base_uri
  # Built once rather than once per URL; \A anchors at the start of the
  # string, not merely at the start of a line.
  under_base = /\A#{Regexp.escape(base.to_s)}/

  hrefs = (doc / "a").map { |anchor| anchor[:href] }.compact
  urls  = hrefs.map do |href|
    # Require the "://" so relative paths such as "httpdocs/x" are still
    # resolved against the base instead of being mistaken for absolute URLs.
    href =~ %r{\Ahttps?://}i ? href : (base + href).to_s
  end

  urls.uniq.select { |url| url =~ under_base }
end