Class: RailsSpider::Resource

Inherits:
Object
  • Object
show all
Defined in:
lib/rails_spider/resource.rb

Constant Summary collapse

# Regexp source substituted for every placeholder that SYMBOL_EXP finds:
# a single capture group of one or more characters other than "/", "." and "?".
# Frozen so the shared constant cannot be mutated in place.
DEFAULT_EXP = "([^\/.?]+)".freeze
# Matches a colon followed by word characters, e.g. ":id".
SYMBOL_EXP = /:\w+/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(work, **options) ⇒ Resource

Returns a new instance of Resource.



10
11
12
13
14
15
16
17
18
# File 'lib/rails_spider/resource.rb', line 10

# Builds a resource that crawls the site described by +work+.
#
# The host, list path, item path and page parameter name are copied from
# +work+, and paging starts at page 1.
#
# @param work [#host, #list_path, #item_path, #page_params] crawl description
# @param options [Hash] optional settings
# @option options [#links, #body] :fetcher object used to download pages;
#   when omitted, an already-set @fetcher is kept, otherwise a new
#   RailsSpider::Mechanize is created
def initialize(work, **options)
  @work = work
  @host = work.host
  @list_path = work.list_path
  @item_path = work.item_path
  @page_params = work.page_params
  @page = 1
  # An explicit :fetcher wins; without one the behaviour matches the former
  # `@fetcher ||= RailsSpider::Mechanize.new`.
  @fetcher = options[:fetcher] || @fetcher || RailsSpider::Mechanize.new
end

Instance Attribute Details

#fetcher ⇒ Object (readonly)

Returns the value of attribute fetcher.



5
6
7
# File 'lib/rails_spider/resource.rb', line 5

# Reader for @fetcher, the object whose #links and #body methods are used
# to download pages.
#
# @return [Object] the fetcher assigned in #initialize
def fetcher
  @fetcher
end

#host ⇒ Object (readonly)

Returns the value of attribute host.



5
6
7
# File 'lib/rails_spider/resource.rb', line 5

# Reader for @host, which #initialize copies from work.host and #list_url
# uses as the base when joining list_path.
#
# @return [Object] the stored host
def host
  @host
end

#item_path ⇒ Object (readonly)

Returns the value of attribute item_path.



5
6
7
# File 'lib/rails_spider/resource.rb', line 5

# Reader for @item_path, which #initialize copies from work.item_path and
# #item_exp turns into a Regexp.
#
# @return [Object] the stored item path pattern
def item_path
  @item_path
end

#list_path ⇒ Object (readonly)

Returns the value of attribute list_path.



5
6
7
# File 'lib/rails_spider/resource.rb', line 5

# Reader for @list_path, which #initialize copies from work.list_path and
# #list_url joins onto host.
#
# @return [Object] the stored list path
def list_path
  @list_path
end

#page ⇒ Object

Returns the value of attribute page.



6
7
8
# File 'lib/rails_spider/resource.rb', line 6

# Reader for @page, the current listing page number. #initialize sets it
# to 1 and #run increments it after each page.
#
# @return [Object] the current page number
def page
  @page
end

#page_params ⇒ Object (readonly)

Returns the value of attribute page_params.



5
6
7
# File 'lib/rails_spider/resource.rb', line 5

# Reader for @page_params, which #initialize copies from work.page_params.
# #list_url uses it as the query-string key that carries the page number.
#
# @return [Object] the stored page parameter name
def page_params
  @page_params
end

#work ⇒ Object (readonly)

Returns the value of attribute work.



5
6
7
# File 'lib/rails_spider/resource.rb', line 5

# Reader for @work, the object passed to #initialize. #save reads its id
# when looking up the stored record.
#
# @return [Object] the work this resource was built from
def work
  @work
end

Instance Method Details

#get_items ⇒ Object



32
33
34
# File 'lib/rails_spider/resource.rb', line 32

# Fetches the links on the current listing page and keeps the ones that
# match the item pattern.
#
# @return [Object] the subset of fetcher.links(list_url) for which
#   item_exp matches
def get_items
  links = fetcher.links(list_url)
  # item_exp compiles a new Regexp on every call, so build it once here
  # instead of once per link inside the block.
  pattern = item_exp
  links.select { |link| pattern.match?(link) }
end

#item_exp ⇒ Object



53
54
55
# File 'lib/rails_spider/resource.rb', line 53

# Compiles item_path into the Regexp used to recognise item links.
#
# Every placeholder matched by SYMBOL_EXP (such as ":id") is replaced with
# DEFAULT_EXP before the string is compiled. A new Regexp is built on each
# call.
#
# @return [Regexp] the pattern for item links
def item_exp
  pattern_source = item_path.gsub(SYMBOL_EXP, DEFAULT_EXP)
  Regexp.new(pattern_source)
end

#list_url ⇒ Object



43
44
45
46
47
48
49
50
51
# File 'lib/rails_spider/resource.rb', line 43

# Builds the URL of the listing page for the current page number.
#
# list_path is resolved against host. When page is a positive number and a
# page parameter name is configured, "<page_params>=<page>" is added to the
# query string. A query already present in list_path is kept and the page
# parameter is appended to it (it used to be overwritten).
#
# @return [URI::Generic] the listing URL
def list_url
  url = URI.join(host, list_path)
  return url unless page.to_i.positive?
  # Without a parameter name the encoded query would be the meaningless "=<page>".
  return url if page_params.to_s.empty?

  page_query = URI.encode_www_form(page_params => page)
  existing_query = url.query.to_s
  url.query = existing_query.empty? ? page_query : "#{existing_query}&#{page_query}"
  url
end

#run ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
# File 'lib/rails_spider/resource.rb', line 20

# Walks the listing page by page and saves every item link found.
#
# Starting from the current page, each link returned by get_items is passed
# to save, then page is advanced by one. The loop ends at the first page for
# which get_items returns nothing.
#
# @return [nil]
def run
  until (links = get_items).empty?
    links.each { |link| save(link) }
    self.page += 1
  end
end

#save(url) ⇒ Object



36
37
38
39
40
41
# File 'lib/rails_spider/resource.rb', line 36

# Downloads +url+ and stores its body in the Local record for this work.
#
# The body is fetched first; the record is then found or initialised by
# url and work id, given the body, and saved.
#
# @param url [Object] address of the item to download
# @return [Object] whatever the record's #save returns
def save(url)
  page_body = fetcher.body(url)
  Local.find_or_initialize_by(url: url, work_id: work.id)
       .tap { |record| record.body = page_body }
       .save
end