Class: Yasf::Scraper

Inherits:
Object
  • Object
show all
Includes:
HTTParty
Defined in:
lib/yasf/scraper.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source, options = nil) ⇒ Scraper

The argument source is either a String containing a URL (which is fetched and parsed as HTML) or an already-parsed Nokogiri::XML::Element or Nokogiri::HTML::Document.



117
118
119
120
121
122
123
124
125
126
127
# File 'lib/yasf/scraper.rb', line 117

# Builds a scraper around the given source.
#
# @param source [String, Nokogiri::XML::Element, Nokogiri::HTML::Document]
#   a String is passed to the class-level +get+ and the response body is
#   parsed with Nokogiri::HTML; a Nokogiri element or HTML document is
#   used as-is.
# @param options [Hash, nil] forwarded to +get+ when +source+ is a String;
#   +nil+ is treated as an empty Hash.
# @raise [ArgumentError] if +source+ is none of the supported types.
def initialize(source, options = nil)
  @options = options || {}
  @document =
    case source
    when String
      Nokogiri::HTML(self.class.get(source, @options).body)
    when Nokogiri::XML::Element, Nokogiri::HTML::Document
      source
    else
      raise ArgumentError, "source not recognized"
    end
end

Class Method Details

.extract_from(source, options = nil) ⇒ Object



10
11
12
# File 'lib/yasf/scraper.rb', line 10

# Convenience entry point: instantiates the scraper for +source+ and
# immediately runs #extract on it.
#
# @param source forwarded unchanged to the constructor
# @param options [Hash, nil] forwarded unchanged to the constructor
# @return [Object] whatever #extract returns
def extract_from(source, options = nil)
  scraper = new(source, options)
  scraper.extract
end

.extractor(map) ⇒ Object

Creates an extractor that will extract values from the selected element and place them in instance variables of the scraper.



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/yasf/scraper.rb', line 50

# Builds an extractor lambda from a target => source mapping.
#
# For every pair, +extract_value_from(source)+ supplies a callable that
# reads a value from an element and +extract_value_to(target)+ supplies a
# callable that stores it on the scraper. Each pair is compiled into an
# unbound instance method (defined under a temporary name and removed
# again right away) so that it can later run with a scraper as +self+.
#
# @param map [#each_pair] pairs of target => source
# @return [Proc] lambda taking an element; it runs every compiled step
#   bound to the +self+ in effect when the lambda executes and returns
#   +true+
def extractor(map)
  steps = []
  map.each_pair do |target, source|
    reader = extract_value_from(source)
    writer = extract_value_to(target)
    # Temporarily define the step as a method so it can be captured as an
    # UnboundMethod, then drop the temporary name.
    define_method(:__extractor) do |element|
      extracted = reader.call(element)
      next if extracted.nil? # nothing extracted: leave the target alone
      writer.call(self, extracted)
    end
    steps.push(instance_method(:__extractor))
    remove_method(:__extractor)
  end
  lambda do |element|
    steps.each { |step| step.bind(self).call(element) }
    true
  end
end

.result(*symbols) ⇒ Object

Raises:

  • (ArgumentError)


33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/yasf/scraper.rb', line 33

# Declares what the scraper's instance-level #result returns.
#
# With a single name, #result returns the value obtained by sending that
# name to the scraper. With several names, #result returns an instance of
# a Struct (created once, here) whose members are those names, filled
# with the values obtained by sending each name to the scraper.
#
# @param symbols [Array<#to_sym>] one or more accessor names
# @raise [ArgumentError] if no name is given
def result(*symbols)
  if symbols.empty?
    raise ArgumentError, "one symbol to return the value of this accessor"
  end

  names = symbols.map(&:to_sym)
  if names.size > 1
    bundle = Struct.new(*names)
    define_method(:result) do
      bundle.new(*names.map { |name| send(name) })
    end
  else
    accessor = names.first
    define_method(:result) { send(accessor) }
  end
end

.rulesObject

Returns an array of scraper rules



29
30
31
# File 'lib/yasf/scraper.rb', line 29

# Returns the list of scraper rules held in @rules, creating an empty
# Array the first time it is requested.
#
# @return [Array] the same Array object on every call
def rules
  @rules = [] unless @rules
  @rules
end

.scrape(*args) ⇒ Object

Defines a processing rule.

Raises:

  • (ArgumentError)


15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/yasf/scraper.rb', line 15

# Defines a processing rule and appends it to +rules+.
#
# Arguments, in order: an optional Symbol naming the rule, a selector,
# and a trailing Hash that is handed to +extractor+ to build the
# extraction step. The step is compiled into an unbound instance method
# (defined under a temporary name, captured, then removed).
#
# The stored rule is +[selector, unbound_method, name]+, where the
# selector is the last argument remaining once the name and the Hash have
# been taken off.
#
# @raise [ArgumentError] if the last argument is not a Hash
# @raise [ArgumentError] if no selector remains
# @return [Array] the rules list
def scrape(*args)
  rule_name = args.first.is_a?(Symbol) ? args.shift : nil
  extraction = args.last.is_a?(Hash) ? extractor(args.pop) : nil

  unless extraction
    raise ArgumentError, "Missing extractor: the last argument tells us what to extract"
  end
  if args.empty?
    raise ArgumentError, "Missing selector: the first argument tells us what to select"
  end

  define_method(:__extractor, extraction)
  compiled = instance_method(:__extractor)
  remove_method(:__extractor)
  rules << [args.pop, compiled, rule_name]
end

Instance Method Details

#collectObject

Called by #extract just before #result; typically used to run post-processing steps.



135
136
# File 'lib/yasf/scraper.rb', line 135

# Hook that #extract invokes after all rules have run and before it
# returns #result. It does nothing here.
#
# @return [nil]
def collect
  nil
end

#documentObject

Returns the document being processed.



130
131
132
# File 'lib/yasf/scraper.rb', line 130

# Returns the document being processed, i.e. the value stored in
# @document (nil if it has not been set).
def document
  @document
end

#extractObject

Scrapes the document and returns the result.



139
140
141
142
143
144
145
146
147
148
# File 'lib/yasf/scraper.rb', line 139

# Scrapes the document and returns the result.
#
# Every rule registered on the class is applied in order: the rule's
# selector is passed to +document.search+ and the rule's unbound
# extractor method is bound to this scraper and called once per element
# found. Afterwards #collect is invoked and the value of #result is
# returned.
#
# @return [Object] whatever #result returns
def extract
  # Work on a copy of the rule list so the iteration is independent of
  # the class-level Array itself.
  self.class.rules.clone.each do |selector, extractor, _rule_name|
    document.search(selector).each do |element|
      extractor.bind(self).call(element)
    end
  end
  collect
  result
end