Class: Scrubyt::Extractor

Inherits:
Object
  • Object
show all
Includes:
FetchAction
Defined in:
lib/scrubyt/core/shared/extractor.rb

Overview

Drives the whole extraction process.

Extractor is a performer class: it receives an extractor definition, carries out its actions, and evaluates the wrappers sequentially.

Originally the navigation actions also lived here, but as the class grew too large they were factored out into a module of their own, NavigationActions.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from FetchAction

extractor, extractor=, get_current_doc_url, #get_host_name, get_hpricot_doc, get_mechanize_doc, #restore_host_name, #restore_page, #store_host_name, #store_page

Constructor Details

#initialize(mode, extractor_definition) ⇒ Extractor

Returns a new instance of Extractor.



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/scrubyt/core/shared/extractor.rb', line 40

# Builds the extractor from a definition block and immediately runs the
# whole extraction; the outcome is stored in @result.
#
# @param mode [Object] stored as @mode. The visible code compares it with
#   :production (for logging only) and, when it is a Hash, reads its :close key.
# @param extractor_definition [Proc] the definition; it is instance_eval'ed
#   on a helper context object, so it must be usable as a block.
#
# Side effects visible here: registers this instance on FetchAction, logs
# through Scrubyt.log, and calls `exit` when the definition declares no
# root pattern.
def initialize(mode, extractor_definition)
  @mode = mode
  @root_patterns = []
  @next_page_pattern = nil
  #      @hpricot_doc = nil
  #      @hpricot_doc_url = nil
  @evaluating_extractor_definition = false
  @next_page_list = []
  @processed_pages = []
  
  # Take the text before the first ':' of the second backtrace entry as the
  # source file name.
  # NOTE(review): presumably that entry is the frame that invoked the
  # extractor -- confirm against SharedUtils.get_backtrace.
  backtrace = SharedUtils.get_backtrace
  parts = backtrace[1].split(':')
  source_file = parts[0]
  
  # Anything other than :production is reported as 'Learning'.
  Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
  
  # The flag stays true only while the definition block is being evaluated.
  @evaluating_extractor_definition = true
  # Fresh object on which the definition block runs: it gets the
  # NavigationActions methods plus the singleton methods defined below.
  context = Object.new
  context.extend NavigationActions
  context.instance_eval do
    # Gives the context a reference back to the extractor being built.
    def extractor=(value)
      @extractor = value
    end
    
    # `next_page ...` in a definition registers the next-page pattern.
    def next_page(*args)
      @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
    end
    
    # Any other unknown call in the definition becomes a root pattern named
    # after the called method; the pattern is also returned to the caller.
    def method_missing(method_name, *args, &block)
      root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
      @extractor.root_patterns << root_pattern
      root_pattern
    end
  end
  # Both FetchAction and the context must know the extractor before the
  # definition block is run.
  FetchAction.extractor = self
  context.extractor = self
  context.instance_eval(&extractor_definition)
  @evaluating_extractor_definition = false
  
  if @root_patterns.empty?
    # TODO: this should be an exception
    # As written, this terminates the whole process via Kernel#exit.
    Scrubyt.log :ERROR, 'No extractor defined, exiting...'
    exit
  end
  
  #Once all is set up, evaluate the extractor from the root pattern!
  # NOTE(review): the local root_results is never read afterwards; the
  # results are taken from @root_results below.
  root_results = evaluate_extractor
  # Only a Hash mode carrying a truthy :close value closes the browser.
  FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]

  
  @result = ScrubytResult.new('root')
  @result.push(*@root_results)
  # NOTE(review): evaluate_extractor resets @root_patterns to [] before it
  # returns, so an empty array is assigned here -- confirm this is intended.
  @result.root_patterns = @root_patterns
  @result.source_file = source_file
  @result.source_proc = extractor_definition
  
  # Nothing is returned explicitly; callers read the outcome through #result.
  Scrubyt.log :INFO, 'Extraction finished succesfully!'
end

Instance Attribute Details

#evaluating_extractor_definitionObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for the flag that the constructor sets to true only while the
# extractor definition block is being evaluated, and to false otherwise.
#
# @return [Boolean, nil] current value of @evaluating_extractor_definition
def evaluating_extractor_definition; @evaluating_extractor_definition; end

#modeObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for the mode the extractor was constructed with.
#
# @return [Object] whatever was passed as `mode` to the constructor; the
#   visible code compares it with :production and reads :agent / :close
#   from it when it is a Hash
def mode; @mode; end

#next_page_patternObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for the pattern used to locate the link to the next page.
#
# @return [Object, nil] nil until a `next_page` call in the extractor
#   definition assigns a Scrubyt::Pattern
def next_page_pattern; @next_page_pattern; end

#resultObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for the extraction result assembled at the end of the constructor.
#
# @return [Object] the ScrubytResult stored in @result
def result; @result; end

#root_patternsObject

, :hpricot_doc, :current_doc_url



13
14
15
# File 'lib/scrubyt/core/shared/extractor.rb', line 13

# Reader for the root patterns collected while the extractor definition is
# evaluated. Note that evaluate_extractor replaces the list with an empty
# array once it has finished.
#
# @return [Array] current value of @root_patterns
def root_patterns; @root_patterns; end

Class Method Details

.define(mode = nil, &extractor_definition) ⇒ Object

The extractor definition is passed in through this method.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/scrubyt/core/shared/extractor.rb', line 16

# Entry point for defining and running an extractor.
#
# Picks the navigation backend, mixes it into FetchAction, builds the
# extractor (which runs the extraction) and hands back its result.
#
# @param mode [Object, nil] when this is a Hash whose :agent is :firefox the
#   Firewatir navigation is used; every other value selects Mechanize.
#   The value is passed on to the constructor unchanged.
# @param extractor_definition [Proc] the block describing the extraction
# @return [Object] the `result` of the newly created extractor
def self.define(mode=nil, &extractor_definition)
  firefox_requested = mode.is_a?(Hash) && mode[:agent] == :firefox
  navigation = firefox_requested ? Navigation::Firewatir : Navigation::Mechanize
  # The block closes over `navigation`, so only the selected module is included.
  FetchAction.class_eval { include navigation }
  new(mode, extractor_definition).result
end

.load(filename) ⇒ Object



36
37
38
# File 'lib/scrubyt/core/shared/extractor.rb', line 36

# Loads an extractor definition from a file and runs it through `define`.
#
# The file content is evaluated as Ruby and must evaluate to something that
# can be passed as a block (e.g. a Proc).
#
# SECURITY: the file is executed with `eval`, so it can run arbitrary code.
# Only load definition files from trusted locations.
#
# @param filename [String] path of the file holding the definition
# @return [Object] whatever `define` returns
def self.load(filename)
  # File.read (unlike IO.read) never treats a leading "|" as a command to spawn.
  source = File.read(filename)
  # Passing the file name makes errors and __FILE__ inside the definition
  # point at the real file instead of "(eval)".
  define(&eval(source, binding, filename))
end

Instance Method Details

#add_to_next_page_list(result_node) ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
# File 'lib/scrubyt/core/shared/extractor.rb', line 120

# Queues the link carried by +result_node+ as a page still to be visited.
#
# * For an Hpricot::Elem result, the nearest node with an 'href' attribute
#   is looked up and its href is used, with '&amp;' turned back into '&'.
# * For a String result, the string itself is used.
# * Any other result, or an element without an href, is ignored. Previously
#   an unsupported result pushed nil onto the list.
#
# @param result_node [#result] node whose +result+ is an Hpricot::Elem or a String
# @return [Array, nil] the updated next-page list, or nil when nothing was added
def add_to_next_page_list(result_node)
  result = result_node.result
  href =
    case result
    when Hpricot::Elem
      link = XPathUtils.find_nearest_node_with_attribute(result, 'href')
      raw_href = link && link.attributes['href']
      raw_href && raw_href.gsub('&amp;') { '&' }
    when String
      result
    end
  return if href.nil?

  @next_page_list << href #TODO need absolute address here 1/4
end

#evaluate_extractorObject



132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/scrubyt/core/shared/extractor.rb', line 132

# Evaluates every root pattern on the current page and then keeps moving to
# further pages until there is nowhere left to go.
#
# The next page comes from one of two sources:
# * when a next-page pattern is set, the href of the nearest linked node
#   around the last match of the pattern's first filter; the crawl stops
#   once the pattern's :limit equals the number of pages visited;
# * otherwise, URLs popped from @next_page_list.
# URLs already recorded in @processed_pages are skipped. A next-page href of
# "#" is followed by clicking the matched XPath instead of fetching a URL.
#
# @return [Array] the accumulated results (@root_results). As a side effect
#   @root_patterns is reset to an empty array.
def evaluate_extractor
  @root_results ||= []
  pages_visited = 1
  xpath = nil
  catch :quit_next_page_loop do
    loop do
      url = get_current_doc_url #TODO need absolute address here 2/4
      @processed_pages << url
      @root_patterns.each do |pattern|
        @root_results.push(*pattern.evaluate(get_hpricot_doc, nil))
      end

      # Keep choosing candidates until one is a page that was not processed yet.
      while @processed_pages.include?(url) #TODO need absolute address here 3/4
        if @next_page_pattern.nil?
          throw :quit_next_page_loop if @next_page_list.empty?
          url = @next_page_list.pop
        else
          throw :quit_next_page_loop if @next_page_pattern.options[:limit] == pages_visited
          throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
          xpath = @next_page_pattern.filters[0].xpath
          last_match = (get_hpricot_doc/xpath).last
          link = XPathUtils.find_nearest_node_with_attribute(last_match, 'href')
          throw :quit_next_page_loop if link.nil? || link.attributes['href'].nil?
          href = link.attributes['href'].gsub('&amp;') { '&' }
          throw :quit_next_page_loop if href.nil?
          url = href #TODO need absolute address here 4/4
        end
      end

      restore_host_name
      if url == "#"
        # No real URL to fetch: follow the link by clicking the matched element.
        FetchAction.click_by_xpath_without_evaluate(xpath)
      else
        FetchAction.fetch(url)
      end

      pages_visited += 1
    end
  end
  @root_patterns = []
  @root_results
end

#get_current_doc_urlObject



104
105
106
# File 'lib/scrubyt/core/shared/extractor.rb', line 104

# Convenience wrapper that asks FetchAction for the URL of the current document.
#
# @return [Object] whatever FetchAction.get_current_doc_url returns
def get_current_doc_url; FetchAction.get_current_doc_url; end

#get_detail_pattern_relationsObject



108
109
110
# File 'lib/scrubyt/core/shared/extractor.rb', line 108

# Reader for @detail_pattern_relations.
# NOTE(review): nothing in the visible code assigns this variable, so nil is
# returned unless it is set elsewhere -- confirm where it is populated.
#
# @return [Object, nil] current value of @detail_pattern_relations
def get_detail_pattern_relations; @detail_pattern_relations; end

#get_hpricot_docObject



100
101
102
# File 'lib/scrubyt/core/shared/extractor.rb', line 100

# Convenience wrapper that asks FetchAction for the current parsed document.
#
# @return [Object] whatever FetchAction.get_hpricot_doc returns
def get_hpricot_doc; FetchAction.get_hpricot_doc; end

#get_modeObject



112
113
114
# File 'lib/scrubyt/core/shared/extractor.rb', line 112

# Getter-style accessor for the mode the extractor was constructed with;
# it returns the same value as the `mode` reader.
#
# @return [Object] current value of @mode
def get_mode; @mode; end

#get_original_host_nameObject



116
117
118
# File 'lib/scrubyt/core/shared/extractor.rb', line 116

# Reader for @original_host_name.
# NOTE(review): nothing in the visible code assigns this variable, so nil is
# returned unless it is set elsewhere -- confirm where it is populated.
#
# @return [Object, nil] current value of @original_host_name
def get_original_host_name; @original_host_name; end