Class: Scrubyt::Extractor
- Inherits:
-
Object
- Object
- Scrubyt::Extractor
- Includes:
- FetchAction
- Defined in:
- lib/scrubyt/core/shared/extractor.rb
Overview
Driving the whole extraction process
Extractor is a performer class - it gets an extractor definition and carries out the actions and evaluates the wrappers sequentially.
Originally the navigation actions also lived here, but the class grew too big, so they were factored out into a module of their own, NavigationActions.
Instance Attribute Summary collapse
-
#evaluating_extractor_definition ⇒ Object
, :hpricot_doc, :current_doc_url.
-
#mode ⇒ Object
, :hpricot_doc, :current_doc_url.
-
#next_page_pattern ⇒ Object
, :hpricot_doc, :current_doc_url.
-
#result ⇒ Object
, :hpricot_doc, :current_doc_url.
-
#root_patterns ⇒ Object
, :hpricot_doc, :current_doc_url.
Class Method Summary collapse
-
.define(mode = nil, &extractor_definition) ⇒ Object
The definition of the extractor is passed through this method.
- .load(filename) ⇒ Object
Instance Method Summary collapse
- #add_to_next_page_list(result_node) ⇒ Object
- #evaluate_extractor ⇒ Object
- #get_current_doc_url ⇒ Object
- #get_detail_pattern_relations ⇒ Object
- #get_hpricot_doc ⇒ Object
- #get_mode ⇒ Object
- #get_original_host_name ⇒ Object
-
#initialize(mode, extractor_definition) ⇒ Extractor
constructor
A new instance of Extractor.
Methods included from FetchAction
extractor, extractor=, get_current_doc_url, #get_host_name, get_hpricot_doc, get_mechanize_doc, #restore_host_name, #restore_page, #store_host_name, #store_page
Constructor Details
#initialize(mode, extractor_definition) ⇒ Extractor
Returns a new instance of Extractor.
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 40
#
# Builds an extractor from a definition block and runs it straight away.
#
# The definition block is instance_eval'ed against a throw-away context
# object: `next_page` sets the next-page pattern, and every other unknown
# method call is turned into a root pattern named after that method.
# Once the definition has been read, the extractor is evaluated and the
# outcome is stored in @result.
#
# mode                 - :production, a Hash of options (only :close is
#                        read here), or anything else; anything other than
#                        :production is logged as 'Learning'.
# extractor_definition - Proc holding the extractor DSL.
#
# Terminates the process with `exit` when the definition produced no root
# pattern.
def initialize(mode, extractor_definition)
  @mode = mode
  @root_patterns = []
  @next_page_pattern = nil
  # @hpricot_doc = nil
  # @hpricot_doc_url = nil
  @evaluating_extractor_definition = false
  @next_page_list = []
  @processed_pages = []

  # The text before the first ':' of backtrace entry 1 is recorded as the
  # file that holds the extractor definition.
  backtrace = SharedUtils.get_backtrace
  source_file = backtrace[1].split(':')[0]

  Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'

  @evaluating_extractor_definition = true
  context = Object.new
  context.extend NavigationActions
  context.instance_eval do
    def extractor=(value)
      @extractor = value
    end

    def next_page(*args)
      @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
    end

    # Any other call inside the definition declares a root pattern.
    def method_missing(method_name, *args, &block)
      root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
      @extractor.root_patterns << root_pattern
      root_pattern
    end
  end
  FetchAction.extractor = self
  context.extractor = self
  context.instance_eval(&extractor_definition)
  @evaluating_extractor_definition = false

  if @root_patterns.empty?
    # TODO: this should be an exception
    Scrubyt.log :ERROR, 'No extractor defined, exiting...'
    exit
  end

  # Once all is set up, evaluate the extractor from the root pattern!
  root_results = evaluate_extractor
  FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]

  @result = ScrubytResult.new('root')
  @result.push(*root_results)
  @result.root_patterns = @root_patterns
  @result.source_file = source_file
  @result.source_proc = extractor_definition

  Scrubyt.log :INFO, 'Extraction finished succesfully!'
end
Instance Attribute Details
#evaluating_extractor_definition ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Returns the @evaluating_extractor_definition flag. The constructor sets
# it to true just before the definition block is evaluated and back to
# false right after.
def evaluating_extractor_definition
  @evaluating_extractor_definition
end
#mode ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Returns the mode the extractor was created with, exactly as it was
# passed to the constructor (for example :production or an options Hash).
def mode
  @mode
end
#next_page_pattern ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Returns the pattern used to find the link to the next page, or nil when
# the extractor definition never called `next_page`.
def next_page_pattern
  @next_page_pattern
end
#result ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Returns the result object assembled at the end of the constructor.
def result
  @result
end
#root_patterns ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Returns the live Array of root patterns. The definition context appends
# to this very array, so callers receive the object itself, not a copy.
def root_patterns
  @root_patterns
end
Class Method Details
.define(mode = nil, &extractor_definition) ⇒ Object
The definition of the extractor is passed through this method
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 16
#
# Entry point for declaring and running an extractor.
#
# Mixes a navigation backend into FetchAction, builds an extractor from
# the given block and returns that extractor's result.
#
# mode - optional; when it is a Hash whose :agent is :firefox the
#        Firewatir backend is used, in every other case Mechanize.
#        The value is passed to the constructor unchanged.
# extractor_definition - block holding the extractor DSL.
def self.define(mode = nil, &extractor_definition)
  use_firefox = mode.is_a?(Hash) && mode[:agent] == :firefox
  navigation = use_firefox ? Navigation::Firewatir : Navigation::Mechanize

  FetchAction.class_eval { include navigation }

  new(mode, extractor_definition).result
end
.load(filename) ⇒ Object
36 37 38 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 36
#
# Reads a Ruby source file, evaluates it, and hands the resulting object
# to `define` as the extractor definition block.
#
# filename - path of the file to evaluate; its last expression must be
#            usable as a block (e.g. a Proc or lambda).
#
# SECURITY: the file content is executed with `eval`. Only load files
# from trusted locations.
def self.load(filename)
  # File.read instead of IO.read: IO.read treats a leading '|' in the
  # name as a shell command to run, File.read only ever opens a file.
  define(&eval(File.read(filename)))
end
Instance Method Details
#add_to_next_page_list(result_node) ⇒ Object
120 121 122 123 124 125 126 127 128 129 130 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 120
#
# Queues the URL carried by a result node for a later page fetch.
#
# result_node - object whose #result is either an Hpricot::Elem (the href
#               of the nearest node that has one is used, with '&amp;'
#               entities turned back into '&') or a String (used as is).
#
# Nothing is queued when no href can be found or when the result is of
# any other type. Returns the queue after a push, nil otherwise.
def add_to_next_page_list(result_node)
  extracted = result_node.result

  if extracted.is_a?(Hpricot::Elem)
    node = XPathUtils.find_nearest_node_with_attribute(extracted, 'href')
    return if node.nil? || node.attributes['href'].nil?

    href = node.attributes['href'].gsub('&amp;') { '&' }
  elsif extracted.is_a?(String)
    href = extracted
  end

  # Unsupported result types leave href unset; never enqueue nil.
  return if href.nil?

  @next_page_list << href # TODO need absolute address here 1/4
end
#evaluate_extractor ⇒ Object
132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 132
#
# Evaluates every root pattern against the current page, then keeps
# moving to further pages until there is nowhere left to go.
#
# The next URL comes from the next-page pattern when one is defined
# (stopping once its :limit option equals the number of pages visited, or
# when no usable link is found), otherwise from the queue of URLs
# collected in @next_page_list. URLs that were already processed are
# skipped.
#
# Results accumulate in @root_results across calls; @root_patterns is
# emptied at the end. Returns @root_results.
def evaluate_extractor
  @root_results ||= []
  current_page_count = 1

  catch :quit_next_page_loop do
    loop do
      url = get_current_doc_url # TODO need absolute address here 2/4
      @processed_pages << url
      @root_patterns.each do |root_pattern|
        @root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
      end

      # Keep picking candidates until one has not been visited yet.
      while @processed_pages.include?(url) # TODO need absolute address here 3/4
        if @next_page_pattern.nil?
          throw :quit_next_page_loop if @next_page_list.empty?
          url = @next_page_list.pop
        else
          throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count

          next_page_filter = @next_page_pattern.filters[0]
          throw :quit_next_page_loop unless next_page_filter.generate_XPath_for_example(true)

          # to_a.last rather than map.last: block-less map returns an
          # Enumerator on Ruby 1.9+, which has no #last.
          node = (get_hpricot_doc() / next_page_filter.xpath).to_a.last
          node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
          throw :quit_next_page_loop if node.nil? || node.attributes['href'].nil?

          url = node.attributes['href'].gsub('&amp;') { '&' } # TODO need absolute address here 4/4
        end
      end

      restore_host_name
      FetchAction.fetch(url)
      current_page_count += 1
    end
  end

  @root_patterns = []
  @root_results
end
#get_current_doc_url ⇒ Object
104 105 106 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 104
#
# Returns whatever FetchAction.get_current_doc_url reports.
def get_current_doc_url
  FetchAction.get_current_doc_url
end
#get_detail_pattern_relations ⇒ Object
108 109 110 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 108
#
# Returns @detail_pattern_relations.
# NOTE(review): nothing on this page assigns that variable, so this
# presumably returns nil unless it is set elsewhere - confirm.
def get_detail_pattern_relations
  @detail_pattern_relations
end
#get_hpricot_doc ⇒ Object
100 101 102 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 100
#
# Returns whatever FetchAction.get_hpricot_doc reports.
def get_hpricot_doc
  FetchAction.get_hpricot_doc
end
#get_mode ⇒ Object
112 113 114 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 112
#
# Returns the mode the extractor was created with; same value as #mode.
def get_mode
  @mode
end
#get_original_host_name ⇒ Object
116 117 118 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 116
#
# Returns @original_host_name.
# NOTE(review): the variable is not assigned anywhere on this page;
# presumably one of the FetchAction host-name helpers sets it - confirm.
def get_original_host_name
  @original_host_name
end