Class: Scrubyt::Extractor
- Inherits:
-
Object
- Object
- Scrubyt::Extractor
- Includes:
- FetchAction
- Defined in:
- lib/scrubyt/core/shared/extractor.rb
Overview
Driving the whole extraction process
Extractor is a performer class - it gets an extractor definition and carries out the actions and evaluates the wrappers sequentially.
Originally the navigation actions also lived here, but since the class got too big they were factored out into their own module, NavigationActions.
Instance Attribute Summary collapse
-
#evaluating_extractor_definition ⇒ Object
True while the extractor definition block is being evaluated inside #initialize, false otherwise.
-
#mode ⇒ Object
The mode passed to the constructor (logged as 'Production' when it is :production, otherwise as 'Learning').
-
#next_page_pattern ⇒ Object
The pattern created by a next_page call in the extractor definition; nil if none was given.
-
#result ⇒ Object
The ScrubytResult named 'root' that holds the results of evaluating the root patterns.
-
#root_patterns ⇒ Object
The root patterns collected while evaluating the extractor definition.
Class Method Summary collapse
-
.define(mode = nil, &extractor_definition) ⇒ Object
The definition of the extractor is passed through this method.
- .load(filename) ⇒ Object
Instance Method Summary collapse
- #add_to_next_page_list(result_node) ⇒ Object
- #evaluate_extractor ⇒ Object
- #get_current_doc_url ⇒ Object
- #get_detail_pattern_relations ⇒ Object
- #get_hpricot_doc ⇒ Object
- #get_mode ⇒ Object
- #get_original_host_name ⇒ Object
-
#initialize(mode, extractor_definition) ⇒ Extractor
constructor
A new instance of Extractor.
Methods included from FetchAction
get_current_doc_url, #get_host_name, get_hpricot_doc, get_mechanize_doc, #restore_host_name, #restore_page, #store_host_name, #store_page
Constructor Details
#initialize(mode, extractor_definition) ⇒ Extractor
Returns a new instance of Extractor.
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 40 def initialize(mode, extractor_definition) @mode = mode @root_patterns = [] @next_page_pattern = nil # @hpricot_doc = nil # @hpricot_doc_url = nil @evaluating_extractor_definition = false @next_page_list = [] @processed_pages = [] backtrace = SharedUtils.get_backtrace parts = backtrace[1].split(':') source_file = parts[0] Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning' @evaluating_extractor_definition = true context = Object.new context.extend NavigationActions context.instance_eval do def extractor=(value) @extractor = value end def next_page(*args) @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor) end def method_missing(method_name, *args, &block) root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block) @extractor.root_patterns << root_pattern root_pattern end end context.extractor = self context.instance_eval(&extractor_definition) @evaluating_extractor_definition = false if @root_patterns.empty? # TODO: this should be an exception Scrubyt.log :ERROR, 'No extractor defined, exiting...' exit end #Once all is set up, evaluate the extractor from the root pattern! root_results = evaluate_extractor @result = ScrubytResult.new('root') @result.push(*root_results) @result.root_patterns = @root_patterns @result.source_file = source_file @result.source_proc = extractor_definition #Return the root pattern Scrubyt.log :INFO, 'Extraction finished succesfully!' end |
Instance Attribute Details
#evaluating_extractor_definition ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Reader for the flag kept in @evaluating_extractor_definition.
#
# Returns the stored value as is (nil when it was never assigned).
def evaluating_extractor_definition
  @evaluating_extractor_definition
end
#mode ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Reader for the extractor mode kept in @mode.
#
# Returns the stored value as is.
def mode
  @mode
end
#next_page_pattern ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Reader for the pattern kept in @next_page_pattern.
#
# Returns the stored value as is (nil when no next-page pattern was set).
def next_page_pattern
  @next_page_pattern
end
#result ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Reader for the extraction result kept in @result.
#
# Returns the stored value as is.
def result
  @result
end
#root_patterns ⇒ Object
, :hpricot_doc, :current_doc_url
13 14 15 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 13
#
# Reader for the list kept in @root_patterns.
#
# Returns the stored object itself, not a copy, so callers may append to it.
def root_patterns
  @root_patterns
end
Class Method Details
.define(mode = nil, &extractor_definition) ⇒ Object
The definition of the extractor is passed through this method
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 16
#
# Entry point for running an extractor definition.
#
# Mixes a navigation backend into FetchAction, builds an extractor from the
# given block and hands back its result.
#
# mode::                 passed on to the constructor unchanged; when it is a
#                        Hash whose :agent entry is :firefox,
#                        Navigation::Firewatir is included, in every other
#                        case Navigation::Mechanize
# extractor_definition:: the block describing the extractor
#
# Returns the +result+ of the freshly built extractor.
def self.define(mode=nil, &extractor_definition)
  wants_firefox = mode.is_a?(Hash) && mode[:agent] == :firefox
  navigation = wants_firefox ? Navigation::Firewatir : Navigation::Mechanize
  FetchAction.class_eval { include navigation }

  new(mode, extractor_definition).result
end
.load(filename) ⇒ Object
36 37 38 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 36
#
# Reads an extractor definition from +filename+ and runs it through +define+.
#
# The file content is evaluated as Ruby and the value it yields is passed to
# +define+ as the block, so it has to evaluate to something usable with & (for
# example a lambda).
#
# SECURITY: the file is executed with +eval+ - only load files you trust.
#
# filename:: path of the file holding the definition
#
# Returns whatever +define+ returns.
# Raises the usual Errno errors (e.g. Errno::ENOENT) when the file cannot be read.
def self.load(filename)
  # File.read instead of IO.read: IO.read treats a name starting with "|" as a
  # command to spawn, which must never happen for something meant to be a path.
  definition = eval(File.read(filename))
  define(&definition)
end
Instance Method Details
#add_to_next_page_list(result_node) ⇒ Object
117 118 119 120 121 122 123 124 125 126 127 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 117
#
# Appends the URL carried by +result_node+ to @next_page_list.
#
# result_node:: object responding to +result+. When the result is an
#               Hpricot::Elem, the href of the node returned by
#               XPathUtils.find_nearest_node_with_attribute is taken and
#               HTML-escaped ampersands ("&amp;") are turned back into "&".
#               When it is a String, the string itself is used as the URL.
#
# Nothing is queued (and nil is returned) when no href can be found or when
# the result is of any other type; otherwise @next_page_list is returned.
def add_to_next_page_list(result_node)
  result = result_node.result

  if result.is_a? Hpricot::Elem
    node = XPathUtils.find_nearest_node_with_attribute(result, 'href')
    return if node.nil? || node.attributes['href'].nil?
    # The href is HTML-escaped; the original replaced '&' with '&', a no-op.
    href = node.attributes['href'].gsub('&amp;') { '&' }
  elsif result.is_a? String
    href = result
  end

  # Unsupported result type: do not push nil onto the queue of pages to fetch.
  return if href.nil?

  url = href #TODO need absolute address here 1/4
  @next_page_list << url
end
#evaluate_extractor ⇒ Object
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 129 def evaluate_extractor root_results = [] current_page_count = 1 catch :quit_next_page_loop do loop do url = get_current_doc_url #TODO need absolute address here 2/4 @processed_pages << url @root_patterns.each do |root_pattern| root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil)) end while @processed_pages.include? url #TODO need absolute address here 3/4 if !@next_page_pattern.nil? throw :quit_next_page_loop if @next_page_pattern.[:limit] == current_page_count throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true) xpath = @next_page_pattern.filters[0].xpath node = (get_hpricot_doc/xpath).map.last node = XPathUtils.find_nearest_node_with_attribute(node, 'href') throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil href = node.attributes['href'].gsub('&') {'&'} throw :quit_next_page_loop if href == nil url = href #TODO need absolute address here 4/4 else throw :quit_next_page_loop if @next_page_list.empty? url = @next_page_list.pop end end restore_host_name FetchAction.fetch(url) current_page_count += 1 end end root_results end |
#get_current_doc_url ⇒ Object
101 102 103 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 101
#
# Convenience wrapper: asks FetchAction for the URL of the current document.
#
# Returns whatever FetchAction.get_current_doc_url returns.
def get_current_doc_url
  FetchAction.get_current_doc_url
end
#get_detail_pattern_relations ⇒ Object
105 106 107 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 105
#
# Reader for @detail_pattern_relations.
#
# Returns the stored value as is; nil when it was never assigned.
# NOTE(review): no assignment to this variable is visible in this listing -
# confirm where it is set.
def get_detail_pattern_relations
  @detail_pattern_relations
end
#get_hpricot_doc ⇒ Object
97 98 99 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 97
#
# Convenience wrapper: asks FetchAction for the current Hpricot document.
#
# Returns whatever FetchAction.get_hpricot_doc returns.
def get_hpricot_doc
  FetchAction.get_hpricot_doc
end
#get_mode ⇒ Object
109 110 111 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 109
#
# Getter-style alternative to the +mode+ reader.
#
# Returns the value kept in @mode as is.
def get_mode
  @mode
end
#get_original_host_name ⇒ Object
113 114 115 |
# File 'lib/scrubyt/core/shared/extractor.rb', line 113
#
# Reader for @original_host_name.
#
# Returns the stored value as is; nil when it was never assigned.
# NOTE(review): no assignment to this variable is visible in this listing -
# presumably one of the included FetchAction methods sets it; confirm.
def get_original_host_name
  @original_host_name
end