Class: Scrubyt::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/scrubyt/core/shared/extractor.rb

Overview

Driving the whole extraction process

Extractor is a performer class - it gets an extractor definition and carries out the actions and evaluates the wrappers sequentially.

Originally the navigation actions were also here, but since the class got too big, they were factored out into their own class, NavigationActions.

Class Method Summary collapse

Class Method Details

.add_detail_extractor_to_pattern_name(referenced_extractor, pattern) ⇒ Object



122
123
124
# File 'lib/scrubyt/core/shared/extractor.rb', line 122

# Registers +pattern+ under +referenced_extractor+ in the class-level
# @@detail_extractor_to_pattern_name hash.
#
# Only the first registration for a given extractor has any effect: the
# entry is created as a one-element array holding +pattern+, and later
# calls leave the existing array untouched (nothing is appended).
#
# NOTE(review): the original spelling `||= [] << pattern` may have been
# meant to append on every call; confirm before changing, since the stored
# array is also used as a key into @@detail_pattern_relations.
#
# Returns the array stored for +referenced_extractor+.
def self.add_detail_extractor_to_pattern_name(referenced_extractor, pattern)
  @@detail_extractor_to_pattern_name[referenced_extractor] ||= [pattern]
end

.define(mode = nil, &extractor_definition) ⇒ Object

The definition of the extractor is passed through this method



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/scrubyt/core/shared/extractor.rb', line 12

# Runs a complete extraction described by the +extractor_definition+ block.
#
# The block is class_eval'ed on this class to build the root pattern, the
# root pattern is evaluated, and the results are wrapped in a ScrubytResult.
#
# mode                 - :production is logged as 'Production'; any other
#                        value (including the default nil) as 'Learning'.
# extractor_definition - block holding the extractor definition.
#
# Returns the ScrubytResult named 'root' that holds the root results and a
# reference to the root pattern. If the block did not produce a root
# pattern, an error is logged and the process exits.
def self.define(mode=nil, &extractor_definition)
  # Entry 1 of the backtrace is split on ':' and its first part is used as
  # the source file (presumably the caller's "file:line" entry - confirm in
  # SharedUtils). This call must stay directly in this method so the
  # backtrace depth is unchanged.
  source_file = SharedUtils.get_backtrace[1].split(':').first

  # Reset the class-level state for this run.
  @@mode = mode
  # Relations between the detail patterns and their root patterns
  @@detail_extractor_to_pattern_name = {}
  @@detail_pattern_relations = {}
  # root pattern -> URIBuilder mapping
  @@next_patterns = {}

  if mode == :production
    Scrubyt.log :MODE, 'Production'
  else
    Scrubyt.log :MODE, 'Learning'
  end

  # Evaluate the definition inside a fresh context. The flag is raised only
  # while the user's block is being class_eval'ed.
  @@evaluation_context = EvaluationContext.new
  @@evaluation_context.evaluating_extractor_definition = true
  class_eval(&extractor_definition)
  @@evaluation_context.evaluating_extractor_definition = false

  # Use the root pattern recorded on the evaluation context (per the
  # original author: the real, invisible root rather than the root of the
  # user's definition).
  root = @@evaluation_context.root_pattern
  if root.nil?
    # TODO: this should be an exception
    Scrubyt.log :ERROR, 'No extractor defined, exiting...'
    exit
  end

  root.source_file = source_file
  root.source_proc = extractor_definition

  # Once all is set up, evaluate the extractor from the root pattern.
  extracted = evaluate_extractor(root)

  result = ScrubytResult.new('root')
  result.push(*extracted)
  result.root_pattern = root

  Scrubyt.log :INFO, 'Extraction finished succesfully!'
  result
end

.evaluate_subextractor(url, parent_pattern, resolve) ⇒ Object

Evaluate a subextractor (i.e. an extractor on a detail page). The url passed to this function is automatically loaded. The definition of the subextractor is taken from the parent pattern's referenced_extractor block.

!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP.…



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/scrubyt/core/shared/extractor.rb', line 61

# Evaluates the detail-page extractor referenced by +parent_pattern+ against
# +url+ and returns its root results.
#
# url            - passed to +fetch+ to load the detail page.
# parent_pattern - pattern whose +referenced_extractor+ block defines the
#                  detail extractor.
# resolve        - forwarded to +fetch+ as the :resolve option.
#
# Both branches share the same frame: the current page, evaluation context
# and host name are saved, a fresh EvaluationContext is installed, the url is
# fetched, the detail root pattern is evaluated, and the saved state is
# restored. They differ only in how the detail root pattern is obtained
# (reused from @@detail_pattern_relations vs. built via class_eval).
#
# NOTE(review): there is no ensure/rescue here, so if fetch or the evaluation
# raises, the saved context/page/host name are not restored.
def self.evaluate_subextractor(url, parent_pattern, resolve)
  # Has a root pattern already been stored for this referenced extractor?
  if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
    # Reuse the stored detail root pattern, clearing its previous result.
    detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
    detail_root.last_result = nil
    # Save the current page, evaluation context and host name.
    # NOTE(review): the two stacks are only initialized in the else branch;
    # this branch appears reachable only after that branch has run once,
    # because that is the only place here that adds relations - confirm.
    FetchAction.store_page
    @@original_evaluation_context.push @@evaluation_context
    @@host_stack.push FetchAction.get_host_name
    @@evaluation_context = EvaluationContext.new
    FetchAction.restore_host_name
    # No explicit receiver: presumably dispatched through method_missing to
    # NavigationActions - confirm that 'fetch' is one of its KEYWORDS.
    fetch url, :resolve => resolve
    # Wire the fresh context to this extractor and the reused root pattern.
    @@evaluation_context.extractor = self
    @@evaluation_context.root_pattern = detail_root      
    root_results = evaluate_extractor detail_root      
    # Restore the saved evaluation context, page and host name.
    @@evaluation_context = @@original_evaluation_context.pop
    FetchAction.restore_page
    FetchAction.store_host_name(@@host_stack.pop)
    root_results
  else      
    # First evaluation for this extractor: lazily create the save stacks.
    @@original_evaluation_context ||= []
    @@host_stack ||= []
    # Save the current page, evaluation context and host name.
    FetchAction.store_page
    @@original_evaluation_context.push @@evaluation_context
    @@host_stack.push FetchAction.get_host_name
    @@evaluation_context = EvaluationContext.new
    FetchAction.restore_host_name      
    fetch url, :resolve => resolve
    # Evaluate the detail extractor definition in the fresh context, then
    # read the root pattern it left there.
    class_eval(&parent_pattern.referenced_extractor)
    root_pattern = @@evaluation_context.root_pattern
    # Remember the root pattern so later calls take the reuse branch above.
    @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern
    root_results = evaluate_extractor(root_pattern)
    # Restore the saved evaluation context, page and host name.
    @@evaluation_context = @@original_evaluation_context.pop
    FetchAction.restore_page
    FetchAction.store_host_name(@@host_stack.pop)
    root_results
  end
end

.get_current_doc_url ⇒ Object



134
135
136
# File 'lib/scrubyt/core/shared/extractor.rb', line 134

# Delegates to NavigationActions.get_current_doc_url and returns its result
# (presumably the URL of the currently loaded document - confirm in
# NavigationActions).
def self.get_current_doc_url
  NavigationActions.get_current_doc_url
end

.get_detail_extractor(parent_pattern) ⇒ Object



126
127
128
# File 'lib/scrubyt/core/shared/extractor.rb', line 126

# Looks up the detail root pattern stored for the extractor referenced by
# +parent_pattern+.
#
# The lookup goes through two class-level hashes: the referenced extractor
# is first mapped through @@detail_extractor_to_pattern_name, and that value
# is then used as the key into @@detail_pattern_relations.
#
# Returns the stored entry, or nil when nothing is stored under that key.
def self.get_detail_extractor(parent_pattern)
  relation_key = @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
  @@detail_pattern_relations[relation_key]
end

.get_detail_pattern_relations ⇒ Object



138
139
140
# File 'lib/scrubyt/core/shared/extractor.rb', line 138

# Returns the class-level @@detail_pattern_relations hash itself (not a
# copy). It is reset to an empty hash by define and filled by
# evaluate_subextractor with the root patterns of evaluated detail extractors.
def self.get_detail_pattern_relations
  @@detail_pattern_relations
end

.get_host_name ⇒ Object



142
143
144
# File 'lib/scrubyt/core/shared/extractor.rb', line 142

# Delegates to NavigationActions.get_host_name and returns its result
# (presumably the host name of the current page - confirm in
# NavigationActions).
def self.get_host_name
  NavigationActions.get_host_name
end

.get_hpricot_doc ⇒ Object



130
131
132
# File 'lib/scrubyt/core/shared/extractor.rb', line 130

# Delegates to NavigationActions.get_hpricot_doc and returns its result
# (presumably the parsed Hpricot document of the current page - confirm in
# NavigationActions).
def self.get_hpricot_doc
  NavigationActions.get_hpricot_doc
end

.get_mode ⇒ Object



146
147
148
# File 'lib/scrubyt/core/shared/extractor.rb', line 146

# Returns @@mode, i.e. the +mode+ argument given to the most recent call of
# define (nil when define was called without one).
def self.get_mode
  @@mode
end

.get_original_host_name ⇒ Object



150
151
152
# File 'lib/scrubyt/core/shared/extractor.rb', line 150

# Returns the class variable @@original_host_name.
#
# NOTE(review): none of the class methods shown on this page assign
# @@original_host_name; verify it is set elsewhere before this is called,
# otherwise reading it raises NameError (uninitialized class variable).
def self.get_original_host_name
  @@original_host_name
end

.method_missing(method_name, *args, &block) ⇒ Object

build the current wrapper



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/scrubyt/core/shared/extractor.rb', line 99

# Builds the current wrapper: turns otherwise-undefined class-level calls
# into navigation actions or patterns.
#
# method_name - name of the called method.
# args        - arguments of the call, forwarded to the navigation action
#               or to the new pattern.
# block       - optional block, given to the root pattern only.
#
# Dispatch, in order:
# * a name listed in NavigationActions::KEYWORDS is forwarded to
#   NavigationActions (without the block); nil is returned;
# * 'next_page' creates a pattern, sets up the URI builder on the current
#   evaluation context and stores that builder in @@next_patterns under the
#   last root pattern; the builder is returned;
# * any other name creates the root pattern of the current evaluation
#   context and returns it.
#
# Raises RuntimeError ("Only one root pattern allowed") when the current
# evaluation context already has a root pattern.
def self.method_missing(method_name, *args, &block)
  name = method_name.to_s

  if NavigationActions::KEYWORDS.include?(name)
    NavigationActions.send(method_name, *args)
    return
  end

  unless name == 'next_page'
    raise "Only one root pattern allowed" unless @@evaluation_context.root_pattern.nil?
    # Create a root pattern. It has no parent, so nil is passed explicitly
    # (the original passed a not-yet-assigned local, which is nil as well).
    @@evaluation_context.extractor = self
    new_root = Scrubyt::Pattern.new(name, args, @@evaluation_context, nil, &block)
    @@last_root_pattern = new_root
    @@evaluation_context.root_pattern = new_root
    return new_root
  end

  # 'next_page': register the URI builder for the last root pattern.
  next_page_pattern = Scrubyt::Pattern.new(name, args, @@evaluation_context)
  next_page_pattern.evaluation_context = @@evaluation_context
  @@evaluation_context.setup_uri_builder(next_page_pattern, args)
  @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
end