Class: Tango::App

Inherits:
Object
  • Object
show all
Defined in:
lib/tango/app.rb

Overview

Tango application

Author:

  • Mckomo

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(config: {}, link_stack: nil, dispatcher: nil, cache: nil, http_client: nil, parser: nil, db_locker: nil, logger: nil) ⇒ Tango::App

Parameters:

  • link_stack (Tango::LinkStack) (defaults to: nil)
  • dispatcher (Tango::Etl::Dispatcher) (defaults to: nil)
  • cache (Tango::Resources::Cache) (defaults to: nil)
  • http_client (Object) (defaults to: nil)

    Must implement get method

  • parser (Object) (defaults to: nil)

    Must implement parse method

  • db_locker (DatabaseLocker) (defaults to: nil)
  • logger (Logger) (defaults to: nil)


20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/tango/app.rb', line 20

# Build a Tango application.
#
# Every collaborator may be injected; any that is omitted (or nil) is
# replaced by a default instance constructed here.
#
# @param config [Hash] application settings; 'target_url' is read here to
#   build the default link stack
# @param link_stack [Tango::LinkStack, nil] defaults to a LinkStack built from config['target_url']
# @param dispatcher [Object, nil] defaults to ETL::Dispatcher.new
# @param cache [Object, nil] defaults to a Resource::Cache wrapping a new Resource::Buffer
# @param http_client [Object, nil] must implement get; defaults to HTTParty
# @param parser [Object, nil] must implement parse; defaults to Nokogiri::HTML
# @param db_locker [DatabaseLocker, nil] defaults to a DatabaseLocker over Multidb.databases
# @param logger [Logger, nil] defaults to a Logger writing to STDOUT
def initialize( config: {}, link_stack: nil, dispatcher: nil, cache: nil, http_client: nil, parser: nil, db_locker: nil, logger: nil )

  # Set config
  @config = config

  # Set dependencies, falling back to defaults for anything not injected
  @link_stack = link_stack || LinkStack.new( config['target_url'] )
  @dispatcher = dispatcher || ETL::Dispatcher.new
  @cache = cache || Resource::Cache.new( Resource::Buffer.new )
  @http_client = http_client || HTTParty
  @parser = parser || Nokogiri::HTML
  @db_locker = db_locker || DatabaseLocker.new( Multidb.databases )
  @logger = logger || Logger.new( STDOUT )

  # Registries start out as empty Arrays: register_model and
  # register_operator append to them with <<.
  @models = []
  @operators = []

end

Instance Attribute Details

#config ⇒ Object (readonly)

Returns the value of attribute config.



10
11
12
# File 'lib/tango/app.rb', line 10

# Reader for @config, which the constructor sets from its config: keyword.
def config
  @config
end

#dispatcher ⇒ Object (readonly)

Returns the value of attribute dispatcher.



10
11
12
# File 'lib/tango/app.rb', line 10

# Reader for @dispatcher: the injected dispatcher, or the ETL::Dispatcher
# the constructor created when none was given.
def dispatcher
  @dispatcher
end

#link_stack ⇒ Object (readonly)

Returns the value of attribute link_stack.



10
11
12
# File 'lib/tango/app.rb', line 10

# Reader for @link_stack: the injected link stack, or the LinkStack the
# constructor built from config['target_url'] when none was given.
def link_stack
  @link_stack
end

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



10
11
12
# File 'lib/tango/app.rb', line 10

# Reader for @logger: the injected logger, or the STDOUT Logger the
# constructor created when none was given.
def logger
  @logger
end

Instance Method Details

#after ⇒ Object

Filter run after Tango execution



48
49
# File 'lib/tango/app.rb', line 48

# Hook called by #run once crawling has finished and the cache buffer has
# been released, just before the database is locked. Does nothing here.
# NOTE(review): presumably meant to be overridden by subclasses — confirm.
def after
end

#before ⇒ Object

Filter run before Tango execution



44
45
# File 'lib/tango/app.rb', line 44

# Hook called by #run after the cache has been loaded and before the crawl
# loop starts. Does nothing here.
# NOTE(review): presumably meant to be overridden by subclasses — confirm.
def before
end

#register_handler(handler) ⇒ Array

Register a new handler with the dispatcher

Parameters:

Returns:

  • (Array)


71
72
73
# File 'lib/tango/app.rb', line 71

# Hand a handler over to the dispatcher.
#
# @param handler [Object] forwarded untouched to @dispatcher.register
# @return [Object] whatever @dispatcher.register returns (the page
#   documents this as an Array)
def register_handler(handler)
  @dispatcher.register(handler)
end

#register_model(model) ⇒ Array

Register a new model

Parameters:

  • model (Symbol)

Returns:

  • (Array)


55
56
57
# File 'lib/tango/app.rb', line 55

# Add a model to the list of registered models.
#
# @param model [Symbol] model to append to @models
# @return [Array] the @models list, including the newly added entry
def register_model(model)
  @models.push(model)
end

#register_operator(operator) ⇒ Array

Register a new resource operator

Parameters:

Returns:

  • (Array)


63
64
65
# File 'lib/tango/app.rb', line 63

# Add a resource operator to the list of registered operators.
#
# @param operator [Object] operator to append to @operators
# @return [Array] the @operators list, including the newly added entry
def register_operator(operator)
  @operators.push(operator)
end

#run ⇒ Nil

Run ETL process

Parameters:

  • link_stack (Tango::LinkStack)
  • dispatcher (Tango::Etl::Dispatcher)
  • cache (Tango::Resources::Cache)
  • http_client (Object)

    Must implement get method

  • parser (Object)

    Must implement parse method

  • logger (Logger)

Returns:

  • (Nil)


84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/tango/app.rb', line 84

# Run the whole ETL pass: prepare the database and cache, crawl every link
# on the link stack, then release buffers, lock the database and close the
# logger.
#
# Takes no arguments; all collaborators come from instance variables set by
# the constructor.
#
# @return [Object] the result of @logger.close (the page documents it as nil)
def run
  # Remember when the run began so elapsed time can be reported at the end
  started_at = Time.now

  @logger.info "Running Tango v.#{Tango::VERSION} ..."
  @logger.info "Target: #{@link_stack.host}."

  # Work against the database the locker reports as unlocked
  pick_database(@db_locker.unlocked)
  @logger.info "Using database '#{@db_locker.unlocked}'."

  @logger.info "Truncating non persistent models ..."
  truncate_tables(non_persistent_models)

  # Prepare the cache with the registered operators, then fill it for
  # persistent models
  @logger.info "Loading cache ..."
  setup_cache(@operators)
  load_cache(persistent_models)

  @logger.info "Running before callback ..."
  before

  # Number of links whose handler ran without raising
  succeeded = 0
  @logger.info "Tango starts crawling ..."

  while @link_stack.has_links?
    link = @link_stack.shift

    # A link nobody can handle is logged and dropped
    handler_klass = @dispatcher.find_handler(link)
    unless handler_klass
      @logger.error "No handler for link: #{link}."
      next
    end

    # Download the page; a failed request is logged and the link is dropped
    begin
      response = @http_client.get(@link_stack.host + link)
    rescue StandardError => e
      @logger.error "Could not download contents of #{@link_stack.host + link} link."
      @logger.error e.message
      next
    end

    # Only 200 and 201 responses are processed further
    unless [200, 201].include?(response.code)
      @logger.error "Response code for link #{link} is #{response.code}. Only codes 200 and 201 are accepted."
      next
    end

    document = @parser.parse(response.body)
    handler = handler_klass.new(link, document, @cache)

    # Links discovered by the handler are queued before it is triggered
    @link_stack.append(handler.links)

    # Only the trigger call itself is guarded; the success bookkeeping below
    # runs outside the rescue
    triggered =
      begin
        handler.trigger
        true
      rescue StandardError => e
        @logger.error "Link: #{link}. Handler had some troubles."
        @logger.error e.message
        @logger.error e.backtrace.join("\n")
        false
      end

    if triggered
      succeeded += 1
      @logger.debug "Link: #{link}. Handler triggered successfully."
    end

    # Pause between requests to give the crawled server time to breathe
    sleep(@config["sleep"] || 0)
  end

  @logger.info "Releasing buffers ..."
  release_buffer(@cache.buffer)

  @logger.info "Running after callback ..."
  after

  # Lock the database that was used during this run
  lock_database(@db_locker.unlocked)

  finished_at = Time.now

  @logger.info "Tango crawled #{succeeded}/#{@link_stack.shifted} links successfully."
  @logger.info "Start time: #{started_at}, end time: #{finished_at}, time elapsed: #{finished_at - started_at} seconds."

  @logger.close
end