Class: Waw::Crawler

Inherits:
Object show all
Includes:
Options
Defined in:
lib/waw/crawler.rb,
lib/waw/crawler/crawler_options.rb,
lib/waw/crawler/crawler_listener.rb

Defined Under Namespace

Modules: Options Classes: Listener

Constant Summary collapse

PINGED =
1
PENDING =
2
CHECKING =
4
CHECKED =
8

Instance Attribute Summary collapse

Attributes included from Options

#check_externals, #crawl_list, #listener, #ping_list

Instance Method Summary collapse

Methods included from Options

#ping_on, #set_default_options

Constructor Details

#initialize(root_uri = nil) ⇒ Crawler

Creates a crawler instance on a root URI



52
53
54
55
# File 'lib/waw/crawler.rb', line 52

# Creates a crawler instance on a root URI.
#
# root_uri - the URI the crawl starts from; optional (defaults to nil) and
#            stored through the root_uri= writer.
#
# After storing the root URI, the defaults provided by the included Options
# module are installed via set_default_options.
# NOTE(review): the writer and set_default_options are defined outside this
# listing; whether either depends on the other running first is not visible
# here, so keep the call order.
def initialize(root_uri = nil)
  self.root_uri = root_uri
  set_default_options
end

Instance Attribute Details

#agent ⇒ Object (readonly)

Mechanize agent instance



11
12
13
# File 'lib/waw/crawler.rb', line 11

# Mechanize agent instance used to fetch and ping URIs.
#
# #crawl assigns a fresh Mechanize instance at its start and sets the value
# back to nil when it finishes.
def agent
  @agent
end

#root_uri ⇒ Object

Root URI to crawl



14
15
16
# File 'lib/waw/crawler.rb', line 14

# Root URI to crawl.
#
# This is the first document fetched by #crawl, and its host and port are
# what #internal_uri? compares other URIs against.
def root_uri
  @root_uri
end

#stack ⇒ Object (readonly)

Stack of files/pages to visit



26
27
28
# File 'lib/waw/crawler.rb', line 26

# Documents that have been fetched and still have to be visited.
#
# #crawl removes entries from the front (shift) while #crawl_one appends to
# the end (push), so despite the name the collection is consumed in
# first-in, first-out order. It is nil when no crawl is in progress.
def stack
  @stack
end

#uristate ⇒ Object (readonly)

URI statuses



31
32
33
# File 'lib/waw/crawler.rb', line 31

# URI statuses.
#
# A Hash created by #crawl that maps a URI to an Integer bit mask built from
# PINGED, PENDING, CHECKING and CHECKED; reading an unknown URI stores and
# returns 0. It is nil when no crawl is in progress.
def uristate
  @uristate
end

Instance Method Details

#all_ping!(query, referer_page) ⇒ Object

Pinging



134
135
136
137
138
# File 'lib/waw/crawler.rb', line 134

# Pings every location on a page that matches the given query.
#
# query        - selector passed to referer_page.search.
# referer_page - page on which the locations are looked up; also reported
#                as the referer of each ping.
#
# Returns whatever the search result's #each returns.
def all_ping!(query, referer_page)
  targets = referer_page.search(query)
  targets.each { |target| ping!(target, referer_page) }
end

#check_web_page(page) ⇒ Object

Checking



121
122
123
124
125
126
127
128
129
130
# File 'lib/waw/crawler.rb', line 121

# Runs the ping and crawl checks for one fetched page.
#
# The page's URI is flagged CHECKING first; the checks themselves are handed
# to listener.checking as a block (the listener is expected to yield), and
# the URI is flagged CHECKED once that call returns.
#
# page - fetched page; must respond to #uri and support the searches done by
#        #all_ping! and #crawl_all.
#
# Returns the updated state flags of the page's URI.
def check_web_page(page)
  uristate[page.uri] |= CHECKING
  listener.checking(page) do
    # Joining an empty list yields "", which is not a usable selector to
    # hand to page.search; with nothing configured there is nothing to do.
    all_ping!(ping_list.join(', '), page) unless ping_list.empty?
    # Queue all crawlable links found on the page.
    crawl_all(crawl_list.join(', '), page) unless crawl_list.empty?
  end
  uristate[page.uri] |= CHECKED
end

#crawl ⇒ Object

Starts the crawling



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/waw/crawler.rb', line 74

# Starts the crawling.
#
# Fetches root_uri with a fresh Mechanize agent, then processes queued
# documents front to back until none remain: Mechanize pages are checked via
# #check_web_page (which may queue further documents), anything else is
# reported through listener.doc_skipped.
#
# The per-crawl state (agent, URI states, queue) is cleared in an ensure
# clause, so it is reset even when an error propagates out of the crawl
# instead of being left behind from the aborted run.
#
# Returns nil. Errors that the crawl does not handle propagate to the caller.
def crawl
  @agent = Mechanize.new
  @uristate = Hash.new { |states, uri| states[uri] = 0 }
  @stack = [agent.get(root_uri)]
  until stack.empty?
    to_check = stack.shift
    case to_check
    when ::Mechanize::Page
      check_web_page(to_check)
    else
      listener.doc_skipped(to_check)
    end
  end
  nil
ensure
  @agent = nil
  @uristate = nil
  @stack = nil
end

#crawl_all(query, referer_page) ⇒ Object



92
93
94
95
96
# File 'lib/waw/crawler.rb', line 92

# Hands every location on a page that matches the given query to #crawl_one.
#
# query        - selector passed to referer_page.search.
# referer_page - page on which the locations are looked up; also passed on
#                as the referer of each location.
#
# Returns whatever the search result's #each returns.
def crawl_all(query, referer_page)
  found = referer_page.search(query)
  found.each { |link| crawl_one(link, referer_page) }
end

#crawl_one(location, referer_page) ⇒ Object



98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/waw/crawler.rb', line 98

# Considers a single location found on a page for crawling.
#
# The location is resolved against the referring page. URIs that are already
# PENDING, CHECKING or CHECKED are ignored. Otherwise the URI is marked
# PENDING and, when internal, fetched and appended to the list of documents
# to visit; external URIs are only reported via listener.crawl_skipped.
#
# location     - the href/src value (or node) found on the page.
# referer_page - the page the location was found on.
#
# Any StandardError raised along the way is passed to #handle_error.
def crawl_one(location, referer_page)
  target = resolve_uri(location, referer_page)

  # Nothing to do for URIs that are already queued or further along.
  return if uristate[target] >= PENDING

  pending!(target)

  # External documents are reported but never fetched for crawling.
  return listener.crawl_skipped(referer_page, location) unless internal_uri?(target)

  fetched = agent.get(target)
  stack.push(fetched)
rescue => ex
  handle_error(ex, referer_page, location)
end

#handle_error(ex, referer_page, loc) ⇒ Object

Handles errors that occur



162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/waw/crawler.rb', line 162

# Handles errors that occur while pinging or crawling a location.
#
# Known failure types are reported to the listener:
#   Mechanize::ResponseCodeError      -> listener.reach_failure
#   Mechanize::UnsupportedSchemeError -> listener.scheme_failure
#   SocketError                       -> listener.socket_error
# Any other exception is raised again.
#
# ex           - the rescued exception.
# referer_page - page on which the failing location was found.
# loc          - the failing location.
def handle_error(ex, referer_page, loc)
  # The classes are tested one after the other so that a later constant is
  # only looked up when the earlier ones did not match.
  if ex.is_a?(Mechanize::ResponseCodeError)
    listener.reach_failure(referer_page, loc, ex)
  elsif ex.is_a?(Mechanize::UnsupportedSchemeError)
    listener.scheme_failure(referer_page, loc, ex)
  elsif ex.is_a?(SocketError)
    listener.socket_error(referer_page, loc, ex)
  else
    raise ex
  end
end

#internal_uri?(uri) ⇒ Boolean

Returns true if a given page is internal to the website currently crawled

Returns:

(Boolean)



61
62
63
# File 'lib/waw/crawler.rb', line 61

# Returns true if a given URI is internal to the website currently crawled.
#
# A URI without a host (a relative reference) is always internal. Otherwise
# it is internal when both host and port match those of root_uri. Host names
# are compared case-insensitively, so "EXAMPLE.com" and "example.com"
# designate the same site.
#
# uri - a URI-like object responding to #host and #port.
#
# Returns true or false.
def internal_uri?(uri)
  host = uri.host
  return true if host.nil?

  root_host = root_uri.host
  return false if root_host.nil?

  host.downcase == root_host.downcase && uri.port == root_uri.port
end

#pending!(uri) ⇒ Object

Marks a URI as currently pending



40
41
42
# File 'lib/waw/crawler.rb', line 40

# Marks a URI as currently pending by setting the PENDING bit in its state.
# Bits that are already set are kept.
#
# Returns the updated state flags.
def pending!(uri)
  states = uristate
  states[uri] = states[uri] | PENDING
end

#ping!(loc, referer_page) ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/waw/crawler.rb', line 140

# Pings a single location found on a page.
#
# The location is resolved against the referring page. Only URIs whose state
# is still below PINGED (i.e. never pinged, queued or checked) are
# considered. Internal URIs - and external ones when check_externals is
# set - receive a HEAD request, are marked PINGED and reported through
# listener.ping_ok; other URIs are reported through listener.ping_skipped.
#
# loc          - the href/src value (or node) found on the page.
# referer_page - the page the location was found on.
#
# Any StandardError raised along the way is passed to #handle_error.
def ping!(loc, referer_page)
  target = resolve_uri(loc, referer_page)

  if uristate[target] < PINGED
    # External URIs are bypassed unless checking them was requested.
    pingable = internal_uri?(target) || check_externals
    return listener.ping_skipped(referer_page, loc) unless pingable

    agent.head(target) # the actual ping
    pinged!(target)
    listener.ping_ok(referer_page, loc)
  end
rescue => ex
  handle_error(ex, referer_page, loc)
end

#pinged!(uri) ⇒ Object

Marks a URI as having been pinged



45
46
47
# File 'lib/waw/crawler.rb', line 45

# Marks a URI as having been pinged by setting the PINGED bit in its state.
# Bits that are already set are kept.
#
# Returns the updated state flags.
def pinged!(uri)
  states = uristate
  states[uri] = states[uri] | PINGED
end

#resolve_uri(href_or_src, page) ⇒ Object

Resolves as an absolute URI something that has been found on a page



67
68
69
# File 'lib/waw/crawler.rb', line 67

# Resolves as an absolute URI something that has been found on a page.
#
# href_or_src - the reference found on the page.
# page        - the page it was found on, used as the base for resolution.
#
# The resolution itself is delegated to the agent's resolve method, which is
# invoked through send so that it is reachable even if it is not public.
#
# Returns the parsed URI.
def resolve_uri(href_or_src, page)
  absolute = agent.send(:resolve, href_or_src, page)
  URI.parse(absolute)
end