Class: Waw::Crawler
- Includes:
- Options
- Defined in:
- lib/waw/crawler.rb,
lib/waw/crawler/crawler_options.rb,
lib/waw/crawler/crawler_listener.rb
Defined Under Namespace
Modules: Options Classes: Listener
Constant Summary collapse
- PINGED =
1
- PENDING =
2
- CHECKING =
4
- CHECKED =
8
Instance Attribute Summary collapse
-
#agent ⇒ Object
readonly
Mechanize agent instance.
-
#root_uri ⇒ Object
Root URI to crawl.
-
#stack ⇒ Object
readonly
Stack of files/pages to visit.
-
#uristate ⇒ Object
readonly
URI statuses.
Attributes included from Options
#check_externals, #crawl_list, #listener, #ping_list
Instance Method Summary collapse
-
#all_ping!(query, referer_page) ⇒ Object
Pings every location matched by the query on the referer page.
-
#check_web_page(page) ⇒ Object
Checks a page: pings its ping-list targets, then crawls its crawl-list links.
-
#crawl ⇒ Object
Starts the crawling.
- #crawl_all(query, referer_page) ⇒ Object
- #crawl_one(location, referer_page) ⇒ Object
-
#handle_error(ex, referer_page, loc) ⇒ Object
Handles errors that occur.
-
#initialize(root_uri = nil) ⇒ Crawler
constructor
Creates a crawler instance on a root URI.
-
#internal_uri?(uri) ⇒ Boolean
Returns true if a given page is internal to the website currently crawled.
-
#pending!(uri) ⇒ Object
Marks a URI as currently pending.
- #ping!(loc, referer_page) ⇒ Object
-
#pinged!(uri) ⇒ Object
Marks a URI as having been pinged.
-
#resolve_uri(href_or_src, page) ⇒ Object
Resolves as an absolute URI something that has been found on a page.
Methods included from Options
#ping_on, #set_default_options
Constructor Details
#initialize(root_uri = nil) ⇒ Crawler
Creates a crawler instance on a root URI
52 53 54 55 |
# File 'lib/waw/crawler.rb', line 52
# Builds a crawler, optionally bound to a root URI straight away.
#
# root_uri - value handed unchanged to the root_uri= writer (defaults to nil).
def initialize(root_uri = nil)
  self.root_uri = root_uri
end
Instance Attribute Details
#agent ⇒ Object (readonly)
Mechanize agent instance
11 12 13 |
# File 'lib/waw/crawler.rb', line 11
# Reader for @agent, the Mechanize agent instance.
# #crawl assigns it when it starts and sets it back to nil when it finishes.
def agent
  @agent
end
#root_uri ⇒ Object
Root URI to crawl
14 15 16 |
# File 'lib/waw/crawler.rb', line 14
# Reader for @root_uri, the root URI to crawl.
def root_uri
  @root_uri
end
#stack ⇒ Object (readonly)
Stack of files/pages to visit
26 27 28 |
# File 'lib/waw/crawler.rb', line 26
# Reader for @stack, the list of fetched files/pages still to visit.
# #crawl consumes it with shift; #crawl_one appends to it with push.
def stack
  @stack
end
#uristate ⇒ Object (readonly)
URI statuses
31 32 33 |
# File 'lib/waw/crawler.rb', line 31
# Reader for @uristate, the table of URI statuses.
# #crawl builds it as a Hash whose missing keys default to 0; the
# PINGED/PENDING/CHECKING/CHECKED flags are OR-ed into its values.
def uristate
  @uristate
end
Instance Method Details
#all_ping!(query, referer_page) ⇒ Object
Pings every location matched by the query on the referer page
134 135 136 137 138 |
# File 'lib/waw/crawler.rb', line 134
# Pings every node that +query+ selects on +referer_page+.
#
# query        - selector passed unchanged to referer_page.search
# referer_page - page being inspected; also forwarded to ping! as the referer
#
# Returns whatever +each+ on the search result returns.
def all_ping!(query, referer_page)
  matches = referer_page.search(query)
  matches.each { |node| ping!(node, referer_page) }
end
#check_web_page(page) ⇒ Object
Checks a page: pings its ping-list targets, then crawls its crawl-list links
121 122 123 124 125 126 127 128 129 130 |
# File 'lib/waw/crawler.rb', line 121
# Checks one fetched page.
#
# Flags the page's URI as CHECKING, then, inside the listener's +checking+
# block, pings everything matched by the ping list and crawls everything
# matched by the crawl list. Once the listener call returns, the URI is
# additionally flagged as CHECKED.
#
# page - the page to check; must respond to +uri+ and is handed to
#        all_ping! / crawl_all as the referer page.
def check_web_page(page)
  uristate[page.uri] |= CHECKING
  listener.checking(page) do
    # Ping checks run first...
    ping_query = ping_list.join(', ')
    all_ping!(ping_query, page)

    # ...and only then are the page's links crawled.
    crawl_query = crawl_list.join(', ')
    crawl_all(crawl_query, page)
  end
  uristate[page.uri] |= CHECKED
end
#crawl ⇒ Object
Starts the crawling
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/waw/crawler.rb', line 74
# Starts the crawling.
#
# Sets up the per-crawl state: a new Mechanize agent, a URI-state table
# whose entries default to 0, and a work list seeded with the fetched root
# page. Entries are then taken from the front of the work list until it is
# empty: Mechanize::Page entries go through check_web_page, anything else is
# reported through listener.doc_skipped.
#
# The per-crawl state is reset to nil in an +ensure+ clause, so an exception
# raised while crawling no longer leaves a stale agent, state table and work
# list behind on the instance. The exception itself still propagates.
#
# Returns nil.
def crawl
  @agent = Mechanize.new
  @uristate = Hash.new { |states, uri| states[uri] = 0 }
  @stack = [agent.get(root_uri)]
  until stack.empty?
    to_check = stack.shift
    case to_check
    when ::Mechanize::Page
      check_web_page(to_check)
    else
      listener.doc_skipped(to_check)
    end
  end
ensure
  # Runs on both normal completion and failure.
  @agent = nil
  @uristate = nil
  @stack = nil
end
#crawl_all(query, referer_page) ⇒ Object
92 93 94 95 96 |
# File 'lib/waw/crawler.rb', line 92
# Runs crawl_one on every node that +query+ selects on +referer_page+.
#
# query        - selector passed unchanged to referer_page.search
# referer_page - page being inspected; also forwarded to crawl_one as the
#                referer
#
# Returns whatever +each+ on the search result returns.
def crawl_all(query, referer_page)
  matches = referer_page.search(query)
  matches.each { |node| crawl_one(node, referer_page) }
end
#crawl_one(location, referer_page) ⇒ Object
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# File 'lib/waw/crawler.rb', line 98
# Schedules a single location found on +referer_page+ for crawling.
#
# The location is resolved to a URI first. URIs whose state is already
# PENDING or beyond are ignored. Otherwise the URI is flagged PENDING and
# then either fetched and pushed on the stack (internal URIs) or reported
# through listener.crawl_skipped (everything else).
#
# Any StandardError raised along the way, including during resolution and
# fetching, is routed to handle_error.
def crawl_one(location, referer_page)
  target = resolve_uri(location, referer_page)

  # PENDING, CHECKING and CHECKED are all >= PENDING, so this lets through
  # only URIs that are unseen or merely PINGED.
  return unless uristate[target] < PENDING

  pending!(target)
  if internal_uri?(target)
    # The page is fetched right now; the stack holds fetched documents.
    stack.push(agent.get(target))
  else
    listener.crawl_skipped(referer_page, location)
  end
rescue => ex
  handle_error(ex, referer_page, location)
end
#handle_error(ex, referer_page, loc) ⇒ Object
Handles errors that occur
162 163 164 165 166 167 168 169 170 171 172 173 |
# File 'lib/waw/crawler.rb', line 162
# Dispatches an exception raised while pinging or crawling to the listener.
#
# ex           - the rescued exception
# referer_page - page on which the failing location was found
# loc          - the failing location as found on the page
#
# Mechanize::ResponseCodeError     -> listener.reach_failure
# Mechanize::UnsupportedSchemeError -> listener.scheme_failure
# SocketError                       -> listener.socket_error
#
# Any other exception is raised again.
def handle_error(ex, referer_page, loc)
  return listener.reach_failure(referer_page, loc, ex) if ex.is_a?(Mechanize::ResponseCodeError)
  return listener.scheme_failure(referer_page, loc, ex) if ex.is_a?(Mechanize::UnsupportedSchemeError)
  return listener.socket_error(referer_page, loc, ex) if ex.is_a?(SocketError)

  raise ex
end
#internal_uri?(uri) ⇒ Boolean
Returns true if a given page is internal to the website currently crawled
61 62 63 |
# File 'lib/waw/crawler.rb', line 61
# Returns true if the given URI is internal to the website being crawled.
#
# A URI counts as internal when it has no host at all, or when both its
# host and its port equal those of root_uri. The scheme is not compared.
#
# uri - object responding to +host+ and +port+
#
# Uses &&/|| rather than the low-precedence and/or keywords, which are meant
# for control flow and are easy to misread in boolean expressions.
def internal_uri?(uri)
  uri.host.nil? || (uri.host == root_uri.host && uri.port == root_uri.port)
end
#pending!(uri) ⇒ Object
Marks a URI as currently pending
40 41 42 |
# File 'lib/waw/crawler.rb', line 40
# Flags a URI as pending by OR-ing the PENDING bit into its entry in
# uristate. Flags already recorded for the URI are kept.
#
# Returns the updated state value.
def pending!(uri)
  states = uristate
  states[uri] = states[uri] | PENDING
end
#ping!(loc, referer_page) ⇒ Object
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
# File 'lib/waw/crawler.rb', line 140
# Pings a single location found on +referer_page+.
#
# The location is resolved to a URI first. Nothing happens when the URI's
# state is already PINGED or beyond. Otherwise:
# * internal URIs, and external ones when check_externals is truthy, get a
#   HEAD request through the agent, are flagged PINGED and reported through
#   listener.ping_ok;
# * remaining URIs are reported through listener.ping_skipped and are not
#   flagged.
#
# Any StandardError raised along the way is routed to handle_error; in that
# case the URI is not flagged PINGED either.
def ping!(loc, referer_page)
  target = resolve_uri(loc, referer_page)

  # Every recorded flag is >= PINGED, so only untouched URIs get through.
  return if uristate[target] >= PINGED

  # Externals are bypassed unless checking them was asked for.
  return listener.ping_skipped(referer_page, loc) unless internal_uri?(target) || check_externals

  agent.head(target)
  pinged!(target)
  listener.ping_ok(referer_page, loc)
rescue => ex
  handle_error(ex, referer_page, loc)
end
#pinged!(uri) ⇒ Object
Marks an URI as being pinged
45 46 47 |
# File 'lib/waw/crawler.rb', line 45
# Flags a URI as pinged by OR-ing the PINGED bit into its entry in
# uristate. Flags already recorded for the URI are kept.
#
# Returns the updated state value.
def pinged!(uri)
  states = uristate
  states[uri] = states[uri] | PINGED
end
#resolve_uri(href_or_src, page) ⇒ Object
Resolves as an absolute URI something that has been found on a page
67 68 69 |
# File 'lib/waw/crawler.rb', line 67
# Resolves something found on a page (an href or src) to an absolute URI.
#
# href_or_src - the reference as found on the page
# page        - the page it was found on
#
# Returns the result of URI.parse applied to what the agent's +resolve+
# returns.
def resolve_uri(href_or_src, page)
  # NOTE(review): +resolve+ is invoked through send, presumably because it
  # is not part of the agent's public API - confirm against the Mechanize
  # version in use.
  resolved = agent.send(:resolve, href_or_src, page)
  URI.parse(resolved)
end