Class: Redback
- Inherits: Object
- Defined in: lib/redback.rb
Instance Method Summary
- #crawl_page(url, limit = 10) ⇒ Object
- #find_links(doc, url) ⇒ Object
- #initialize(url, &each_site) ⇒ Redback (constructor): A new instance of Redback.
- #queue_link(url) ⇒ Object
- #spider(&block) ⇒ Object
Constructor Details
#initialize(url, &each_site) ⇒ Redback
Returns a new instance of Redback.
# File 'lib/redback.rb', line 8

def initialize(url, &each_site)
  if url =~ /^(([a-zA-Z]|[a-zA-Z][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z]|[A-Za-z][A-Za-z0-9\-]*[A-Za-z0-9])$/
    url = 'http://' + url
  end

  @uri = URI.parse(url)

  @pages_hit = 0

  @visited  = []
  @to_visit = []

  @each_site = each_site

  @options = {
    :ignore_hash => true,
    :ignore_query_string => false,
    :search_in_comments => false,
    :threads => 4,
    :num_pages => 1000
  }

  crawl_page(url)
  spider
end
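A minimal usage sketch, assuming the gem is installed and required as redback (example.com is a placeholder host): constructing a Redback starts the crawl immediately and calls the supplied block once for every page fetched.

require 'redback'

Redback.new("http://example.com") do |url|
  puts url   # invoked once per crawled page
end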
Instance Method Details
#crawl_page(url, limit = 10) ⇒ Object
# File 'lib/redback.rb', line 38

def crawl_page(url, limit = 10)
  # Don't crawl a page twice
  return if @visited.include? url

  # Let's not hit this again
  @visited << url

  begin
    uri = URI.parse(URI.encode(url.to_s.strip))
  rescue
    return
  end

  headers = {
    "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31",
    "Accept-Charset" => "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  }

  begin
    req = Net::HTTP::Get.new(uri.path, headers)
    response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }

    case response
    when Net::HTTPRedirection
      return crawl_page(response['location'], limit - 1)
    when Net::HTTPSuccess
      doc = Hpricot(response.body)
    end
  rescue
    return
  end

  @pages_hit += 1
  @each_site.call url

  find_links(doc, url) do |link|
    next if @visited.include? link
    next if @to_visit.include? link
    @to_visit << link
  end
end
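The fetch-and-follow-redirects pattern used above can be sketched in isolation. The fetch helper and the URL below are illustrative, not part of the gem, and like crawl_page this sketch speaks plain HTTP only.

require 'net/http'
require 'uri'

# Illustrative helper: fetch a page body, following up to `limit` redirects.
def fetch(url, limit = 10)
  return nil if limit <= 0

  uri = URI.parse(url)
  req = Net::HTTP::Get.new(uri.request_uri)
  response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(req) }

  case response
  when Net::HTTPRedirection then fetch(response['location'], limit - 1)
  when Net::HTTPSuccess     then response.body
  end
end

puts fetch("http://example.com")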
#find_links(doc, url) ⇒ Object
# File 'lib/redback.rb', line 83

def find_links(doc, url)
  return unless doc.respond_to? 'search'

  begin
    uri = URI.parse(URI.encode(url.to_s.strip))
  rescue
    return
  end

  hrefs = []

  # Looks like a valid document! Let's parse it for links
  doc.search("//a[@href]").each do |e|
    hrefs << e.get_attribute("href")
  end

  if @options[:search_in_comments]
    # Let's also look for commented-out URIs
    doc.search("//comment()").each do |e|
      e.to_html.scan(/https?:\/\/[^\s\"]*/) { |url| hrefs << url; }
    end
  end

  hrefs.each do |href|
    # Skip mailto links
    next if href =~ /^mailto:/

    # If we're dealing with a host-relative URL (e.g. <img src="/foo/bar.jpg">), absolutify it.
    if href.to_s =~ /^\//
      href = uri.scheme + "://" + uri.host + href.to_s
    end

    # If we're dealing with a path-relative URL, make it relative to the current directory.
    unless href.to_s =~ /[a-z]+:\/\//
      # Take everything up to the final / in the path to be the current directory.
      if uri.path =~ /\//
        /^(.*)\//.match(uri.path)
        path = $1
      # If we're on the homepage, then we don't need a path.
      else
        path = ""
      end

      href = uri.scheme + "://" + uri.host + path + "/" + href.to_s
    end

    # At this point, we should have an absolute URL regardless of
    # its original format.

    # Strip hash links
    if ( @options[:ignore_hash] )
      href.gsub!(/(#.*?)$/, '')
    end

    # Strip query strings
    if ( @options[:ignore_query_string] )
      href.gsub!(/(\?.*?)$/, '')
    end

    begin
      href_uri = URI.parse(href)
    rescue
      # No harm in this — if we can't parse it as a URI, it probably isn't one
      # (`javascript:` links, etc.) and we can safely ignore it.
      next
    end

    next if href_uri.host != uri.host
    next unless href_uri.scheme =~ /^https?$/

    yield href
  end
end
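The relative-to-absolute href handling above can be approximated with URI#merge. This is a rough equivalent for illustration only (the gem builds URLs by string concatenation and can also strip fragments and query strings); the base URL and href list are placeholders.

require 'uri'

base = URI.parse("http://example.com/blog/post.html")

["/about", "next.html", "https://other.example/x", "mailto:hi@example.com"].each do |href|
  # Skip mailto links, as find_links does.
  next if href.start_with?("mailto:")

  abs = base.merge(href)

  # Keep only same-host http(s) links, mirroring the final filters above.
  next unless %w[http https].include?(abs.scheme) && abs.host == base.host

  puts abs
end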
#queue_link(url) ⇒ Object
# File 'lib/redback.rb', line 34

def queue_link(url)
  @to_visit << url
end
#spider(&block) ⇒ Object
# File 'lib/redback.rb', line 156

def spider(&block)
  Parallel.in_threads(@options[:threads]) { |thread_number|
    # We've crawled too many pages
    next if @pages_hit > @options[:num_pages] && @options[:num_pages] >= 0

    while @to_visit.length > 0 do
      begin
        url = @to_visit.pop
      end while ( @visited.include? url )

      crawl_page(url, block)
    end
  }
end
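spider drains the shared @to_visit queue from several workers via the parallel gem. The same pattern can be sketched with plain threads and a Queue; the worker count and URLs below are illustrative, not the gem's implementation.

to_visit = Queue.new
%w[http://example.com/a http://example.com/b http://example.com/c].each { |u| to_visit << u }

workers = Array.new(4) do
  Thread.new do
    loop do
      url = begin
        to_visit.pop(true)   # non-blocking pop; raises ThreadError when the queue is empty
      rescue ThreadError
        break
      end
      puts "crawling #{url}"  # stands in for crawl_page(url) in the real spider
    end
  end
end

workers.each(&:join)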