Class: RDig::HttpDocument

Inherits:
Document show all
Defined in:
lib/rdig/documents.rb

Overview

Remote Document to be retrieved by HTTP

Instance Attribute Summary collapse

Attributes inherited from Document

#content, #content_type, #uri

Instance Method Summary collapse

Methods inherited from Document

#body, create, #has_content?, #links, #needs_indexing?, #title, #to_s

Constructor Details

#initialize(args = {}) ⇒ HttpDocument

uri: URI of this document; may be relative to the referring document or host. referrer: URI of the document this link was retrieved from. depth: distance of this document from a start URL (defaults to 0).



116
117
118
119
120
# File 'lib/rdig/documents.rb', line 116

# Sets up a remote document. +args+ is handed to the superclass constructor
# unchanged; in addition two keys are read here:
#
# :referrer:: stored as the referring URI (see #referring_uri)
# :depth::    stored as the crawl depth; 0 when missing or nil/false
def initialize(args = {})
  super(args)
  referrer, start_depth = args.values_at(:referrer, :depth)
  @referring_uri = referrer
  @depth = start_depth || 0
end

Instance Attribute Details

#depth ⇒ Object (readonly)

Counts how far this document is from one of the start URLs. Used to limit crawling by depth.



105
106
107
# File 'lib/rdig/documents.rb', line 105

# How far this document is from one of the start URLs; used to limit
# crawling by depth. The constructor stores args[:depth] here, falling back
# to 0, and #create_child hands depth + 1 to each child document.
def depth
  @depth
end

#etag ⇒ Object (readonly)

Returns the value of attribute etag.



108
109
110
# File 'lib/rdig/documents.rb', line 108

# The 'etag' entry of the response metadata, recorded by #fetch when the
# server answers with status 200. No other visible code assigns it, so it is
# nil for a document that has not been fetched successfully.
def etag
  @etag
end

#referring_uri ⇒ Object (readonly)

Returns the value of attribute referring_uri.



106
107
108
# File 'lib/rdig/documents.rb', line 106

# URI of the document in which the link to this document was found, as
# passed to the constructor under :referrer (nil when none was given).
def referring_uri
  @referring_uri
end

#status ⇒ Object (readonly)

Returns the value of attribute status.



107
108
109
# File 'lib/rdig/documents.rb', line 107

# Outcome of the last #fetch: :success after a 200 response, :redirect when
# the final URI differed from the requested one. #fetch leaves it untouched
# for any other response or when fetching raised, so it may be nil.
def status
  @status
end

Instance Method Details

#create_child(uri) ⇒ Object



110
111
112
# File 'lib/rdig/documents.rb', line 110

# Builds the HttpDocument for a link found in this document.
#
# The child records this document's URI as its referrer and sits one level
# deeper in the crawl. Links matching the file:// pattern (case-insensitive)
# are not followed from an HTTP document; nil is returned for those.
def create_child(uri)
  return nil if uri =~ /^file:\/\//i

  HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => depth + 1)
end

#fetch ⇒ Object



122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/rdig/documents.rb', line 122

# Retrieves this document over HTTP and records the outcome in @status,
# @content and @etag.
#
# * When the URI the response finally came from differs from the requested
#   one, @status becomes :redirect and @content holds that final URI.
# * On a 200 response, @etag is read from the response metadata, @content is
#   whatever ContentExtractors.process returns for the body, and @status
#   becomes :success.
# * Any other status is only logged; @status is left as it was.
#
# Errors raised while fetching (StandardError and subclasses) are logged as
# warnings and not propagated. @content is never nil afterwards: it falls
# back to an empty Hash.
#
# The request goes through OpenURI.open_uri instead of Kernel#open.
# Kernel#open interprets a string starting with "|" as a command to spawn,
# which is unsafe for link text taken from crawled pages, and it stopped
# handling URLs altogether in Ruby 3.0.
def fetch
  url = @uri.to_s
  RDig.logger.debug "fetching #{url}"
  OpenURI.open_uri(url, RDig::open_uri_http_options) do |doc|
    if url != doc.base_uri.to_s
      @status = :redirect
      @content = doc.base_uri
    else
      case doc.status.first.to_i
      when 200
        @etag = doc.meta['etag']
        @content = ContentExtractors.process(doc.read, doc.content_type)
        @status = :success
      when 404
        # NOTE(review): open-uri is documented to raise OpenURI::HTTPError for
        # 4xx responses, so this branch is presumably never reached - confirm.
        RDig.logger.info "got 404 for #{@uri}"
      else
        RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
      end
    end
  end
rescue => e
  RDig.logger.warn "error fetching #{url}: #{e}"
ensure
  # Callers can rely on @content being set even after a failed fetch.
  @content ||= {}
end