Class: Seep::Fetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/seep/fetcher.rb

Constant Summary collapse

AGENT =
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, options = {}) ⇒ Fetcher

Returns a new instance of Fetcher.



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/seep/fetcher.rb', line 8

# Builds a fetcher for +url+ and prepares the underlying Curl::Easy handle.
#
# @param url [Object] stored on the instance and passed to Curl::Easy.new
#   (presumably a URL string)
# @param options [Hash] optional settings
# @option options [Object] :user_agent value for the 'User-Agent' request
#   header; AGENT is used when absent or nil
# @option options [Object] :max_file_size limit compared against the running
#   size by the header/body callbacks; defaults to 1_048_576
# @option options [Object] :max_redirects redirect limit handed to the curl
#   handle; defaults to 5
def initialize(url, options = {})
  user_agent = options[:user_agent]    || AGENT
  size_limit = options[:max_file_size] || 1_048_576 # 1MB
  hop_limit  = options[:max_redirects] || 5

  self.url              = url
  self.request_headers  = { 'User-Agent' => user_agent }
  self.response_headers = {}
  self.max_file_size    = size_limit

  @curb = Curl::Easy.new(url)
  @curb.follow_location = true
  @curb.max_redirects   = hop_limit

  # Install the callbacks that collect headers/body on this handle.
  register_on_header!
  register_on_body!
end

Instance Attribute Details

#body ⇒ Object (readonly)

Returns the value of attribute body.



4
5
6
# File 'lib/seep/fetcher.rb', line 4

# Reader for @body. #open resets it to "" and the on_body callback appends
# every received chunk to it.
#
# @return [String, nil] current contents of @body (nil if never assigned)
def body; @body; end

#curb ⇒ Object (readonly)

Returns the value of attribute curb.



4
5
6
# File 'lib/seep/fetcher.rb', line 4

# Reader for @curb, the Curl::Easy handle created in #initialize.
#
# @return [Object, nil] current value of @curb
def curb; @curb; end

#max_file_size ⇒ Object

Returns the value of attribute max_file_size.



5
6
7
# File 'lib/seep/fetcher.rb', line 5

# Reader for @max_file_size, the limit the header/body callbacks compare the
# running size against. #initialize defaults it to 1_048_576.
#
# @return [Object, nil] current value of @max_file_size
def max_file_size; @max_file_size; end

#request_headers ⇒ Object

Returns the value of attribute request_headers.



6
7
8
# File 'lib/seep/fetcher.rb', line 6

# Reader for @request_headers, the hash #open assigns to the curl handle's
# headers before performing the request.
#
# @return [Hash, nil] current value of @request_headers
def request_headers; @request_headers; end

#response_headers ⇒ Object

Returns the value of attribute response_headers.



6
7
8
# File 'lib/seep/fetcher.rb', line 6

# Reader for @response_headers, the hash the on_header callback fills with
# stripped "name" => "value" pairs.
#
# @return [Hash, nil] current value of @response_headers
def response_headers; @response_headers; end

#size ⇒ Object (readonly)

Returns the value of attribute size.



4
5
6
# File 'lib/seep/fetcher.rb', line 4

# Reader for @size. #open resets it to 0; the header and body callbacks update
# it and set it to -1 once it exceeds max_file_size.
#
# @return [Integer, nil] current value of @size
def size; @size; end

#url ⇒ Object

Returns the value of attribute url.



5
6
7
# File 'lib/seep/fetcher.rb', line 5

# Reader for @url, the address given to #initialize.
#
# @return [Object, nil] current value of @url
def url; @url; end

Class Method Details

.open(url, options = {}) ⇒ Object



76
77
78
# File 'lib/seep/fetcher.rb', line 76

# Convenience entry point: instantiates a fetcher with the given arguments and
# immediately calls #open on it.
#
# @param url [Object] forwarded to .new
# @param options [Hash] forwarded to .new
# @return [Object] whatever the new instance's #open returns
def self.open(url, options = {})
  fetcher = new(url, options)
  fetcher.open
end

Instance Method Details

#content_type ⇒ Object



24
25
26
# File 'lib/seep/fetcher.rb', line 24

# Returns the raw Content-Type response header (including any parameters such
# as "; charset=...").
#
# HTTP field names are case-insensitive and the on_header callback stores them
# exactly as received, so when there is no 'Content-Type' key the lookup falls
# back to a case-insensitive search (e.g. 'content-type').
#
# @return [String, nil] the header value, or nil when no such header was stored
def content_type
  exact = response_headers['Content-Type']
  return exact unless exact.nil?

  _name, value = response_headers.find { |name, _| name.to_s.downcase == 'content-type' }
  value
end

#dest_url ⇒ Object



28
29
30
# File 'lib/seep/fetcher.rb', line 28

# URL the curl handle reports as last effective, falling back to the URL this
# fetcher was created with when the handle reports nothing.
#
# @return [Object] curb.last_effective_url if truthy, otherwise #url
def dest_url
  effective = curb.last_effective_url
  effective ? effective : url
end

#doc? ⇒ Boolean

Returns:

  • (Boolean)


68
69
70
# File 'lib/seep/fetcher.rb', line 68

# Tells whether the response was served as HTML.
#
# Only the media type is compared: parameters after ';' (for example
# "; charset=utf-8") and letter case are ignored, so the very common
# "text/html; charset=UTF-8" header counts as a document.
#
# @return [Boolean] true when the media type is text/html
def doc?
  media_type = content_type.to_s.split(';', 2).first.to_s.strip.downcase
  media_type == 'text/html'
end

#export(path) ⇒ Object



43
44
45
46
47
# File 'lib/seep/fetcher.rb', line 43

# Writes the fetched body to +path+, replacing any existing file.
#
# The file is opened in binary mode because the body may be an image or other
# non-text payload; text mode would apply newline/encoding conversion on some
# platforms and corrupt it.
#
# @param path [String] destination file path
# @return [Integer] number of bytes written (the result of IO#write)
def export(path)
  File.open(path, 'wb') do |file|
    file.write(@body)
  end
end

#ext ⇒ Object



49
50
51
52
53
54
55
56
57
58
# File 'lib/seep/fetcher.rb', line 49

# Maps the response's media type to a file extension.
#
# Parameters after ';' (e.g. "; charset=utf-8") and letter case are ignored so
# that headers like "text/html; charset=UTF-8" still resolve to ".html".
#
# @return [String] ".jpg", ".png", ".gif", ".html" or ".txt"; "" for any other
#   or missing content type
def ext
  media_type = content_type.to_s.split(';', 2).first.to_s.strip.downcase

  case media_type
  when 'image/jpeg' then '.jpg'
  when 'image/png'  then '.png'
  when 'image/gif'  then '.gif'
  when 'text/html'  then '.html'
  when 'text/plain' then '.txt'
  else ''
  end
end

#image? ⇒ Boolean

Returns:

  • (Boolean)


60
61
62
# File 'lib/seep/fetcher.rb', line 60

# Tells whether the response is an image that the image wrapper accepts.
#
# The content type is checked first; #to_image is only built when the header
# starts with "image". (The previous expression `!! content_type =~ /^image/`
# bound `!!` before `=~`, so it matched `true` against the regexp and could
# never succeed.)
#
# @return [Boolean] false when the content type is not image/*, otherwise the
#   result of to_image.valid?
def image?
  return false unless content_type.to_s =~ /\Aimage/i

  to_image.valid?
end

#inspect ⇒ Object



39
40
41
# File 'lib/seep/fetcher.rb', line 39

def inspect
  "#<Seep::Fetcher #{ content_type.nil? ? dest_url : content_type + ' ' + dest_url }>"
end

#open(redirect = 0) ⇒ Object



32
33
34
35
36
37
# File 'lib/seep/fetcher.rb', line 32

# Performs the request: clears the collected body and size, hands the request
# headers to the curl handle and runs the transfer.
#
# @param redirect [Integer] accepted but not used by this method
# @return [self] the fetcher itself, so calls can be chained
def open(redirect = 0)
  @body, @size = "", 0
  tap do
    curb.headers = request_headers
    curb.perform
  end
end

#register_on_body! ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
# File 'lib/seep/fetcher.rb', line 96

# Installs the on_body callback on the curl handle.
#
# Each received chunk is appended to @body and @size is updated to the total
# number of bytes collected. Sizes are measured with bytesize rather than
# length: the limit is expressed in bytes, and the callback's return value is
# compared by the transfer against the byte count of the chunk.
#
# The callback returns the chunk's byte count to continue, or -1 (also stored
# in @size) once the collected body exceeds max_file_size.
#
# @return [Object] whatever curb.on_body returns
def register_on_body!
  curb.on_body do |chunk|
    @body += chunk
    @size  = @body.bytesize
    if @size > max_file_size
      @size = -1
    else
      chunk.bytesize
    end
  end
end

#register_on_header! ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/seep/fetcher.rb', line 80

# Installs the on_header callback on the curl handle.
#
# Every "Name: value" line is split on the first ':', stripped, and stored in
# response_headers under the name exactly as received. Lines without a ':'
# (such as the status line) are not stored. A Content-Length header, matched
# case-insensitively because HTTP field names are case-insensitive, seeds
# @size so an oversized response is rejected before its body is downloaded.
#
# The callback returns the line's byte count to continue, or -1 (also stored
# in @size) when the announced size exceeds max_file_size.
#
# @return [Object] whatever curb.on_header returns
def register_on_header!
  curb.on_header do |line|
    name, value = line.split(':', 2)
    unless name.nil? || value.nil?
      name.strip!
      value.strip!
      @size = value.to_i if name.downcase == 'content-length'
      response_headers[name] = value
    end
    if @size > max_file_size
      @size = -1
    else
      line.bytesize
    end
  end
end

#to_doc ⇒ Object



72
73
74
# File 'lib/seep/fetcher.rb', line 72

# Wraps the fetched body in a Seep::Doc built from #url and #body. The wrapper
# is created on first use and cached in @doc for subsequent calls.
#
# @return [Seep::Doc] the cached document wrapper
def to_doc
  return @doc if @doc

  @doc = Seep::Doc.new(url, body)
end

#to_image ⇒ Object



64
65
66
# File 'lib/seep/fetcher.rb', line 64

# Wraps the fetched body in a Seep::Image. The wrapper is created on first use
# and cached in @image for subsequent calls.
#
# @return [Seep::Image] the cached image wrapper
def to_image
  return @image if @image

  @image = Seep::Image.new(body)
end