Module: Murlsh::UriAsk

Defined in:
lib/murlsh/uri_ask.rb

Overview

URI mixin.

Constant Summary collapse

# Matches a content type value that starts with an HTML or XHTML media type.
# The alternatives are grouped so the start-of-string anchor applies to both;
# without the group, 'application/xhtml+xml' would match anywhere in the value.
HtmlContentTypeRe =
%r{\A(?:text/html|application/xhtml\+xml)}

Instance Method Summary collapse

Instance Method Details

#content_length(options = {}) ⇒ Object

Get the content length.

Options:

  • :failproof - if true hide all exceptions and return empty string on failure

  • :headers - hash of headers to send in request



20
# File 'lib/murlsh/uri_ask.rb', line 20

# Look up the 'content-length' response header.
#
# +options+ is passed through unchanged to #header, whose result is
# returned as-is.
def content_length(options={})
  header('content-length', options)
end

#content_type(options = {}) ⇒ Object

Get the content type.

Options:

  • :failproof - if true hide all exceptions and return empty string on failure

  • :headers - hash of headers to send in request



27
# File 'lib/murlsh/uri_ask.rb', line 27

# Look up the 'content-type' response header.
#
# +options+ is passed through unchanged to #header, whose result is
# returned as-is.
def content_type(options={})
  header('content-type', options)
end

#decode(s) ⇒ Object

Convert from the character set of this url to utf-8 and decode HTML entities.



112
113
114
# File 'lib/murlsh/uri_ask.rb', line 112

# Convert +s+ from the encoding reported by #doc to utf-8, then decode any
# HTML entities in the converted text.
def decode(s)
  entity_decoder = HTMLEntities.new
  utf8_text = Iconv.conv('utf-8', doc.encoding, s)
  entity_decoder.decode(utf8_text)
end

#default_headers ⇒ Object

Default headers sent with the request.



93
94
95
96
97
98
99
100
101
102
103
# File 'lib/murlsh/uri_ask.rb', line 93

# Build the request headers sent by default.
#
# Always includes a browser-like User-Agent. When the host starts with
# www.nytimes.com a Referer header is added as well.
def default_headers
  headers = {
    'User-Agent' =>
      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.126 Safari/535.1'
  }
  headers['Referer'] = 'http://news.google.com/'  if host.to_s =~ /^www\.nytimes\.com/
  headers
end

#description(options = {}) ⇒ Object

Get the HTML meta description.

Options:

  • :failproof - if true hide all exceptions and return empty string on failure

  • :headers - hash of headers to send in request



51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/murlsh/uri_ask.rb', line 51

# Get the HTML meta description, cached after the first call.
#
# Returns the decoded description reported by #doc, or an empty string
# when there is no document or the document has no description.
def description(options={})
  return @description  if defined?(@description)

  # The empty default is stored before fetching, exactly as a fallback value.
  @description = ''
  page = doc(options)
  summary = page && page.description
  @description = decode(summary)  if summary && !summary.empty?
  @description
end

#doc(options = {}) ⇒ Object

Get the parsed Nokogiri doc at this url.

Options:

  • :failproof - if true hide all exceptions and return nil on failure

  • :headers - hash of headers to send in request



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/murlsh/uri_ask.rb', line 70

# Get the parsed Nokogiri doc at this url, cached after the first call.
#
# Options:
# [:failproof] passed to Murlsh::failproof, which wraps the fetch and parse
# [:headers] hash of headers to send in request, merged over #default_headers
#
# Returns the parsed document extended with Murlsh::Doc, or nil when the
# content type is not HTML or no document was produced.
def doc(options={})
  return @doc  if defined?(@doc)

  # Merge into a copy so the hash passed in by the caller is left untouched.
  request_options = options.merge(
    :headers => default_headers.merge(options.fetch(:headers, {})))

  @doc = nil
  if html?(request_options)
    Murlsh::failproof(request_options) do
      self.open(request_options[:headers]) do |f|
        data = f.read
        @doc = Nokogiri(data, to_s)
        # encoding unknown, reparse with f.charset, default to utf-8
        unless @doc.encoding
          @doc = Nokogiri(data, to_s, f.charset || 'utf-8')
        end
        @doc.extend(Murlsh::Doc)
      end
    end
  end

  @doc
end

#get_headers(options = {}) ⇒ Object

Get and cache response headers returned by HTTP GET for this URI.

Return hash values are single strings.

Options:

  • :failproof - if true hide all exceptions and return empty hash on failure

  • :headers - hash of headers to send in request



162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/murlsh/uri_ask.rb', line 162

# Get and cache the response headers returned by HTTP GET for this URI.
#
# Options:
# [:failproof] passed to Murlsh::failproof, which wraps the request
# [:headers] hash of headers to send in request, merged over #default_headers
#
# Returns the metadata hash yielded by the opened resource, or an empty
# hash when the request did not produce one.
def get_headers(options={})
  return @get_headers  if defined?(@get_headers)

  outgoing = default_headers.merge(options.fetch(:headers, {}))
  fetched = {}
  # open-uri is used rather than Net::HTTP because it handles redirects
  Murlsh::failproof(options) do
    fetched = self.open(outgoing, &:meta)
  end

  @get_headers = fetched
end

#head_headers(options = {}) ⇒ Object

Get and cache response headers returned by HTTP HEAD for this URI.

Return hash values are lists.

Options:

  • :failproof - if true hide all exceptions and return empty hash on failure

  • :headers - hash of headers to send in request



134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/murlsh/uri_ask.rb', line 134

# Get and cache the response headers returned by HTTP HEAD for this URI.
#
# Options:
# [:failproof] passed to Murlsh::failproof, which wraps the request
# [:headers] hash of headers to send in request, merged over #default_headers
#
# Returns the response's header hash when the response is a
# Net::HTTPSuccess, otherwise an empty hash.
def head_headers(options={})
  return @head_headers  if defined?(@head_headers)

  outgoing = default_headers.merge(options.fetch(:headers, {}))
  collected = {}
  Murlsh::failproof(options) do
    connection = Net::HTTP.new(host, port)
    connection.use_ssl = (scheme == 'https')

    # get_path_query is provided by the Murlsh::URIGetPathQuery mixin
    extend(Murlsh::URIGetPathQuery)
    reply = connection.request_head(get_path_query, outgoing)

    collected = reply.to_hash  if reply.is_a?(Net::HTTPSuccess)
  end

  @head_headers = collected
end

#header(header_name, options = {}) ⇒ Object

Get the value of a response header.

Options:

  • :failproof - if true hide all exceptions and return empty string on failure

  • :headers - hash of headers to send in request



121
122
123
124
125
# File 'lib/murlsh/uri_ask.rb', line 121

# Get the value of a response header as a string.
#
# The HEAD response is consulted first (taking the first value listed for
# the header); if that yields nothing, the GET response is used instead.
# Returns an empty string when neither response has the header.
def header(header_name, options={})
  from_head = Array(head_headers(options)[header_name]).first.to_s
  return from_head  unless from_head.empty?

  get_headers(options)[header_name].to_s
end

#html?(options = {}) ⇒ Boolean

Return true if the content type is HTML.

Returns:

  • (Boolean)


108
# File 'lib/murlsh/uri_ask.rb', line 108

# Return true if the content type matches HtmlContentTypeRe, false otherwise.
#
# +options+ is passed through unchanged to #content_type. The match result
# is converted to a real boolean so the predicate never returns the matched
# string or nil.
def html?(options={})
  !content_type(options)[HtmlContentTypeRe].nil?
end

#title(options = {}) ⇒ Object

Get the HTML title.

Options:

  • :failproof - if true hide all exceptions and return url on failure

  • :headers - hash of headers to send in request



34
35
36
37
38
39
40
41
42
43
44
# File 'lib/murlsh/uri_ask.rb', line 34

# Get the HTML title, cached after the first call.
#
# Returns the decoded title reported by #doc, or this object's string form
# when there is no document or the document has no title.
def title(options={})
  return @title  if defined?(@title)

  # The string form is stored first and serves as the fallback value.
  @title = to_s
  page = doc(options)
  heading = page && page.title
  @title = decode(heading)  if heading && !heading.empty?
  @title
end