Module: ExternalUrl

Defined in:
lib/api_helpers/external_url.rb

Constant Summary collapse

# Headers sent with every outbound GET issued by fetch_response. A browser-style
# User-Agent is supplied because some merchants are more comfortable serving a
# client that identifies itself that way. Frozen so the shared hash cannot be
# mutated by accident between requests.
REQUEST_HEADERS =
{
  'User-Agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
}.freeze

Class Method Summary collapse

Class Method Details

.fetch_response(url, limit = 10, debug = false) ⇒ Object

Returns a hash containing a :success flag. If true, the hash also contains the :response (and thus response.body) and the :final_uri (which may differ from the requested URL, e.g. after a redirect). If false, a :message is set along with the :final_uri.



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/api_helpers/external_url.rb', line 32

# Fetches +url+ with an HTTP GET, following redirects, and reports the outcome
# as a hash instead of raising.
#
# @param url [#to_s] URL to fetch; it is checked with invalid_uri before any
#   request is made
# @param limit [Integer] number of requests still allowed; every followed
#   redirect consumes one
# @param debug [Boolean] when true, traces progress to stdout and, on success,
#   dumps the body through ExternalUrl.to_file
# @return [Hash] always contains :success, :response and :final_uri. Every
#   failure additionally contains a :message. :final_uri is the URL of the
#   last request that was attempted.
#
# Only StandardError (and subclasses) is converted into a failure hash, so
# interrupts, SystemExit and similar process-level exceptions still propagate.
def self.fetch_response(url, limit = 10, debug = false)
  # `<= 0` (rather than `== 0`) so a negative limit cannot recurse without bound.
  if limit <= 0
    return {:success => false, :response => nil, :message => "Redirected too many times", :final_uri => url}
  end

  message = self.invalid_uri(url)
  if message
    return {:success => false, :response => nil, :message => message, :final_uri => url}
  end

  # NOTE(review): URI.safe_parse is defined elsewhere in the project; this code
  # assumes it returns an HTTP(S) URI object for anything invalid_uri accepted.
  uri = URI.safe_parse(url.to_s)

  # Honour the URL's own port and scheme instead of always talking plain HTTP on port 80.
  http_request = Net::HTTP.new(uri.host, uri.port)
  http_request.use_ssl = true if uri.scheme.to_s.downcase == 'https'
  puts "http request: #{http_request.inspect}" if debug

  # request_uri is path + query ("/" for a bare host) with no port or fragment.
  no_host_url = uri.request_uri
  puts "http request to: #{no_host_url}" if debug

  # Adding user agent header helps some merchants feel more comfortable with our bot
  response = http_request.get(no_host_url, REQUEST_HEADERS)
  puts "http response: #{response.inspect}" if debug

  case response
  when Net::HTTPSuccess
    if debug
      puts "Success, final url: #{url}"
      ExternalUrl.to_file(url, response.body, "html")
      ExternalUrl.to_file(url, ExternalUrl.just_page_content(response.body), "txt")
    end
    {:success => true, :response => response, :final_uri => url}
  when Net::HTTPRedirection
    location = response['location']
    # Some redirection-class responses carry no Location header; there is nothing to follow.
    unless location
      return {:success => false, :response => response, :message => "Redirect without Location header", :final_uri => url}
    end
    redirect_url = to_absolute_url(location, url)
    puts "Redirecting to #{redirect_url}" if debug
    self.fetch_response(redirect_url, limit - 1, debug)
  else
    # Include the status line so failures always carry a :message, as documented.
    {:success => false, :response => response, :message => "#{response.code} #{response.message}", :final_uri => url}
  end
rescue StandardError => exp
  {:success => false, :response => nil, :message => exp.message, :final_uri => url}
end

.just_page_content(page) ⇒ Object

Note: This method is only used now by the Validator, and is only suitable for the Validator.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/api_helpers/external_url.rb', line 10

# Reduces an HTML page to rough plain text by applying, in order:
#   1. removal of HTML comments, except a comment whose following text (up to
#      40 characters) contains "</script" -- those are left in place because
#      JavaScript frequently has good matches for model numbers, etc.;
#   2. replacement of <style>...</style> blocks with a single space;
#   3. removal of anything that looks like a tag (from "<" to the next ">");
#   4. replacement of entities ("&" up to the next ";") with a single space;
#   5. collapsing of each whitespace run to its last whitespace character.
#
# @param page [String] raw HTML
# @return [String] the stripped text
def self.just_page_content(page)
  comment_and_trailer = /(<!--.*?-->)(.{1,40})/m
  style_block = /<style.*?<\/style>/im
  any_tag = /<.*?>/im
  entity = /&.*?;/
  whitespace_run = /(\s)+/im

  # A comment is only matched when at least one character follows it; that
  # trailing text is always kept, the comment itself only next to "</script".
  without_comments = page.gsub(comment_and_trailer) do
    found = Regexp.last_match
    trailer = found[2]
    trailer.include?('</script') ? found[1] + trailer : trailer
  end

  without_styles = without_comments.gsub(style_block, ' ')
  without_tags = without_styles.gsub(any_tag, '')
  without_entities = without_tags.gsub(entity, ' ')
  without_entities.gsub(whitespace_run, '\1')
end