Class: RUtilAnts::URLAccess::URLHandlers::HTTP

Inherits:
Object
  • Object
show all
Defined in:
lib/rUtilAnts/URLHandlers/HTTP.rb

Overview

Handler of HTTP URLs

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(iURL) ⇒ HTTP

Constructor

Parameters
  • iURL (String): The URL that this handler will manage



24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/rUtilAnts/URLHandlers/HTTP.rb', line 24

# Constructor
#
# Splits the URL into @URLProtocol, @URLServer and @URLPath. When the URL cannot be
# parsed, a bug is logged and these three attributes are left unset.
#
# Parameters::
# * *iURL* (_String_): The URL that this handler will manage
def initialize(iURL)
  @URL = iURL
  # First try "protocol://server/path", then fall back on "protocol://server" (no path at all).
  lURLMatch = iURL.match(/^(http|https):\/\/([^\/]*)\/(.*)$/) ||
              iURL.match(/^(http|https):\/\/(.*)$/)
  if lURLMatch.nil?
    log_bug "URL #{iURL} was identified as an http like, but it appears to be false."
  else
    @URLProtocol = lURLMatch[1]
    @URLServer = lURLMatch[2]
    # The fallback regexp has no 3rd group: default to an empty path so that
    # String operations performed later on @URLPath never receive nil.
    @URLPath = lURLMatch[3] || ''
  end
end

Class Method Details

.get_matching_regexps ⇒ Object

Get a list of regexps matching the URL to get to this handler

Return
  • list<Regexp>: The list of regexps matching URLs from this handler



14
15
16
17
18
# File 'lib/rUtilAnts/URLHandlers/HTTP.rb', line 14

# Get a list of regexps matching the URL to get to this handler
#
# Return::
# * <em>list<Regexp></em>: The list of regexps matching URLs from this handler
def self.get_matching_regexps
  # Anything starting with the http:// or https:// scheme is handled here.
  lHTTPLikeURL = /^(http|https):\/\/.*$/
  [lHTTPLikeURL]
end

Instance Method Details

#get_content(iFollowRedirections) ⇒ Object

Get the content of the URL

Parameters
  • iFollowRedirections (Boolean): Do we follow redirections while accessing the content?

Return
  • Integer: Type of content returned

  • Object: The content, depending on the type previously returned:

    • Exception if CONTENT_ERROR: The corresponding error

    • String if CONTENT_REDIRECT: The new URL

    • String if CONTENT_STRING: The real content

    • String if CONTENT_LOCALFILENAME: The name of the local file storing the content

    • String if CONTENT_LOCALFILENAME_TEMPORARY: The name of the temporary local file storing the content



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/rUtilAnts/URLHandlers/HTTP.rb', line 77

# Get the content of the URL
#
# Issues a single GET request on the server. https URLs are accessed through SSL,
# and an explicit port given in the server part of the URL ("host:8080") is honoured.
# Errors (StandardError) raised while accessing the URL are not propagated: they are
# returned as CONTENT_ERROR.
#
# Parameters::
# * *iFollowRedirections* (_Boolean_): Do we follow redirections while accessing the content?
# Return::
# * _Integer_: Type of content returned (this handler returns CONTENT_ERROR, CONTENT_REDIRECT or CONTENT_STRING)
# * _Object_: The content, depending on the type previously returned:
#   * _Exception_ if CONTENT_ERROR: The corresponding error
#   * _String_ if CONTENT_REDIRECT: The new URL
#   * _String_ if CONTENT_STRING: The real content
def get_content(iFollowRedirections)
  rContentFormat = nil
  rContent = nil

  begin
    require 'net/http'
    # The server part may embed an explicit port ("host:8080"): separate it from the host name.
    lHost, lExplicitPort = @URLServer.to_s.match(/\A(.*?)(?::(\d+))?\z/m)[1..2]
    lUseSSL = (@URLProtocol == 'https')
    # Without explicit port, use the default one of the protocol.
    lPort = lExplicitPort ? lExplicitPort.to_i : (lUseSSL ? 443 : 80)
    Net::HTTP.start(lHost, lPort, :use_ssl => lUseSSL) do |iHTTPConnection|
      # Some websites filter out the default user agent (commons.mediawiki.org for example). Set another one.
      lResponse = iHTTPConnection.request_get("/#{@URLPath}", {'User-Agent' => 'RUtilAnts'})
      if iFollowRedirections && lResponse.is_a?(Net::HTTPRedirection)
        # We access the file through a new URL
        lLocation = lResponse['location']
        if lLocation.nil?
          # A redirection that does not say where to go can't be followed
          rContent = RuntimeError.new("Access error to #{@URL}: #{lResponse.code} without location.")
          rContentFormat = CONTENT_ERROR
        else
          if lLocation.match(/^(ftp|ftps|http|https):\/\/(.*)$/).nil?
            # Relative redirection: rebuild an absolute URL from the current one
            if lLocation.start_with?('/')
              lLocation = "#{@URLProtocol}://#{@URLServer}#{lLocation}"
            else
              # to_s protects against URLs that were given without any path
              lLocation = "#{@URLProtocol}://#{@URLServer}/#{File.dirname(@URLPath.to_s)}/#{lLocation}"
            end
          end
          rContent = lLocation
          rContentFormat = CONTENT_REDIRECT
        end
      elsif lResponse.is_a?(Net::HTTPOK)
        # We have the web page
        rContent = lResponse.body
        rContentFormat = CONTENT_STRING
      else
        # An error occurred
        rContent = RuntimeError.new("Access error to #{@URL}: #{lResponse.code}.")
        rContentFormat = CONTENT_ERROR
      end
    end
  rescue StandardError => lError
    # Only report regular errors (network, SSL, timeouts...). Interrupt, SystemExit
    # and the like are not access errors and must reach the caller untouched.
    rContent = lError
    rContentFormat = CONTENT_ERROR
  end

  return rContentFormat, rContent
end

#get_corresponding_file_base_name ⇒ Object

Get a corresponding file base name. This method has to make sure file extensions are respected, as it can be used for further processing.

Return
  • String: The file name



59
60
61
62
63
# File 'lib/rUtilAnts/URLHandlers/HTTP.rb', line 59

# Get a corresponding file base name.
# This method has to make sure file extensions are respected, as it can be used for further processing.
#
# Return::
# * _String_: The file name
def get_corresponding_file_base_name
  # TODO: Handle the case where there is no base name (ie. www.google.com instead of www.google.com/index.html)
  # Drop everything from the first #, ? or ; so that the extension is the real end of the name
  lPathWithoutSuffix = @URLPath.gsub(/^([^#\?;]*).*$/,'\1')
  lBaseName = File.basename(lPathWithoutSuffix)
  get_valid_file_name(lBaseName)
end

#get_crc ⇒ Object

Get the current CRC of the URL

Return
  • Integer: The CRC



49
50
51
52
# File 'lib/rUtilAnts/URLHandlers/HTTP.rb', line 49

# Get the current CRC of the URL
#
# Return::
# * _Integer_: The CRC
def get_crc
  # HTTP URLs are considered definitive, so their CRC is a constant that never changes.
  0
end

#get_server_id ⇒ Object

Get the server ID

Return
  • String: The server ID



41
42
43
# File 'lib/rUtilAnts/URLHandlers/HTTP.rb', line 41

# Get the server ID
#
# Return::
# * _String_: The server ID, built as "protocol://server"
def get_server_id
  lProtocol = @URLProtocol
  lServer = @URLServer
  "#{lProtocol}://#{lServer}"
end