Module: FeedTools::UriHelper

Defined in:
lib/feed_tools/helpers/uri_helper.rb

Overview

Generic url processing methods needed in numerous places throughout FeedTools

Class Method Summary collapse

Class Method Details

.build_tag_uri(url, date) ⇒ Object

Converts a url into a tag uri



176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/feed_tools/helpers/uri_helper.rb', line 176

# Converts a url into a tag uri of the form
# "tag:<host>,<YYYY-MM-DD>:<everything in the url after the host>".
#
# url::  the String url to convert.  It is passed through normalize_url
#        before anything else is done with it.
# date:: the Time whose date, formatted with strftime('%Y-%m-%d'), is
#        embedded in the tag uri.
#
# Any '#' in the url is replaced by '/' in the result.
#
# Raises ArgumentError if url is not a String, if date is not a Time, if
# the normalized url is rejected by is_uri?, or if the normalized url has
# no host component (for example a mailto: uri).
def self.build_tag_uri(url, date)
  unless url.kind_of? String
    raise ArgumentError, "Expected String, got #{url.class.name}"
  end
  unless date.kind_of? Time
    raise ArgumentError, "Expected Time, got #{date.class.name}"
  end
  normalized = normalize_url(url)
  unless FeedTools::UriHelper.is_uri?(normalized)
    raise ArgumentError, "Must supply a valid URL."
  end
  host = URI.parse(normalized).host
  # A uri without an authority part has a nil host.  Without this guard the
  # String#index(host) call below fails with an unrelated TypeError.
  if host.nil?
    raise ArgumentError, "Must supply a URL that has a host."
  end
  # Work on copies (gsub, not gsub!) so the normalized string is left alone.
  remainder = normalized.gsub(/^(http|ftp|file):\/*/, "").gsub(/#/, "/")
  # Everything after the first occurrence of the host becomes the
  # "specific" part of the tag uri.
  after_host = remainder[(remainder.index(host) + host.size)..-1]
  return "tag:#{host},#{date.strftime('%Y-%m-%d')}:#{after_host}"
end

.build_urn_uri(url) ⇒ Object

Converts a url into a urn:uuid: uri



196
197
198
199
200
201
202
203
# File 'lib/feed_tools/helpers/uri_helper.rb', line 196

# Converts a url into a urn:uuid: uri.
#
# url:: the String url to convert.  It is run through normalize_url and the
#       result is handed to UUID.sha1_create together with
#       UUID_URL_NAMESPACE.
#
# Returns the generated UUID's uri form as a String.
# Raises ArgumentError if url is not a String.
def self.build_urn_uri(url)
  if !url.kind_of?(String)
    raise ArgumentError, "Expected String, got #{url.class.name}"
  end
  canonical_url = normalize_url(url)
  # uuidtools is only loaded once a urn is actually requested.
  require 'uuidtools'
  uuid = UUID.sha1_create(UUID_URL_NAMESPACE, canonical_url)
  return uuid.to_uri.to_s
end

.idn_enabled? ⇒ Boolean

Returns true if the idn module can be used.

Returns:

  • (Boolean)


33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/feed_tools/helpers/uri_helper.rb', line 33

# Returns true if the idn module can be used.
#
# FeedTools.configurations[:idn_enabled] acts as an override: when it is
# exactly false, idn is reported as unavailable without probing for it.
#
# Only a positive probe result is reused from @idn_enabled.  While the
# stored value is nil or false, every call tries to load and exercise the
# idn library again.
def self.idn_enabled?
  return false if FeedTools.configurations[:idn_enabled] == false
  return @idn_enabled if @idn_enabled

  @idn_enabled = false
  begin
    require 'idn'
    # Sanity check: the library has to encode a known hostname correctly
    # before it is trusted.
    encoded = IDN::Idna.toASCII('http://www.詹姆斯.com/')
    @idn_enabled = true if encoded == "http://www.xn--8ws00zhy3a.com/"
  rescue LoadError
    # idn is not installed; features that rely on it stay disabled.
    @idn_enabled = false
  end
  return @idn_enabled
end

.is_uri?(url) ⇒ Boolean

Returns true if the parameter appears to be a valid uri

Returns:

  • (Boolean)


206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/feed_tools/helpers/uri_helper.rb', line 206

# Returns true if the parameter appears to be a valid uri.
#
# A url counts as a uri when URI.parse accepts it and the parsed result has
# a non-blank scheme.  nil, anything URI.parse rejects with
# URI::InvalidURIError, and scheme-less urls all yield false.
def self.is_uri?(url)
  if url.nil?
    return false
  end
  begin
    parsed = URI.parse(url)
  rescue URI::InvalidURIError
    return false
  end
  return !parsed.scheme.blank?
end

.normalize_url(url) ⇒ Object

Attempts to ensure that the passed url is valid and sane. Accepts very, very ugly urls and makes every effort to figure out what they were supposed to be. Also translates from the feed: and rss: pseudo-protocols to the http: protocol.



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/feed_tools/helpers/uri_helper.rb', line 61

# Attempts to ensure that the passed url is valid and sane.  Accepts very
# ugly urls and makes every effort to figure out what was meant.  Also
# translates the feed:, rss: and mms: pseudo-protocols to http:.
#
# url:: the url to clean up.  Non-String values are converted with to_s.
#
# Returns:
# * nil if url is nil, or if the url collapses to a bare "http://"
# * "" if url is blank
# * "#" if the url contains "javascript:" anywhere (case-insensitive)
# * otherwise the rewritten url, run once more through
#   FeedTools::URI.parse(...).normalize.to_s
#
# The early parsing attempts are best effort: any StandardError they raise
# is ignored and the next fallback is used.  The scheme check near the end
# and the final parse are not guarded, so an exception raised by
# FeedTools::URI.parse at those points propagates to the caller.
def self.normalize_url(url)
  return nil if url.nil?
  url = url.to_s unless url.kind_of?(String)
  return "" if url.blank?
  normalized_url = url.strip

  # First guess: treat the input as a path.  Failure just means it is not
  # one, so the stripped input is kept.  Only StandardError is rescued so
  # that interrupts and exit requests are not swallowed.
  begin
    normalized_url =
      FeedTools::URI.convert_path(normalized_url.strip).normalize.to_s
  rescue StandardError
  end

  # Second guess: parse as a uri.  Fall back to the CGI-unescaped input,
  # and finally to the plain stripped input.
  begin
    normalized_url =
      FeedTools::URI.parse(normalized_url.strip).normalize.to_s
  rescue StandardError
    begin
      normalized_url = CGI.unescape(url.strip)
    rescue StandardError
      normalized_url = url.strip
    end
  end

  # if a url begins with the '/' character, it only makes sense that they
  # meant to be using a file:// url.  Fix it for them.
  if normalized_url.length > 0 && normalized_url[0..0] == "/"
    normalized_url = "file://" + normalized_url
  end

  # if a url begins with a drive letter followed by a colon, we're looking at
  # a file:// url.  Fix it for them.
  if normalized_url.length > 0 &&
      normalized_url.scan(/^[a-zA-Z]:[\\\/]/).size > 0
    normalized_url = "file:///" + normalized_url
  end

  # javascript: urls are quite possibly an attempt at doing something
  # malicious, so they are neutralised.  Note that the pattern is not
  # anchored: "javascript:" anywhere in the url triggers this.
  if normalized_url.downcase =~ /javascript:/
    return "#"
  end

  # deal with all of the many ugly possibilities involved in the rss:
  # and feed: pseudo-protocols.  The order of these rewrites matters: each
  # one operates on the output of the previous one.
  normalized_url.gsub!(/^htp:\/*/i, "http://")
  normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
  normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
  normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
  normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
  normalized_url.gsub!(/^file:\/*/i, "file:///")
  normalized_url.gsub!(/^https:\/*/i, "https://")
  normalized_url.gsub!(/^mms:\/*/i, "http://")
  # fix (very) bad urls (usually of the user-entered sort)
  normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")
  normalized_url.gsub!(/^http:\/*$/i, "")

  if (normalized_url =~ /^file:/i) == 0
    # Adjust windows-style urls
    normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
    normalized_url.gsub!(/\\/, '/')
  else
    # A scheme-less url that contains a dot is assumed to be a bare
    # hostname or host/path, so http:// is put in front of it.
    if FeedTools::URI.parse(normalized_url).scheme.nil? &&
        normalized_url =~ /\./
      normalized_url = "http://" + normalized_url
    end
    if normalized_url == "http://"
      return nil
    end
  end

  # A fragment-only or query-only reference must not keep a fake scheme.
  normalized_url.gsub!(/^https?:\/\/#/i, "#")
  normalized_url.gsub!(/^https?:\/\/\?/i, "?")

  normalized_url =
    FeedTools::URI.parse(normalized_url.strip).normalize.to_s
  return normalized_url
end

.resolve_relative_uri(relative_uri, base_uri_sources = []) ⇒ Object

Resolves a relative uri



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/feed_tools/helpers/uri_helper.rb', line 151

# Resolves a relative uri against the first usable base uri.
#
# relative_uri::     the uri to resolve; converted with to_s before being
#                    joined to the base.
# base_uri_sources:: candidate base uris.  nil entries and entries whose
#                    scheme is "file" are ignored, and the candidate picked
#                    by FeedTools::XmlHelper.select_not_blank from the rest
#                    is used as the base.  The array passed in is not
#                    modified.
#
# Returns:
# * relative_uri unchanged if base_uri_sources is blank
# * nil if relative_uri is nil
# * the base uri itself, normalized, if relative_uri.to_s is empty
# * otherwise the resolved uri, passed through normalize_url
# * relative_uri unchanged if anything raises a StandardError on the way
def self.resolve_relative_uri(relative_uri, base_uri_sources=[])
  return relative_uri if base_uri_sources.blank?
  return nil if relative_uri.nil?
  begin
    # Massive HACK to get around file protocol URIs being used to
    # resolve relative URIs on feeds in the local file system.
    # Better to leave these URIs unresolved and hope some other
    # tool resolves them correctly.
    #
    # reject (not reject!) keeps the caller's array intact and also works
    # when that array is frozen.
    usable_sources = base_uri_sources.reject do |candidate|
      candidate.nil? ||
        FeedTools::URI.parse(candidate).scheme == "file"
    end
    base_uri = FeedTools::URI.parse(
      FeedTools::XmlHelper.select_not_blank(usable_sources))
    resolved_uri = base_uri
    if relative_uri.to_s != ''
      resolved_uri = base_uri + relative_uri.to_s
    end
    return FeedTools::UriHelper.normalize_url(resolved_uri.to_s)
  rescue StandardError
    # Best effort: an unresolvable uri is handed back as it came in.
    return relative_uri
  end
end