Module: SL::URL

Included in:
SL
Defined in:
lib/searchlink/url.rb

Overview

URL module: helpers for validating URLs, following redirects, retrieving page titles, and building Amazon affiliate links.

Class Method Summary

Class Method Details

.amazon_affiliatize(url, amazon_partner) ⇒ Object



# File 'lib/searchlink/url.rb', line 110

def amazon_affiliatize(url, amazon_partner)
  return url if amazon_partner.nil? || amazon_partner.empty?

  return [url, ""] unless url =~ %r{https?://(?<subdomain>.*?)amazon.com/(?:(?<title>.*?)/)?(?<type>[dg])p/(?<id>[^?]+)}

  m = Regexp.last_match
  sd = m["subdomain"]
  title = m["title"].gsub(/-/, " ")
  t = m["type"]
  id = m["id"]
  ["https://#{sd}amazon.com/#{t}p/#{id}/?ref=as_li_ss_tl&ie=UTF8&linkCode=sl1&tag=#{amazon_partner}", title]
end
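
For illustration, a call with a made-up ASIN and partner tag might look like:

  SL::URL.amazon_affiliatize("https://www.amazon.com/Some-Product-Name/dp/B000000000", "mytag-20")
  # => ["https://www.amazon.com/dp/B000000000/?ref=as_li_ss_tl&ie=UTF8&linkCode=sl1&tag=mytag-20", "Some Product Name"]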

.follow_redirects(url, limit = 5) ⇒ Object



# File 'lib/searchlink/url.rb', line 8

def follow_redirects(url, limit = 5)
  return url if limit.zero?

  uri = URI.parse(url)
  response = Net::HTTP.get_response(uri)

  case response
  when Net::HTTPSuccess
    response.uri.to_s
  when Net::HTTPRedirection
    follow_redirects(response["location"], limit - 1)
  else
    url
  end
end
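
As a sketch, resolving a hypothetical shortened link:

  SL::URL.follow_redirects("https://amzn.to/example123")
  # returns the final resolved URL after following up to 5 redirects;
  # non-redirect errors fall back to the URL reached at that point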

.only_url?(input) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/searchlink/url.rb', line 68

def only_url?(input)
  input =~ %r{(?i)^((http|https)://)?([\w\-_]+(\.[\w\-_]+)+)([\w\-.,@?^=%&:/~+#]*[\w\-@^=%&/~+#])?$}
end
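
The match returns an index (truthy) or nil, so it can be used directly in conditionals. For example, with made-up inputs:

  SL::URL.only_url?("example.com/some/page")  # => truthy (looks like a bare URL)
  SL::URL.only_url?("not a bare url")         # => nil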

.ref_title_for_url(url) ⇒ Object



# File 'lib/searchlink/url.rb', line 72

def ref_title_for_url(url)
  url = URI.parse(url) if url.is_a?(String)

  parts = url.hostname.split(/\./)
  domain = if parts.count > 1
      parts.slice(-2, 1).join("")
    else
      parts.join("")
    end

  path = url.path.split(%r{/}).last
  if path
    path = path.gsub(/-/, " ").sub(/\.\w{2,4}$/, "")
  else
    path = domain
  end

  path.length > domain.length ? path : domain
end
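
A rough example with a made-up URL:

  SL::URL.ref_title_for_url("https://www.example.com/my-sample-post.html")
  # => "my sample post" (the path-derived title wins because it is longer than "example")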

.title(url) ⇒ Object



# File 'lib/searchlink/url.rb', line 123

def title(url)
  title = nil

  ## Gather proving too inexact
  # gather = false
  # ['/usr/local/bin', '/opt/homebrew/bin'].each do |root|
  #   if File.exist?(File.join(root, 'gather')) && File.executable?(File.join(root, 'gather'))
  #     gather = File.join(root, 'gather')
  #     break
  #   end
  # end

  # if gather
  #   cmd = %(#{gather} --title-only '#{url.strip}' --fallback-title 'Unknown')
  #   title = SL::Util.exec_with_timeout(cmd, 15)
  #   if title
  #     title = title.strip.gsub(/\n+/, ' ').gsub(/ +/, ' ')
  #     title.remove_seo!(url) if SL.config['remove_seo']
  #     return title.remove_protocol
  #   else
  #     SL.add_error('Error retrieving title', "Gather timed out on #{url}")
  #     SL.notify('Error retrieving title', 'Gather timed out')
  #   end
  # end

  begin
    if url =~ %r{https://(amzn.to|(www\.)?amazon\.com)/}
      final_url = follow_redirects(url)
      m = final_url.match(%r{https://www.amazon.com/(.*?)/dp/})
      title = if m
          m[1].gsub(/-/, " ")
        else
          url.remove_protocol
        end
      return title
    end

    page = Curl::Html.new(url)

    title = page.title

    if title.nil? || title =~ /^\s*$/
      SL.add_error("Title not found", "Warning: missing title for #{url.strip}")
      title = url.gsub(%r{(^https?://|/.*$)}, "").gsub(/-/, " ").strip
    else
      title = title.gsub(/\n/, " ").gsub(/\s+/, " ").strip # .sub(/[^a-z]*$/i,'')
      title.remove_seo!(url) if SL.config["remove_seo"]
    end
    title.gsub!(/\|/, "")
    title.remove_seo!(url.strip) if SL.config["remove_seo"]
    title.remove_protocol
  rescue StandardError
    SL.add_error("Error retrieving title", "Error determining title for #{url.strip}")
    url.remove_protocol
  end
end
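
A minimal usage sketch (the actual result depends on the page markup and the remove_seo setting):

  SL::URL.title("https://example.com/article")
  # fetches the page with Curl::Html and returns its cleaned <title> text,
  # falling back to a hostname-derived string when no title can be found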

.url?(input) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/searchlink/url.rb', line 64

def url?(input)
  input =~ %r{^(#.*|https?://\S+|/\S+|\S+/|[^!]\S+\.\S+)(\s+".*?")?$}
end
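
Like only_url?, this returns a match index or nil. A couple of hypothetical inputs:

  SL::URL.url?('https://example.com/page "A title"')  # => truthy
  SL::URL.url?("plain text, no link here")            # => nil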


.url_to_link(url, type) ⇒ Object

# File 'lib/searchlink/url.rb', line 92

def url_to_link(url, type)
  input = url.dup

  if only_url?(input)
    input.sub!(%r{(?mi)^(?!https?://)(.*?)$}, 'https://\1')
    url = URI.parse(input.downcase)

    title = if type == :ref_title
        ref_title_for_url(url)
      else
        title(url.to_s) || input.sub(%r{^https?://}, "")
      end

    return [url.to_s, title] if url.hostname
  end
  false
end
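
An illustrative call with a bare domain-and-path string:

  SL::URL.url_to_link("example.com/sample-page", :ref_title)
  # => ["https://example.com/sample-page", "sample page"]
  SL::URL.url_to_link("not a url", :ref_title)
  # => false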

.valid_link?(uri_str, limit = 5) ⇒ Boolean

Validates that a link exists and returns 200

Returns:

  • (Boolean)


# File 'lib/searchlink/url.rb', line 25

def valid_link?(uri_str, limit = 5)
  return false unless uri_str

  SL.notify("Validating", uri_str)
  return false if limit.zero?

  url = URI(uri_str)
  return true unless url.scheme

  url.path = "/" if url.path == ""
  # response = Net::HTTP.get_response(URI(uri_str))
  response = false

  Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == "https") do |http|
    response = http.request_head(url.path)
  end

  case response
  when Net::HTTPMethodNotAllowed, Net::HTTPServiceUnavailable
    unless /amazon\.com/ =~ url.host
      SL.add_error("link validation", "Validation blocked: #{uri_str} (#{e})")
    end
    SL.notify("Error validating", uri_str)
    true
  when Net::HTTPSuccess
    true
  when Net::HTTPRedirection
    location = response["location"]
    valid_link?(location, limit - 1)
  else
    SL.notify("Error validating", uri_str)
    false
  end
rescue StandardError => e
  SL.notify("Error validating", uri_str)
  SL.add_error("link validation", "Possibly invalid => #{uri_str} (#{e})")
  true
end
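
A brief usage sketch (the outcome depends on the live server's response):

  SL::URL.valid_link?("https://example.com/")
  # => true when the HEAD request (following up to 5 redirects) ends in a 200;
  #    most error responses return false, while blocked HEAD requests (405/503)
  #    and network errors are treated as valid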