Module: SL::URL

Included in:
SL
Defined in:
lib/searchlink/url.rb

Overview

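URL module: helpers for validating links, recognizing URL-like input, deriving link titles, and building Amazon affiliate links.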

Class Method Summary

Class Method Details

.amazon_affiliatize(url, amazon_partner) ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/searchlink/url.rb', line 94

# Rewrites an Amazon product URL as an affiliate link.
#
# @param url [String] URL to convert
# @param amazon_partner [String, nil] affiliate tag appended as the +tag+ query parameter
# @return [String] +url+ unchanged when +amazon_partner+ is nil or empty
# @return [Array(String, String)] +[affiliate_url, title]+ for a recognized
#   product link, or +[url, ""]+ when +url+ is not a recognized product link
def amazon_affiliatize(url, amazon_partner)
  return url if amazon_partner.nil? || amazon_partner.empty?

  pattern = %r{https?://(?<subdomain>.*?)amazon\.com/(?:(?<title>.*?)/)?(?<type>[dg])p/(?<id>[^?]+)}
  m = pattern.match(url)
  return [url, ""] unless m

  # The title segment is optional (e.g. https://www.amazon.com/dp/ID), so the
  # capture can be nil; fall back to an empty title instead of raising.
  title = (m["title"] || "").tr("-", " ")
  subdomain = m["subdomain"]
  type = m["type"]
  id = m["id"]

  ["https://#{subdomain}amazon.com/#{type}p/#{id}/?ref=as_li_ss_tl&ie=UTF8&linkCode=sl1&tag=#{amazon_partner}", title]
end

.only_url?(input) ⇒ Boolean

Returns:

  • (Boolean)


52
53
54
# File 'lib/searchlink/url.rb', line 52

# Tests whether a line of +input+ is nothing but a URL (protocol optional).
#
# @param input [String] text to test
# @return [Integer, nil] index of the match (truthy) or nil when nothing matches
def only_url?(input)
  bare_url = %r{(?i)^((http|https)://)?([\w\-_]+(\.[\w\-_]+)+)([\w\-.,@?^=%&amp;:/~+#]*[\w\-@^=%&amp;/~+#])?$}
  input =~ bare_url
end

.ref_title_for_url(url) ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/searchlink/url.rb', line 56

# Derives a short reference title from a URL.
#
# Candidates are the second-to-last hostname label (the whole hostname when it
# has a single label) and the last path segment with hyphens turned into
# spaces and a trailing 2-4 character extension removed. The path-based
# candidate wins only when it is strictly longer than the domain candidate.
#
# @param url [String, URI] URL to derive a title from; Strings are parsed
# @return [String] the chosen title
def ref_title_for_url(url)
  url = URI.parse(url) if url.is_a?(String)

  # to_s guards against URIs without a host/path (nil) instead of raising.
  labels = url.hostname.to_s.split(".")
  domain = labels.count > 1 ? labels[-2] : labels.join

  segment = url.path.to_s.split("/").last
  return domain unless segment

  # Non-bang methods: gsub! returns nil when nothing is replaced, which broke
  # the previous chained call for any segment without a hyphen.
  slug = segment.tr("-", " ").sub(/\.\w{2,4}\z/, "")

  slug.length > domain.length ? slug : domain
end

.title(url) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/searchlink/url.rb', line 107

# Retrieves a display title for the page at +url+.
#
# The page is loaded through Curl::Html. A missing or whitespace-only title is
# reported via SL.add_error and replaced by the URL with its protocol and path
# stripped and hyphens turned into spaces. Otherwise newlines and whitespace
# runs are collapsed. Pipe characters are removed, SEO text is stripped when
# SL.config["remove_seo"] is set, and the result is passed through
# String#remove_protocol.
#
# Any StandardError raised along the way is reported (SL.add_error plus a
# warning on stderr) and +url+ without its protocol is returned instead.
#
# NOTE: an earlier implementation shelled out to the `gather` CLI; it was
# disabled because its results proved too inexact.
#
# @param url [String] address of the page
# @return [Object] result of #remove_protocol on the title, or on +url+ on failure
def title(url)
  page_title = Curl::Html.new(url).title || nil

  if page_title.nil? || page_title =~ /^\s*$/
    SL.add_error("Title not found", "Warning: missing title for #{url.strip}")
    page_title = url.gsub(%r{(^https?://|/.*$)}, "").tr("-", " ").strip
  else
    page_title = page_title.tr("\n", " ").gsub(/\s+/, " ").strip
    page_title.remove_seo!(url) if SL.config["remove_seo"]
  end

  page_title.delete!("|")
  page_title.remove_seo!(url.strip) if SL.config["remove_seo"]
  page_title.remove_protocol
rescue StandardError
  SL.add_error("Error retrieving title", "Error determining title for #{url.strip}")
  warn "Error retrieving title for #{url.strip}"
  url.remove_protocol
end

.url?(input) ⇒ Boolean

Returns:

  • (Boolean)


48
49
50
# File 'lib/searchlink/url.rb', line 48

# Tests whether +input+ looks like a link target: an anchor (#...), an
# http(s) URL, a path starting or ending with a slash, or a dotted name not
# starting with "!", optionally followed by a double-quoted title.
#
# @param input [String] text to test
# @return [Integer, nil] index of the match (truthy) or nil when nothing matches
def url?(input)
  link_like = %r{^(#.*|https?://\S+|/\S+|\S+/|[^!]\S+\.\S+)(\s+".*?")?$}
  input =~ link_like
end


76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/searchlink/url.rb', line 76

# Turns a bare URL into a +[url, title]+ pair.
#
# Input that is not solely a URL (per only_url?) yields false. Otherwise a
# missing protocol is filled in as https://, the URL is downcased and parsed,
# and a title is chosen: ref_title_for_url when +type+ is :ref_title, else the
# fetched page title, falling back to the input without its protocol.
#
# @param url [String] candidate URL text
# @param type [Symbol] :ref_title selects the URL-derived title
# @return [Array(String, Object), false] +[url, title]+ when the parsed URL
#   has a hostname, false otherwise
def url_to_link(url, type)
  input = url.dup
  return false unless only_url?(input)

  input.sub!(%r{(?mi)^(?!https?://)(.*?)$}, 'https://\1')
  parsed = URI.parse(input.downcase)

  if type == :ref_title
    link_title = ref_title_for_url(parsed)
  else
    link_title = title(parsed.to_s) || input.sub(%r{^https?://}, "")
  end

  parsed.hostname ? [parsed.to_s, link_title] : false
end

.valid_link?(uri_str, limit = 5) ⇒ Boolean

Validates that a link exists and returns 200

Returns:

  • (Boolean)


9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/searchlink/url.rb', line 9

# Validates that a link exists and returns 200
#
# Sends a HEAD request and follows redirects up to +limit+ hops. Responses of
# 405 (Method Not Allowed) and 503 (Service Unavailable) are treated as valid
# because the server refused the check rather than the link being bad; this is
# logged as an error unless the host matches amazon.com. Any exception raised
# during validation is logged and the link is given the benefit of the doubt.
#
# @param uri_str [String, nil] link to validate
# @param limit [Integer] number of redirects still allowed
# @return [Boolean] false for nil input, exhausted redirect limit, or a
#   non-success response; true otherwise (including schemeless input)
def valid_link?(uri_str, limit = 5)
  return false unless uri_str

  SL.notify("Validating", uri_str)
  return false if limit.zero?

  url = URI(uri_str)
  # Nothing to request without a scheme; treated as valid.
  return true unless url.scheme

  url.path = "/" if url.path == ""
  # Keep the query string in the request where the URI type supports it, so
  # the resource actually linked to is the one validated.
  target = url.respond_to?(:request_uri) ? url.request_uri : url.path
  response = false

  Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == "https") do |http|
    response = http.request_head(target)
  end

  case response
  when Net::HTTPMethodNotAllowed, Net::HTTPServiceUnavailable
    unless /amazon\.com/ =~ url.host
      # Report the HTTP status; no exception exists in this branch to report.
      SL.add_error("link validation", "Validation blocked: #{uri_str} (#{response.code} #{response.message})")
    end
    SL.notify("Error validating", uri_str)
    true
  when Net::HTTPSuccess
    true
  when Net::HTTPRedirection
    location = response["location"]
    valid_link?(location, limit - 1)
  else
    SL.notify("Error validating", uri_str)
    false
  end
rescue StandardError => e
  SL.notify("Error validating", uri_str)
  SL.add_error("link validation", "Possibly invalid => #{uri_str} (#{e})")
  true
end