Module: SL::URL

Included in:
SL
Defined in:
lib/searchlink/url.rb

Overview

URL module: helpers for validating URLs, following redirects, retrieving page titles, and building Amazon affiliate links.

Class Method Summary

Class Method Details

.amazon_affiliatize(url, amazon_partner) ⇒ Object



# File 'lib/searchlink/url.rb', line 110

def amazon_affiliatize(url, amazon_partner)
  return url if amazon_partner.nil? || amazon_partner.empty?

  return [url, ""] unless url =~ %r{https?://(?<subdomain>.*?)amazon.com/(?:(?<title>.*?)/)?(?<type>[dg])p/(?<id>[^?]+)}

  m = Regexp.last_match
  sd = m["subdomain"]
  title = m["title"].gsub(/-/, " ")
  t = m["type"]
  id = m["id"]
  ["https://#{sd}amazon.com/#{t}p/#{id}/?ref=as_li_ss_tl&ie=UTF8&linkCode=sl1&tag=#{amazon_partner}", title]
end
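
For illustration, a call with a made-up ASIN and partner tag might look like:

  SL::URL.amazon_affiliatize("https://www.amazon.com/Some-Product-Name/dp/B000000000", "mytag-20")
  # => ["https://www.amazon.com/dp/B000000000/?ref=as_li_ss_tl&ie=UTF8&linkCode=sl1&tag=mytag-20", "Some Product Name"]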

.follow_redirects(url, limit = 5) ⇒ Object



# File 'lib/searchlink/url.rb', line 8

def follow_redirects(url, limit = 5)
  return url if limit.zero?

  uri = URI.parse(url)
  response = Net::HTTP.get_response(uri)

  case response
  when Net::HTTPSuccess
    response.uri.to_s
  when Net::HTTPRedirection
    follow_redirects(response["location"], limit - 1)
  else
    url
  end
end
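
As a sketch, resolving a hypothetical shortened link:

  SL::URL.follow_redirects("https://amzn.to/example123")
  # returns the final resolved URL after following up to 5 redirects;
  # non-redirect errors fall back to the URL reached at that point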

.only_url?(input) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/searchlink/url.rb', line 68

def only_url?(input)
  input =~ %r{(?i)^((http|https)://)?([\w\-_]+(\.[\w\-_]+)+)([\w\-.,@?^=%&:/~+#]*[\w\-@^=%&/~+#])?$}
end
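
The match returns an index (truthy) or nil, so it can be used directly in conditionals. For example, with made-up inputs:

  SL::URL.only_url?("example.com/some/page")  # => truthy (looks like a bare URL)
  SL::URL.only_url?("not a bare url")         # => nil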

.ref_title_for_url(url) ⇒ Object



# File 'lib/searchlink/url.rb', line 72

def ref_title_for_url(url)
  url = URI.parse(url) if url.is_a?(String)

  parts = url.hostname.split(/\./)
  domain = if parts.count > 1
      parts.slice(-2, 1).join("")
    else
      parts.join("")
    end

  path = url.path.split(%r{/}).last
  if path
    path = path.gsub(/-/, " ").sub(/\.\w{2,4}$/, "")
  else
    path = domain
  end

  path.length > domain.length ? path : domain
end
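
A rough example with a made-up URL:

  SL::URL.ref_title_for_url("https://www.example.com/my-sample-post.html")
  # => "my sample post" (the path-derived title wins because it is longer than "example")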

.title(url) ⇒ Object



# File 'lib/searchlink/url.rb', line 123

def title(url)
  title = nil

  ## Gather proving too inexact
  # gather = false
  # ['/usr/local/bin', '/opt/homebrew/bin'].each do |root|
  #   if File.exist?(File.join(root, 'gather')) && File.executable?(File.join(root, 'gather'))
  #     gather = File.join(root, 'gather')
  #     break
  #   end
  # end

  # if gather
  #   cmd = %(#{gather} --title-only '#{url.strip}' --fallback-title 'Unknown')
  #   title = SL::Util.exec_with_timeout(cmd, 15)
  #   if title
  #     title = title.strip.gsub(/\n+/, ' ').gsub(/ +/, ' ')
  #     title.remove_seo!(url) if SL.config['remove_seo']
  #     return title.remove_protocol
  #   else
  #     SL.add_error('Error retrieving title', "Gather timed out on #{url}")
  #     SL.notify('Error retrieving title', 'Gather timed out')
  #   end
  # end

  begin
    if url =~ %r{https://(amzn.to|(www\.)?amazon\.com)/}
      final_url = follow_redirects(url)
      m = final_url.match(%r{https://www.amazon.com/(.*?)/dp/})
      title = if m
          m[1].gsub(/-/, " ")
        else
          url.remove_protocol
        end
      return title
    end

    page = Curl::Html.new(url)

    title = page.title

    if title.nil? || title =~ /^\s*$/
      SL.add_error("Title not found", "Warning: missing title for #{url.strip}")
      title = url.gsub(%r{(^https?://|/.*$)}, "").gsub(/-/, " ").strip
    else
      title = title.gsub(/\n/, " ").gsub(/\s+/, " ").strip # .sub(/[^a-z]*$/i,'')
      title.remove_seo!(url) if SL.config["remove_seo"]
    end
    title.gsub!(/\|/, "")
    title.remove_seo!(url.strip) if SL.config["remove_seo"]
    title.remove_protocol
  rescue StandardError
    SL.add_error("Error retrieving title", "Error determining title for #{url.strip}")
    url.remove_protocol
  end
end
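
A minimal usage sketch (the actual result depends on the page markup and the remove_seo setting):

  SL::URL.title("https://example.com/article")
  # fetches the page with Curl::Html and returns its cleaned <title> text,
  # falling back to a hostname-derived string when no title can be found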

.url?(input) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/searchlink/url.rb', line 64

def url?(input)
  input =~ %r{^(#.*|https?://\S+|/\S+|\S+/|[^!]\S+\.\S+)(\s+".*?")?$}
end
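
Like only_url?, this returns a match index or nil. A couple of hypothetical inputs:

  SL::URL.url?('https://example.com/page "A title"')  # => truthy
  SL::URL.url?("plain text, no link here")            # => nil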


.url_to_link(url, type) ⇒ Object

# File 'lib/searchlink/url.rb', line 92

def url_to_link(url, type)
  input = url.dup

  if only_url?(input)
    input.sub!(%r{(?mi)^(?!https?://)(.*?)$}, 'https://\1')
    url = URI.parse(input.downcase)

    title = if type == :ref_title
        ref_title_for_url(url)
      else
        title(url.to_s) || input.sub(%r{^https?://}, "")
      end

    return [url.to_s, title] if url.hostname
  end
  false
end
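
An illustrative call with a bare domain-and-path string:

  SL::URL.url_to_link("example.com/sample-page", :ref_title)
  # => ["https://example.com/sample-page", "sample page"]
  SL::URL.url_to_link("not a url", :ref_title)
  # => false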

.valid_link?(uri_str, limit = 5) ⇒ Boolean

Validates that a link exists and returns 200

Returns:

  • (Boolean)


# File 'lib/searchlink/url.rb', line 25

def valid_link?(uri_str, limit = 5)
  return false unless uri_str

  SL.notify("Validating", uri_str)
  return false if limit.zero?

  url = URI(uri_str)
  return true unless url.scheme

  url.path = "/" if url.path == ""
  # response = Net::HTTP.get_response(URI(uri_str))
  response = false

  Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == "https") do |http|
    response = http.request_head(url.path)
  end

  case response
  when Net::HTTPMethodNotAllowed, Net::HTTPServiceUnavailable
    unless /amazon\.com/ =~ url.host
      SL.add_error("link validation", "Validation blocked: #{uri_str} (#{e})")
    end
    SL.notify("Error validating", uri_str)
    true
  when Net::HTTPSuccess
    true
  when Net::HTTPRedirection
    location = response["location"]
    valid_link?(location, limit - 1)
  else
    SL.notify("Error validating", uri_str)
    false
  end
rescue StandardError => e
  SL.notify("Error validating", uri_str)
  SL.add_error("link validation", "Possibly invalid => #{uri_str} (#{e})")
  true
end
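
A brief usage sketch (the outcome depends on the live server's response):

  SL::URL.valid_link?("https://example.com/")
  # => true when the HEAD request (following up to 5 redirects) ends in a 200;
  #    most error responses return false, while blocked HEAD requests (405/503)
  #    and network errors are treated as valid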