Class: Domainatrix::DomainParser

Inherits:

Object

Object
Domainatrix::DomainParser

show all

Includes: Addressable

Defined in: lib/domainatrix/domain_parser.rb

Constant Summary collapse

# Matches a URI scheme that is exactly "http" or "https".
# Anchored with \A and \z (whole string) rather than ^ and $ (per line), so a
# multi-line value such as "javascript\nhttp" cannot pass validation.
VALID_SCHEMA = /\Ahttps?\z/

Instance Attribute Summary collapse

#approved_sections ⇒ Object readonly

Returns the value of attribute approved_sections.
#found_sections ⇒ Object readonly

Returns the value of attribute found_sections.
#public_suffixes ⇒ Object readonly

Returns the value of attribute public_suffixes.

Class Method Summary collapse

.parse(url) ⇒ Object

Instance Method Summary collapse

#initialize(file_name, approved_sections = (Array.new << "*")) ⇒ DomainParser constructor

A new instance of DomainParser.
#parse(url) ⇒ Object
#parse_domains_from_host(host) ⇒ Object
#read_dat_file(file_name) ⇒ Object
#split_domain(parts, tld_size) ⇒ Object

Constructor Details

#initialize(file_name, approved_sections = (Array.new << "*")) ⇒ `DomainParser`

Returns a new instance of DomainParser.

# File 'lib/domainatrix/domain_parser.rb', line 15

# Builds a parser and immediately loads suffix rules from +file_name+.
#
# file_name         - path of the suffix data file handed to read_dat_file.
# approved_sections - section names whose rules should be loaded; the
#                     default ["*"] is treated by read_dat_file as
#                     "every section".
def initialize(file_name, approved_sections = (Array.new << "*"))
  # State must exist before read_dat_file runs, because it fills these in.
  @approved_sections = approved_sections
  @found_sections = []
  @public_suffixes = {}

  read_dat_file(file_name)
end

Instance Attribute Details

#approved_sections ⇒ `Object` (readonly)

Returns the value of attribute approved_sections.



8
9
10

# File 'lib/domainatrix/domain_parser.rb', line 8

# Section filter supplied to the constructor. read_dat_file only loads rules
# from sections named here, or from every section when it contains "*".
def approved_sections
  @approved_sections
end

#found_sections ⇒ `Object` (readonly)

Returns the value of attribute found_sections.



8
9
10

# File 'lib/domainatrix/domain_parser.rb', line 8

# Section names collected by read_dat_file from "// ===BEGIN ...===" marker
# lines, in the order they appear in the file (recorded whether or not the
# section is approved).
def found_sections
  @found_sections
end

#public_suffixes ⇒ `Object` (readonly)

Returns the value of attribute public_suffixes.



8
9
10

# File 'lib/domainatrix/domain_parser.rb', line 8

# Nested hash of suffix rules built by read_dat_file: each rule line is split
# on "." and stored from its rightmost label inward (e.g. "co.uk" becomes
# {"uk" => {"co" => {}}}).
def public_suffixes
  @public_suffixes
end

Class Method Details

.parse(url) ⇒ `Object`



11
12
13

# File 'lib/domainatrix/domain_parser.rb', line 11

# Convenience entry point: builds a parser from the effective_tld_names.dat
# file located one directory above this source file and parses +url+ with it.
# A new parser (and therefore a fresh read of the data file) is created on
# every call.
def self.parse(url)
  dat_path = "#{File.dirname(__FILE__)}/../effective_tld_names.dat"
  new(dat_path).parse(url)
end

Instance Method Details

#parse(url) ⇒ `Object`

Raises:

(ParseError)

# File 'lib/domainatrix/domain_parser.rb', line 53

# Parses +url+ into a hash describing its scheme, host, path and domain parts.
#
# url - the URL string; "http://" is prepended when it contains no "://".
#       The whole URL is downcased before parsing.
#
# Returns {} when +url+ is nil or blank. Otherwise returns the hash produced
# by parse_domains_from_host (or a fixed hash for the host "localhost")
# merged with :scheme, :host, :path (path plus any "?query" and "#fragment")
# and :url (the normalized URL string).
#
# Raises ParseError when Addressable cannot parse the URL, when the scheme
# does not match VALID_SCHEMA, or when the URL has no host.
def parse(url)
  return {} unless url && url.strip != ''

  url = "http://#{url}" unless url[/:\/\//]
  url = url.downcase

  uri = begin
    Addressable::URI.parse(url)
  rescue Addressable::URI::InvalidURIError
    nil
  end

  raise ParseError, "URL is not parsable by Addressable::URI" unless uri
  url = uri.normalize.to_s
  raise ParseError, "URL does not have valid scheme" unless uri.scheme =~ VALID_SCHEMA
  raise ParseError, "URL does not have a valid host" if uri.host.nil?

  # Work on a copy: appending to the string returned by uri.path would mutate
  # the URI object's own state, and would raise if that string were frozen.
  path = uri.path.dup
  path << "?#{uri.query}" if uri.query
  path << "##{uri.fragment}" if uri.fragment

  uri_hash =
    if uri.host == 'localhost'
      { :public_suffix => '', :domain => 'localhost', :subdomain => '' }
    else
      # host is guaranteed non-nil by the guard above
      parse_domains_from_host(uri.host)
    end

  uri_hash.merge({
    :scheme => uri.scheme,
    :host   => uri.host,
    :path   => path,
    :url    => url
  })
end

#parse_domains_from_host(host) ⇒ `Object`

# File 'lib/domainatrix/domain_parser.rb', line 108

# Splits +host+ into public suffix, registrable domain and subdomain using
# the rules loaded into @public_suffixes.
#
# host - host name string, e.g. "www.example.co.uk"; "*" and dotted numeric
#        hosts are handled specially.
#
# Returns {} when +host+ is nil. Otherwise returns a hash with
# :public_suffix, :domain, :subdomain and :ip_address. For a host made only
# of 1-3 digit labels, :ip_address is true, :domain is the whole host and the
# other two are empty strings.
#
# Raises ParseError when the host has fewer than two labels (and is not "*"
# or numeric), when its last label is not a known top-level suffix, or when
# split_domain rejects the computed suffix size.
def parse_domains_from_host(host)
  return {} unless host

  # Labels in reverse order, e.g. "www.pauldix.com" => ["com", "pauldix", "www"]
  parts = host.split(".").reverse

  if host == '*'
    tld_size = 0
  elsif !parts.empty? && parts.all? { |part| part =~ /\A\d{1,3}\z/ }
    # host is an ip address. The non-empty check matters: with no labels at
    # all (host "" or ".") the all-numeric test would be vacuously true.
    return {:public_suffix => '', :domain => host, :subdomain => '', :ip_address => true}
  else
    raise ParseError, "Invalid URL" if parts.size < 2

    main_tld = parts.first
    tld_size = 1

    if main_tld != '*'
      # PunyCode and new any-domain TLDs make a character check on the TLD
      # unreliable, so validity is decided solely by the DAT file rules.
      current_suffixes = @public_suffixes[main_tld]
      raise ParseError, "Invalid main TLD: #{main_tld}" unless current_suffixes

      parts.each_index do |i|
        # no extra rules found (eg domain.net)
        break if current_suffixes.empty?

        next_part = parts[i + 1]

        if current_suffixes.has_key?("!#{next_part}")
          # exception tld domain found (eg metro.tokyo.jp)
          break
        elsif current_suffixes.has_key?(next_part)
          # valid extra domain level found (eg co.uk)
          tld_size += 1
          current_suffixes = current_suffixes[next_part]
        elsif current_suffixes.has_key?('*')
          # wildcard domain level (eg *.jp)
          tld_size += 1
          break
        else
          # no extra rules found (eg domain.net)
          break
        end
      end
    end
  end

  subdomain, domain, tld = split_domain(parts, tld_size)

  {:public_suffix => tld, :domain => domain, :subdomain => subdomain, :ip_address => false}
end

#read_dat_file(file_name) ⇒ `Object`

# File 'lib/domainatrix/domain_parser.rb', line 22

# Loads suffix rules from the data file at +file_name+ into @public_suffixes.
#
# Every "// ===BEGIN <name>===" marker starts a section and its name is
# appended to @found_sections. Rule lines (non-empty, not starting with "//")
# are stored only while the current section is listed in @approved_sections,
# or always when @approved_sections contains "*". Each rule is split on "."
# and inserted from its rightmost label inward as nested hashes.
#
# A "// ===BEGIN" line without the closing "===" is treated as an ordinary
# comment and leaves the current section unchanged.
#
# The file is opened in block form so the handle is closed even if reading
# raises. The return value is not meaningful to callers.
def read_dat_file(file_name)
  # On Ruby 1.9+ the file must be read as UTF-8; 1.8 has no encoding modes.
  mode = RUBY_VERSION >= '1.9' ? "r:UTF-8" : "r"
  section = ""

  File.open(file_name, mode) do |dat_file|
    dat_file.each_line do |line|
      line = line.strip

      # e.g. "// ===BEGIN ICANN DOMAINS==="
      if (marker = /^\/\/ ===BEGIN(.*)===/.match(line))
        section = marker[1].strip
        @found_sections << section
      end

      next unless @approved_sections.include?(section) || @approved_sections.include?("*")
      next if line.empty? || line =~ /^\/\//

      sub_hash = @public_suffixes
      line.split(".").reverse.each do |part|
        sub_hash = (sub_hash[part] ||= {})
      end
    end
  end
end

#split_domain(parts, tld_size) ⇒ `Object`

# File 'lib/domainatrix/domain_parser.rb', line 88

# Divides reversed host labels into [subdomain, domain, tld].
#
# parts    - host labels in reverse order, e.g. ["com", "pauldix", "www"].
# tld_size - number of trailing host labels that form the public suffix.
#
# Returns a three-element array of strings. A single label with a suffix
# size of 0 yields ['', '*', ''].
#
# Raises ParseError when the suffix would consume every label, leaving
# nothing for the domain.
def split_domain(parts, tld_size)
  return ['', '*', ''] if parts.size == 1 && tld_size == 0

  # Back to natural order, e.g. ["www", "pauldix", "com"]
  domain_parts = parts.reverse
  suffix_start = domain_parts.size - tld_size

  if suffix_start <= 0
    raise ParseError, "Invalid TLD size found for #{domain_parts.join('.')}: #{tld_size}"
  end

  tld = domain_parts[suffix_start..-1].join('.')
  domain = domain_parts[suffix_start - 1]
  subdomain = domain_parts[0...(suffix_start - 1)].join('.')

  [subdomain, domain, tld]
end

Class: Domainatrix::DomainParser

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_name, approved_sections = (Array.new << "*")) ⇒ DomainParser

Instance Attribute Details

#approved_sections ⇒ Object (readonly)

#found_sections ⇒ Object (readonly)

#public_suffixes ⇒ Object (readonly)

Class Method Details

.parse(url) ⇒ Object