Class: NetworkProfile::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/network_profile/extractor.rb

Constant Summary collapse

AUTO_LINK_RE =
%r{
  (?: ((?:ed2k|ftp|http|https|irc|mailto|news|gopher|nntp|telnet|webcal|xmpp|callto|feed|svn|urn|aim|rsync|tag|ssh|sftp|rtsp|afs|file):)// | www\. )
  [^\s<\u00A0"]+
}ix.freeze
WORD_PATTERN =
'\p{Word}'.freeze
BRACKETS =
{ ']' => '[', ')' => '(', '}' => '{' }.freeze
TLD =
/(?<tld>com|de|net|fr|at|ch|info)/.freeze
HOST_PART =
%r{(?<host>[a-z\-\.0-9]+)}.freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ Extractor

Returns a new instance of Extractor.



16
17
18
# File 'lib/network_profile/extractor.rb', line 16

# Stores the text that the other instance methods read via @string.
#
# @param string [String] text to search for links (presumably free-form
#   profile text; the value is stored as given, without validation)
def initialize(string)
  @string = string
end

Class Method Details

.call(string) ⇒ Object



12
13
14
# File 'lib/network_profile/extractor.rb', line 12

# Convenience entry point: instantiates the extractor for +string+ and
# returns whatever #extracted_links! produces for it.
#
# @param string [String] text handed to the constructor
# @return [Object] the result of #extracted_links! on the new instance
def self.call(string)
  extractor = new(string)
  extractor.extracted_links!
end

Instance Method Details

#extracted_links! ⇒ Object



20
21
22
23
24
25
26
27
28
# File 'lib/network_profile/extractor.rb', line 20

# Runs every entry of #links through NetworkProfile.parse and returns the
# non-nil results.
#
# Parsing is best-effort: a link whose parse raises a StandardError is
# skipped. The failure is reported with +warn+ (stderr) rather than being
# printed to stdout, so the caller's standard output stays clean.
#
# @return [Array] parse results with nil entries (failed or empty parses)
#   removed
def extracted_links!
  parsed = links.map do |link|
    NetworkProfile.parse(link)
  rescue StandardError => e
    warn "NetworkProfile::Extractor: skipping #{link.inspect} (#{e.class}: #{e.message})"
    nil
  end
  parsed.compact
end


30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/network_profile/extractor.rb', line 30

# Collects every AUTO_LINK_RE match in #mapped_string, trims trailing
# punctuation from each match, and memoizes the de-duplicated list in @links.
#
# The memoized value is the unique list itself, so repeated calls return the
# same result as the first call.
#
# @return [Array<String>] unique links in order of first appearance; matches
#   without a scheme (bare "www." hits) are prefixed with "https://"
def links
  return @links if @links

  # Built once per call instead of once per loop iteration.
  # NOTE(review): "/-=" inside the character class is a range ('/' through
  # '='), so trailing digits, ':', ';', '<' and '=' are kept while a trailing
  # '-' is stripped — confirm that this is intended.
  trailing_punctuation = %r{[^#{WORD_PATTERN}/-=&]$}

  found = []
  mapped_string.scan(AUTO_LINK_RE) do
    scheme = Regexp.last_match(1)
    href = Regexp.last_match(0)
    punctuation = []

    # Strip one trailing punctuation character at a time.
    while href.sub!(trailing_punctuation, '')
      punctuation.push(Regexp.last_match(0))
      opening = BRACKETS[punctuation.last]

      # A closing bracket that balances an opening bracket still inside the
      # link belongs to the URL: put it back and stop trimming.
      if opening && href.scan(opening).size > href.scan(punctuation.last).size
        href << punctuation.pop
        break
      end
    end

    href = "https://#{href}" unless scheme
    found << href
  end

  @links = found.uniq
end

#mapped_string ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
# File 'lib/network_profile/extractor.rb', line 54

# Returns a copy of @string in which loosely written hosts are normalised so
# that AUTO_LINK_RE can pick them up:
#
# 1. " host.tld/" (space-prefixed, no scheme) gets an "https://" scheme.
# 2. " www . host . tld<path>" (a "www" host written with stray spaces) is
#    collapsed to "www.host.tld<path>", with all spaces removed from the
#    path portion.
#
# The leading space that both patterns consume is written back, so the
# rewritten URL is not glued onto the preceding token (which would merge two
# adjacent links into one). Spaces are removed with stdlib String#delete, so
# no ActiveSupport core extension is required. A nil @string is treated as
# an empty string.
#
# @return [String] the normalised text
def mapped_string
  @string.to_s.
    gsub(%r{ (#{HOST_PART}\.#{TLD}/)}) {
      match = Regexp.last_match
      " https://#{match['host']}.#{match['tld']}/"
    }.
    gsub(%r{ www *\. +#{HOST_PART} *\. *#{TLD}(?<path>[^<\u00A0"]+)}) {
      match = Regexp.last_match
      # The path capture runs up to the next '<', NBSP or '"' and may contain
      # spaces; they are all dropped.
      " www.#{match['host']}.#{match['tld']}#{match['path'].delete(' ')}"
    }
end