Class: NetworkProfile::Extractor
- Inherits:
-
Object
- Object
- NetworkProfile::Extractor
- Defined in:
- lib/network_profile/extractor.rb
Constant Summary collapse
- AUTO_LINK_RE =
%r{ (?: ((?:ed2k|ftp|http|https|irc|mailto|news|gopher|nntp|telnet|webcal|xmpp|callto|feed|svn|urn|aim|rsync|tag|ssh|sftp|rtsp|afs|file):)// | www\. ) [^\s<\u00A0"]+ }ix.freeze
- WORD_PATTERN =
'\p{Word}'.freeze
- BRACKETS =
{ ']' => '[', ')' => '(', '}' => '{' }.freeze
- TLD =
/(?<tld>com|de|net|fr|at|ch|info)/.freeze
- HOST_PART =
%r{(?<host>[a-z\-\.0-9]+)}.freeze
Class Method Summary collapse
- .call(string) ⇒ Object
Instance Method Summary collapse
- #extracted_links! ⇒ Object
-
#initialize(string) ⇒ Extractor
constructor
A new instance of Extractor.
- #links ⇒ Object
- #mapped_string ⇒ Object
Constructor Details
#initialize(string) ⇒ Extractor
Returns a new instance of Extractor.
16 17 18 |
# File 'lib/network_profile/extractor.rb', line 16
#
# Builds an extractor around the text that will be searched for links.
#
# @param string [#to_s] text to scan; +nil+ is stored as an empty string so
#   the gsub-based scanning in the other methods does not fail on it
def initialize(string)
  # String#to_s returns the receiver itself, so String arguments are stored unchanged.
  @string = string.to_s
end
Class Method Details
.call(string) ⇒ Object
12 13 14 |
# File 'lib/network_profile/extractor.rb', line 12
#
# Convenience entry point: wraps +string+ in a new extractor and hands back
# the result of #extracted_links! on it.
#
# @param string the text passed on to the constructor
# @return [Object] whatever #extracted_links! returns
def self.call(string)
  extractor = new(string)
  extractor.extracted_links!
end
Instance Method Details
#extracted_links! ⇒ Object
20 21 22 23 24 25 26 27 28 |
# File 'lib/network_profile/extractor.rb', line 20
#
# Runs every entry of #links through NetworkProfile.parse and collects the
# non-nil results.
#
# A link whose parsing raises a StandardError is skipped; the failure is
# reported on stderr (via Kernel#warn) instead of being printed to stdout.
#
# @return [Array] parse results with +nil+ entries removed
def extracted_links!
  parsed = links.map do |link|
    NetworkProfile.parse(link)
  rescue StandardError => e
    # Best effort: one unparsable link must not abort the whole extraction.
    warn "NetworkProfile::Extractor: skipping #{link.inspect} (#{e.class}: #{e.message})"
    nil
  end
  parsed.compact
end
#links ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# File 'lib/network_profile/extractor.rb', line 30
#
# Collects the distinct link strings found in #mapped_string.
#
# Every AUTO_LINK_RE match has trailing punctuation stripped. A closing
# bracket is put back when the link still contains more of the matching
# opening bracket than of that closing bracket. Matches without a scheme
# (the "www." form) are prefixed with "https://".
#
# The de-duplicated list is memoized, so repeated calls return the same
# result. The cache is only written after scanning succeeded.
#
# @return [Array<String>] unique links in order of first appearance
def links
  return @links if @links

  # Built once per call rather than once per stripped character.
  trailing_punctuation = %r{[^#{WORD_PATTERN}/-=&]$}
  found = []

  mapped_string.scan(AUTO_LINK_RE) do
    # Capture the scan match first: the sub!/scan calls below overwrite it.
    match = Regexp.last_match
    scheme = match[1]
    href = match[0]
    stripped = []

    while href.sub!(trailing_punctuation, '')
      stripped.push(Regexp.last_match(0))
      opening = BRACKETS[stripped.last]
      if opening && href.scan(opening).size > href.scan(stripped.last).size
        # The closing bracket balances one inside the link, so it belongs to it.
        href << stripped.pop
        break
      end
    end

    href = "https://#{href}" unless scheme
    found << href
  end

  @links = found.uniq
end
#mapped_string ⇒ Object
54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/network_profile/extractor.rb', line 54
#
# Returns a copy of the input text in which two loosely written URL forms
# are rewritten:
#
# 1. " host.tld/" (space, host, one of the TLD alternatives, slash) becomes
#    "https://host.tld/".
# 2. " www . host . tld<path>" written with spaces around the dots becomes
#    "www.host.tld<path>", with every space removed from the path part.
#
# NOTE(review): both patterns consume the leading space and the replacement
# does not put it back, so the rewritten URL is joined to the preceding
# text. Kept as-is here; confirm whether that is intended.
#
# @return [String] the rewritten text
def mapped_string
  bare_domain = %r{ (#{HOST_PART}\.#{TLD}/)}
  spaced_www = %r{ www *\. +#{HOST_PART} *\. *#{TLD}(?<path>[^<\u00A0"]+)}

  with_scheme = @string.gsub(bare_domain) do
    match = Regexp.last_match
    "https://#{match['host']}.#{match['tld']}/"
  end

  with_scheme.gsub(spaced_www) do
    match = Regexp.last_match
    # Core String#delete avoids needing ActiveSupport's String#remove.
    "www.#{match['host']}.#{match['tld']}#{match['path'].delete(' ')}"
  end
end