Class: Rack::Httperflog::HtmlPathsExtractor

Inherits:
Struct
  • Object
show all
Defined in:
lib/rack/httperflog.rb

Overview

Extract each path linked inside the source html document

HtmlPathsExtractor.new(source)

source

an html document

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#source ⇒ Object

Returns the value of attribute source

Returns:

  • (Object)

    the current value of source



93
94
95
# File 'lib/rack/httperflog.rb', line 93

# Reader for the +source+ attribute: returns whatever is currently held in
# @source.
# NOTE(review): presumably the html document passed to
# HtmlPathsExtractor.new(source) — confirm against the class definition,
# which is not visible here.
def source
  @source
end

Instance Method Details

#attributes_from_nodes(attribute_name, node_type) ⇒ Object

Extracts the attribute attribute_name from all the html nodes of type node_type. Returns only non-empty attributes that do not start with http (a quick way to avoid paths that point to an external host).



112
113
114
# File 'lib/rack/httperflog.rb', line 112

def attributes_from_nodes(attribute_name, node_type)
  (parsed_body / node_type).map { |node| node[attribute_name].to_s.strip }.select { |path| path =~ /\S/ && path !~ /^http:\/\// }
end

#parsed_body ⇒ Object

parse the source html



105
106
107
# File 'lib/rack/httperflog.rb', line 105

# Parsed form of the source html, memoized in @parsed_body.
#
# The first call converts +source+ to a string and hands it to Nokogiri();
# later calls return the cached result.
def parsed_body
  return @parsed_body if @parsed_body

  html = source.to_s
  @parsed_body = Nokogiri(html)
end

#paths ⇒ Object

return all link, script, img and iframe paths found in the html document



96
97
98
99
100
101
102
# File 'lib/rack/httperflog.rb', line 96

# All link, script, img and iframe paths found in the html document.
#
# Results are concatenated in a fixed order: link hrefs first, then script,
# img and iframe srcs.
def paths
  # [attribute, node type] pairs, in the order their results are emitted.
  lookups = [
    %w[href link],
    %w[src script],
    %w[src img],
    %w[src iframe]
  ]

  lookups.flat_map { |attribute, tag| attributes_from_nodes(attribute, tag) }
end