Method: ContentLinkParser#initialize
- Defined in:
- lib/content_link_parser.rb
#initialize(url, content, options = {}) ⇒ ContentLinkParser
Parses the content and absolutizes the URLs based on url. Options can be set up to determine which links are extracted.
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/content_link_parser.rb', line 8
#
# Parses +content+ as HTML and records the URL that extracted links are
# resolved against, together with the tag/attribute pairs that define
# each category of link.
#
# NOTE(review): the published listing lost the +options+ identifiers; the
# instance-variable name @options was reconstructed here — confirm against
# the repository.
#
# @param url [#to_s] address the content was fetched from; replaced by the
#   document's <base href> value when one is present and non-empty
# @param content [String] HTML markup, handed to Nokogiri::HTML
# @param options [Hash] optional settings read by this method:
#   * :ignore_default_tags - when truthy, the built-in tag table is dropped
#   * :additional_tags     - Hash merged over the tag table (a category key
#     that already exists is replaced, not appended to)
#   Any :tags entry supplied by the caller is overwritten.
def initialize(url, content, options = {})
  # Work on a copy so the defaults written below never leak into the
  # caller's hash.
  @options = {}.merge(options)
  @url = url
  @doc = Nokogiri::HTML(content)

  # A <base href="..."> element takes over as the reference URL. An empty
  # href carries no information, so the original url is kept in that case.
  base_tag = @doc.at("base[href]")
  if base_tag
    base_href = base_tag.attr("href").to_s
    @url = base_href unless base_href.empty?
  end

  # Each category maps to [selector, attribute] pairs; the attribute slot
  # may instead hold a lambda(array, tag) that appends URLs to array.
  @options[:tags] = {}
  @options[:tags][:links] = [
    ["a[href]", "href"],
    ["frame[src]", "src"],
    ["meta[@http-equiv=\"refresh\"]", "content"],
    ["link[href]:not([rel])", "href"],
    ["area[href]", "href"]
  ]
  @options[:tags][:images] = [["img[src]", "src"]]
  @options[:tags][:related] = [["link[rel]", "href"]]
  @options[:tags][:scripts] = [["script[src]", "src"]]
  @options[:tags][:styles] = [
    ["link[rel='stylesheet'][href]", "href"],
    ["style[@type^='text/css']", lambda { |array, tag|
      # Collect every url(...) reference inside an inline stylesheet; the
      # first capture is the optional quote, the second is the address.
      first_regex = /url\((['"]?)(.*?)\1\)/
      tag.content.scan(first_regex) do |match|
        array << Addressable::URI.parse(match[1]).to_s
      end
    }]
  ]

  # clear the default tags if required
  @options[:tags] = {} if @options[:ignore_default_tags]

  additional_tags = @options[:additional_tags]
  @options[:tags].merge!(additional_tags) unless additional_tags.nil?
end