Class: ImageScraper::Client
- Inherits:
-
Object
- Object
- ImageScraper::Client
- Defined in:
- lib/image_scraper/client.rb
Instance Attribute Summary
-
#convert_to_absolute_url ⇒ Object
Returns the value of attribute convert_to_absolute_url.
-
#doc ⇒ Object
Returns the value of attribute doc.
-
#include_css_data_images ⇒ Object
Returns the value of attribute include_css_data_images.
-
#include_css_images ⇒ Object
Returns the value of attribute include_css_images.
-
#url ⇒ Object
Returns the value of attribute url.
Instance Method Summary
- #image_urls ⇒ Object
-
#initialize(url, options = {}) ⇒ Client
constructor
A new instance of Client.
- #page_images ⇒ Object
- #stylesheet_images ⇒ Object
- #stylesheets ⇒ Object
Constructor Details
#initialize(url, options = {}) ⇒ Client
Returns a new instance of Client.
6 7 8 9 10 11 12 13 14 |
# File 'lib/image_scraper/client.rb', line 6
#
# Builds a client for +url+ and immediately tries to download and parse
# the page behind it.
#
# @param url [String] address of the page to scrape; it is percent-escaped
#   before being stored in +@url+ and fetched.
# @param options [Hash] optional overrides. Recognised keys and defaults:
#   :convert_to_absolute_url (true), :include_css_images (true),
#   :include_css_data_images (false).
#
# When the page cannot be fetched, +@doc+ is set to nil instead of raising.
def initialize(url, options = {})
  defaults = {
    :convert_to_absolute_url => true,
    :include_css_images      => true,
    :include_css_data_images => false
  }
  # Non-destructive equivalent of reverse_merge!: values supplied by the
  # caller win, and the caller's hash is left untouched.
  options = options.merge(defaults) { |_key, given, _default| given }

  # URI.escape no longer exists on Ruby 3.0+; the parser's #escape is the
  # same routine and is available on older Rubies as well.
  @url = URI::DEFAULT_PARSER.escape(url)
  @convert_to_absolute_url = options[:convert_to_absolute_url]
  @include_css_images      = options[:include_css_images]
  @include_css_data_images = options[:include_css_data_images]

  # Block form closes the handle; any fetch error leaves html as nil.
  html = begin
    URI.open(@url, &:read)
  rescue StandardError
    nil
  end
  @doc = html ? Nokogiri::HTML(html, nil, 'UTF-8') : nil
end
Instance Attribute Details
#convert_to_absolute_url ⇒ Object
Returns the value of attribute convert_to_absolute_url.
4 5 6 |
# File 'lib/image_scraper/client.rb', line 4
#
# Reader for +@convert_to_absolute_url+.
#
# @return [Object] the value currently stored in +@convert_to_absolute_url+
def convert_to_absolute_url
  @convert_to_absolute_url
end
#doc ⇒ Object
Returns the value of attribute doc.
4 5 6 |
# File 'lib/image_scraper/client.rb', line 4
#
# Reader for +@doc+.
#
# @return [Object] the value currently stored in +@doc+
def doc
  @doc
end
#include_css_data_images ⇒ Object
Returns the value of attribute include_css_data_images.
4 5 6 |
# File 'lib/image_scraper/client.rb', line 4
#
# Reader for +@include_css_data_images+.
#
# @return [Object] the value currently stored in +@include_css_data_images+
def include_css_data_images
  @include_css_data_images
end
#include_css_images ⇒ Object
Returns the value of attribute include_css_images.
4 5 6 |
# File 'lib/image_scraper/client.rb', line 4
#
# Reader for +@include_css_images+.
#
# @return [Object] the value currently stored in +@include_css_images+
def include_css_images
  @include_css_images
end
#url ⇒ Object
Returns the value of attribute url.
4 5 6 |
# File 'lib/image_scraper/client.rb', line 4
#
# Reader for +@url+.
#
# @return [Object] the value currently stored in +@url+
def url
  @url
end
Instance Method Details
#image_urls ⇒ Object
16 17 18 19 20 |
# File 'lib/image_scraper/client.rb', line 16
#
# Returns the images found on the page, followed by the images found in its
# stylesheets when +include_css_images+ is truthy.
#
# @return [Array] result of +page_images+, optionally extended with the
#   result of +stylesheet_images+
def image_urls
  images = page_images
  images += stylesheet_images if include_css_images
  images
end
#page_images ⇒ Object
22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/image_scraper/client.rb', line 22
#
# Collects the +src+ of every <img> element in the parsed document.
#
# Each source is stripped and percent-escaped. When +convert_to_absolute_url+
# is truthy it is additionally passed through
# ImageScraper::Util.absolute_url together with +url+.
#
# @return [Array] one entry per <img> that has a non-blank +src+; empty when
#   +doc+ is blank
def page_images
  return [] if doc.blank?

  doc.xpath("//img").each_with_object([]) do |img, urls|
    next if img["src"].blank?

    # URI.escape no longer exists on Ruby 3.0+; the parser's #escape is the
    # same routine.
    image = URI::DEFAULT_PARSER.escape(img["src"].strip)
    image = image.gsub(/([{}|\^\[\]\@`])/) { |s| CGI.escape(s) } # escape characters that URI escaping doesn't get
    image = ImageScraper::Util.absolute_url(url, image) if convert_to_absolute_url
    urls << image
  end
end
#stylesheet_images ⇒ Object
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/image_scraper/client.rb', line 35
#
# Collects every +url(...)+ reference found in the stylesheets returned by
# +stylesheets+.
#
# Stylesheets that cannot be read are skipped. Each reference is cleaned with
# ImageScraper::Util.cleanup_url and percent-escaped. A reference containing
# "data:image" is returned untouched when +@include_css_data_images+ is
# truthy; every other reference is resolved against the stylesheet location
# when +@convert_to_absolute_url+ is truthy.
#
# @return [Array] collected references with nil entries removed
def stylesheet_images
  images = []
  stylesheets.each do |stylesheet|
    # URI.open works for URLs on Ruby 3 (plain Kernel#open no longer does),
    # and the block form closes the handle once the body has been read.
    css = begin
      URI.open(stylesheet, &:read)
    rescue StandardError
      next # best effort: ignore stylesheets that cannot be fetched
    end
    # Turn every raw byte into the code point of the same value (UTF-8 output).
    css = css.unpack("C*").pack("U*")

    found = css.scan(/url\((.*?)\)/).map do |match|
      # URI.escape no longer exists on Ruby 3.0+; the parser's #escape is the
      # same routine.
      image_url = URI::DEFAULT_PARSER.escape(ImageScraper::Util.cleanup_url(match[0]))
      image_url = image_url.gsub(/([{}|\^\[\]\@`])/) { |s| CGI.escape(s) } # escape characters that URI escaping doesn't get
      if image_url.include?("data:image") && @include_css_data_images
        image_url
      else
        # NOTE(review): data URIs also land here when @include_css_data_images
        # is false; presumably absolute_url yields nil for them so that the
        # final compact drops them - confirm against ImageScraper::Util.
        @convert_to_absolute_url ? ImageScraper::Util.absolute_url(stylesheet, image_url) : image_url
      end
    end
    images.concat(found)
  end
  images.compact
end
#stylesheets ⇒ Object
54 55 56 57 58 59 |
# File 'lib/image_scraper/client.rb', line 54
#
# Lists the stylesheets linked from the parsed document.
#
# Every <link rel="stylesheet"> +href+ is cleaned with
# ImageScraper::Util.cleanup_url, percent-escaped and resolved against +url+
# through ImageScraper::Util.absolute_url. Links that raise while being
# processed are dropped.
#
# @return [Array] resolved stylesheet locations; empty when +doc+ is blank
def stylesheets
  return [] if doc.blank?

  links = doc.xpath('//link[@rel="stylesheet"]')
  resolved = links.map do |stylesheet|
    begin
      # URI.escape no longer exists on Ruby 3.0+; calling it here used to raise
      # NoMethodError, which the rescue swallowed, dropping every stylesheet.
      href = URI::DEFAULT_PARSER.escape(ImageScraper::Util.cleanup_url(stylesheet['href']))
      ImageScraper::Util.absolute_url(url, href)
    rescue StandardError
      nil # best effort: a malformed link must not abort the whole scan
    end
  end
  resolved.compact
end