Class: ImageScraper::Client

Inherits:
Object
  • Object
show all
Defined in:
lib/image_scraper/client.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, options = {}) ⇒ Client

Returns a new instance of Client.



6
7
8
9
10
11
12
13
14
# File 'lib/image_scraper/client.rb', line 6

# Builds a client for +url+ and immediately tries to fetch and parse the page.
#
# @param url [String] address of the page; stored percent-escaped in @url
# @param options [Hash] optional flags, keyed by symbol:
#   :convert_to_absolute_url (default true),
#   :include_css_images (default true),
#   :include_css_data_images (default false)
# @return [Client]
#
# The caller's +options+ hash is left untouched. When the page cannot be
# fetched, @doc is nil instead of an exception being raised.
def initialize(url, options = {})
  defaults = {
    :convert_to_absolute_url => true,
    :include_css_images => true,
    :include_css_data_images => false
  }
  # Merge into a copy (caller-supplied values win) rather than mutating the
  # argument the way reverse_merge! did.
  settings = options.merge(defaults) { |_key, given, _default| given }

  # URI.escape was removed in Ruby 3.0; it simply delegated to this parser.
  @url = URI::DEFAULT_PARSER.escape(url)
  @convert_to_absolute_url = settings[:convert_to_absolute_url]
  @include_css_images = settings[:include_css_images]
  @include_css_data_images = settings[:include_css_data_images]

  # Best-effort fetch: any StandardError leaves the document unset.
  # The block form closes the handle once the body has been read.
  # NOTE(review): Kernel#open only fetches URLs when open-uri is loaded and
  # the Ruby version is older than 3.0 - confirm the supported runtimes.
  html = begin
    open(@url) { |io| io.read }
  rescue StandardError
    nil
  end
  @doc = html ? Nokogiri::HTML(html, nil, 'UTF-8') : nil
end

Instance Attribute Details

#convert_to_absolute_url ⇒ Object

Returns the value of attribute convert_to_absolute_url.



4
5
6
# File 'lib/image_scraper/client.rb', line 4

# Reader for the @convert_to_absolute_url flag.
#
# @return [Object] the current value of @convert_to_absolute_url
def convert_to_absolute_url; @convert_to_absolute_url; end

#doc ⇒ Object

Returns the value of attribute doc.



4
5
6
# File 'lib/image_scraper/client.rb', line 4

# Reader for @doc.
#
# @return [Object] the current value of @doc
def doc; @doc; end

#include_css_data_images ⇒ Object

Returns the value of attribute include_css_data_images.



4
5
6
# File 'lib/image_scraper/client.rb', line 4

# Reader for the @include_css_data_images flag.
#
# @return [Object] the current value of @include_css_data_images
def include_css_data_images; @include_css_data_images; end

#include_css_images ⇒ Object

Returns the value of attribute include_css_images.



4
5
6
# File 'lib/image_scraper/client.rb', line 4

# Reader for the @include_css_images flag.
#
# @return [Object] the current value of @include_css_images
def include_css_images; @include_css_images; end

#url ⇒ Object

Returns the value of attribute url.



4
5
6
# File 'lib/image_scraper/client.rb', line 4

# Reader for @url.
#
# @return [Object] the current value of @url
def url; @url; end

Instance Method Details

#image_urls ⇒ Object



16
17
18
19
20
# File 'lib/image_scraper/client.rb', line 16

# Collects the image URLs for the page.
#
# @return [Array] the result of page_images, followed by the result of
#   stylesheet_images when include_css_images is truthy
def image_urls
  return page_images unless include_css_images

  page_images + stylesheet_images
end

#page_images ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/image_scraper/client.rb', line 22

# Collects the src of every <img> element in the parsed document.
#
# Each src is stripped and percent-escaped. When convert_to_absolute_url is
# truthy it is additionally passed through ImageScraper::Util.absolute_url
# together with the page url.
#
# @return [Array] escaped image URLs in document order; empty when the
#   document is blank. Elements with a blank src are skipped.
def page_images
  urls = []
  return urls if doc.blank?

  doc.xpath("//img").each do |img|
    src = img["src"]
    next if src.blank?

    # URI.escape was removed in Ruby 3.0; it delegated to this parser, so the
    # escaping result is unchanged.
    image = URI::DEFAULT_PARSER.escape(src.strip)
    # escape characters that the URI escaper doesn't get
    image = image.gsub(/([{}|\^\[\]\@`])/) { |s| CGI.escape(s) }
    image = ImageScraper::Util.absolute_url(url, image) if convert_to_absolute_url
    urls << image
  end
  urls
end

#stylesheet_images ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/image_scraper/client.rb', line 35

# Collects the targets of every url(...) reference found in the page's
# stylesheets.
#
# Stylesheets that cannot be opened or read are skipped. Each reference is
# cleaned with ImageScraper::Util.cleanup_url and percent-escaped. References
# containing "data:image" are returned as-is, and only when
# @include_css_data_images is truthy. All other references are resolved
# against the stylesheet location with ImageScraper::Util.absolute_url when
# @convert_to_absolute_url is truthy.
#
# @return [Array] image URLs with nil entries removed
def stylesheet_images
  images = []
  stylesheets.each do |stylesheet|
    # Best-effort read; the block form closes the handle afterwards.
    # NOTE(review): Kernel#open only fetches URLs when open-uri is loaded and
    # the Ruby version is older than 3.0 - confirm the supported runtimes.
    css = begin
      open(stylesheet) { |io| io.read }
    rescue StandardError
      next
    end
    # Re-encode byte by byte so that arbitrary bytes become valid UTF-8.
    css = css.unpack("C*").pack("U*")

    css.scan(/url\((.*?)\)/).each do |match|
      # URI.escape was removed in Ruby 3.0; it delegated to this parser.
      image_url = URI::DEFAULT_PARSER.escape(ImageScraper::Util.cleanup_url(match[0]))
      # escape characters that the URI escaper doesn't get
      image_url = image_url.gsub(/([{}|\^\[\]\@`])/) { |s| CGI.escape(s) }

      if image_url.include?("data:image")
        # Previously a data: URL fell through to the generic branch when the
        # flag was off, so it was returned (or resolved as a path) anyway.
        images << image_url if @include_css_data_images
      elsif @convert_to_absolute_url
        images << ImageScraper::Util.absolute_url(stylesheet, image_url)
      else
        images << image_url
      end
    end
  end
  images.compact
end

#stylesheets ⇒ Object



54
55
56
57
58
59
# File 'lib/image_scraper/client.rb', line 54

# Lists the stylesheets linked from the parsed document.
#
# The href of every <link rel="stylesheet"> is cleaned with
# ImageScraper::Util.cleanup_url, percent-escaped, and resolved against the
# page url with ImageScraper::Util.absolute_url.
#
# @return [Array] stylesheet locations; empty when the document is blank.
#   Links whose href cannot be processed are dropped.
def stylesheets
  return [] if doc.blank?

  locations = doc.xpath('//link[@rel="stylesheet"]').map do |stylesheet|
    begin
      # URI.escape was removed in Ruby 3.0; it delegated to this parser.
      href = URI::DEFAULT_PARSER.escape(ImageScraper::Util.cleanup_url(stylesheet['href']))
      ImageScraper::Util.absolute_url(url, href)
    rescue StandardError
      # Best effort: a link that fails any step is skipped, not raised.
      nil
    end
  end
  locations.compact
end