Class: ImageDownloader::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/image_downloader/parser.rb

Constant Summary collapse

A_HREF_IMAGE_PREFIX =
'_a_href_'
STYLE_URL_IMAGE_PREFIX =
'_style_url_'
LINK_ICON_IMAGE_PREFIX =
'_link_icon_'
COLLECT_METHODS_PREFIX =
'collect_from_'

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, user_agent) ⇒ Parser

Returns a new instance of Parser.



24
25
26
27
28
29
30
# File 'lib/image_downloader/parser.rb', line 24

# Sets up a parser for one page address.
#
# @param url the page address; kept as given in @argument_url and also run
#   through URI.parse, whose result is stored in @url
# @param user_agent stored in @user_agent
def initialize(url, user_agent)
  @argument_url, @user_agent = url, user_agent
  @url = URI.parse(@argument_url)
  # Both collections start out empty.
  @images, @images_hash = [], {}
end

Instance Attribute Details

#argument_url ⇒ Object

Returns the value of attribute argument_url.



17
18
19
# File 'lib/image_downloader/parser.rb', line 17

# Reader for @argument_url, which the constructor sets to its url argument
# exactly as it was passed in. open_url uses this value as the address to
# fetch.
def argument_url
  @argument_url
end

#content ⇒ Object

Returns the value of attribute content.



17
18
19
# File 'lib/image_downloader/parser.rb', line 17

# Reader for @content. The constructor does not assign it; get_content stores
# the result of Nokogiri::HTML here, while get_content_raw stores the text
# read from open_url.
def content
  @content
end

#images ⇒ Object

Returns the value of attribute images.



17
18
19
# File 'lib/image_downloader/parser.rb', line 17

# Reader for @images, the Array the constructor starts as []. push_to_images
# appends Images instances to it and ignore_file_without deletes entries
# from it.
def images
  @images
end

#images_hash ⇒ Object

Returns the value of attribute images_hash.



17
18
19
# File 'lib/image_downloader/parser.rb', line 17

# Reader for @images_hash, the Hash the constructor starts as {}.
# push_to_images stores every src it has accepted as a key (value 1) and
# checks it to skip a src it has already seen.
def images_hash
  @images_hash
end

#url ⇒ Object

Returns the value of attribute url.



17
18
19
# File 'lib/image_downloader/parser.rb', line 17

# Reader for @url, the URI.parse result of the constructor's url argument.
# push_to_images reads its host.
def url
  @url
end

#user_agent ⇒ Object

Returns the value of attribute user_agent.



17
18
19
# File 'lib/image_downloader/parser.rb', line 17

# Reader for @user_agent as given to the constructor; open_url sends it as
# the 'User-Agent' option of the request.
def user_agent
  @user_agent
end

Class Method Details

.all_collect_from_methods ⇒ Object Also known as: all_image_places



117
118
119
# File 'lib/image_downloader/parser.rb', line 117

# Picks the Parser instance methods whose names match COLLECT_METHODS_PREFIX
# (anywhere in the name) and passes them, as symbols, to to_hash_keys with a
# block that yields true.
# NOTE(review): to_hash_keys is not defined in the visible source; presumably
# it builds a Hash keyed by each name with the block's value. Confirm against
# its definition.
def self.all_collect_from_methods
  collectors = Parser.instance_methods.select do |method_name|
    method_name =~ /#{COLLECT_METHODS_PREFIX}/
  end
  collectors.map(&:to_sym).to_hash_keys { true }
end

.clear(str) ⇒ Object



46
47
48
49
50
51
52
# File 'lib/image_downloader/parser.rb', line 46

# Strips a CSS-style "url(" wrapper from +str+ in place.
#
# When +str+ contains "url(" in any letter case, everything up to and
# including it is removed and every ")" is deleted. Any other value,
# including nil, is returned untouched. The guard and the strip use the same
# case-insensitive "url(" test, so "URL(a.png" is unwrapped and a string that
# merely contains the letters "url" keeps its parentheses.
#
# @param str [String, nil] candidate image address; mutated in place
# @return [String, nil] the same object that was passed in
def self.clear(str)
  return str unless str =~ /url\(/i

  str.gsub!(/^.*?url\(/i, '')
  str.delete!(')')
  str
end

Instance Method Details

#collect_from_a_href(path) ⇒ Object



74
75
76
77
78
79
80
81
# File 'lib/image_downloader/parser.rb', line 74

# Queues the targets of <a> elements that look like image links.
#
# Walks the '//a' nodes of +content+, cleans each href with
# URL.remove_new_line_symbols!, and hands every href that contains a dot
# followed by one of Images::IMAGE_EXTENSIONS (case-insensitive, anywhere in
# the href) to push_to_images with the A_HREF_IMAGE_PREFIX file-name prefix.
#
# @param path forwarded unchanged to push_to_images
def collect_from_a_href(path)
  # Built once per call; the interpolated literal would otherwise be
  # recompiled for every anchor.
  image_link = /\.(?:#{Images::IMAGE_EXTENSIONS.join('|')})/i

  content.xpath('//a').each do |a|
    href = a[:href]
    # Anchors such as <a name="top"> have no href; there is nothing to clean
    # or match, so skip them before calling the in-place helper.
    next unless href

    URL.remove_new_line_symbols!(href)
    next if href !~ image_link

    push_to_images(path, href, { :file_name_prefix => A_HREF_IMAGE_PREFIX })
  end
end

#collect_from_img_src(path) ⇒ Object



66
67
68
69
70
71
72
# File 'lib/image_downloader/parser.rb', line 66

# Queues the src of every <img> element in +content+.
#
# Each src is cleaned with URL.remove_new_line_symbols! and then passed to
# push_to_images without a file-name prefix.
#
# @param path forwarded unchanged to push_to_images
def collect_from_img_src(path)
  content.xpath('//img').each do |img|
    src = img[:src]
    # An <img> without a src attribute yields nil, which push_to_images
    # cannot escape; skip it instead of aborting the whole scan.
    next unless src

    URL.remove_new_line_symbols!(src)
    push_to_images(path, src)
  end
end


#collect_from_link_icon(path) ⇒ Object



94
95
96
97
98
99
100
# File 'lib/image_downloader/parser.rb', line 94

# Queues the href of every <link rel="shortcut icon"> element in +content+.
#
# Each href is cleaned with URL.remove_new_line_symbols! and passed to
# push_to_images with the LINK_ICON_IMAGE_PREFIX file-name prefix. Only the
# exact rel value "shortcut icon" is matched by the XPath expression.
#
# @param path forwarded unchanged to push_to_images
def collect_from_link_icon(path)
  content.xpath('//link[@rel="shortcut icon"]').each do |link|
    src = link[:href]
    # A <link> without href yields nil, which push_to_images cannot escape.
    next unless src

    URL.remove_new_line_symbols!(src)
    push_to_images(path, src, { :file_name_prefix => LINK_ICON_IMAGE_PREFIX })
  end
end

#collect_from_style_url(path) ⇒ Object



83
84
85
86
87
88
89
90
91
92
# File 'lib/image_downloader/parser.rb', line 83

# Queues images referenced from inline style attributes.
#
# For every element of +content+ that has a style attribute, looks for a
# "background:" or "background-image:" declaration immediately followed by
# url(...), takes the text inside the parentheses (optional quotes dropped),
# cleans it with URL.remove_new_line_symbols! and passes it to push_to_images
# with the STYLE_URL_IMAGE_PREFIX file-name prefix.
#
# @param path forwarded unchanged to push_to_images
def collect_from_style_url(path)
  background_url = /(?:background|background-image):\s*url\(['"]?(.*?)['"]?\)/i

  content.xpath("//*[@style]").each do |element|
    found = background_url.match(element[:style])
    src = found && found[1]
    next unless src

    URL.remove_new_line_symbols!(src)
    push_to_images(path, src, { :file_name_prefix => STYLE_URL_IMAGE_PREFIX })
  end
end

#get_content ⇒ Object



58
59
60
# File 'lib/image_downloader/parser.rb', line 58

# Fetches the page through open_url and stores Nokogiri::HTML's parse of it
# in @content.
#
# The handle returned by open_url is closed once parsing has finished (or
# failed), so it is not left open until garbage collection. Nokogiri::HTML
# builds the whole document before returning, so the handle is not needed
# afterwards.
#
# @return the parsed document that was stored in @content
def get_content
  io = open_url
  @content = Nokogiri::HTML(io)
ensure
  # respond_to? guards both a failed open_url (io is nil) and an open_url
  # that returns something other than an IO-like object.
  io.close if io.respond_to?(:close)
end

#get_content_raw ⇒ Object



32
33
34
35
# File 'lib/image_downloader/parser.rb', line 32

# Fetches the page through open_url and stores its text in @content with
# every run of newlines, carriage returns and tabs collapsed to one space.
#
# The handle returned by open_url is closed as soon as it has been read.
#
# @return [String] the normalised text that was stored in @content
def get_content_raw
  io = open_url
  begin
    raw = io.read
  ensure
    io.close if io.respond_to?(:close)
  end
  # gsub rather than gsub!: the bang form returns nil when nothing was
  # replaced, which would make this method's return value unreliable.
  @content = raw.gsub(/[\n\r\t]+/, ' ')
end

#get_images(path, h = {}) ⇒ Object



62
63
64
# File 'lib/image_downloader/parser.rb', line 62

# Runs the collector methods named by the keys of +h+.
#
# Every key is treated as a method name and invoked on this object with
# +path+ as its only argument; the values of +h+ are not consulted.
#
# @param path passed to each invoked method
# @param h [Hash] keys are the method names to call
# @return [Hash] +h+ itself
def get_images(path, h = {})
  h.each_key do |collector|
    __send__(collector, path)
  end
end

#get_images_raw(path, h = {}) ⇒ Object



37
38
39
40
41
42
43
44
# File 'lib/image_downloader/parser.rb', line 37

# Scans the raw page text in +content+ for image addresses and queues them.
#
# Two passes are made, each capturing text that ends in a dot plus one of
# Images::IMAGE_EXTENSIONS (case-insensitive):
# 1. text enclosed in single or double quotes;
# 2. unquoted href=/src= attribute values.
# Every capture goes through Parser.clear before push_to_images.
#
# @param path forwarded unchanged to push_to_images
# @param h accepted but not used by this method
def get_images_raw(path, h = {})
  extensions = Images::IMAGE_EXTENSIONS.join('|')
  # scan yields the capture groups as an Array; each pattern has one group.
  enqueue = lambda { |captures| push_to_images(path, Parser.clear(captures.first)) }

  content.scan(/['"]+([^'"]+\.(?:#{extensions}))[^'"]*['"]+/i, &enqueue)
  content.scan(/(?:href|src)=([^\s'">]+\.(?:#{extensions}))[^\s'">]*[>\s]+/i, &enqueue)
end

#get_images_regexp(path, regexp) ⇒ Object



54
55
56
# File 'lib/image_downloader/parser.rb', line 54

# Queues every match of a caller-supplied pattern found in +content+.
#
# String#scan yields a String when +regexp+ has no capture groups and an
# Array of captures when it has. Calling to_s on that Array would produce
# its inspect form (for example ["a.png"]) instead of an address, so the
# captures are joined into one String; unmatched (nil) groups contribute
# nothing.
#
# @param path forwarded unchanged to push_to_images
# @param regexp [Regexp] pattern describing an image address
def get_images_regexp(path, regexp)
  content.scan(regexp) do |src|
    push_to_images(path, Array(src).join)
  end
end

#ignore_file_without(h = {}) ⇒ Object



111
112
113
114
115
# File 'lib/image_downloader/parser.rb', line 111

# Drops queued images whose file names lack an acceptable extension.
#
# With h[:extension] set, removes every image whose file_name does not end in
# a dot followed by zero to five letters. With h[:image_extension] set,
# removes every image whose file_name does not end in a dot plus one of
# Images::IMAGE_EXTENSIONS. Both checks are case-insensitive and both may be
# requested together. A nil/false +h+ does nothing.
#
# @param h [Hash, nil] filter switches (:extension, :image_extension)
def ignore_file_without(h = {})
  return unless h

  if h[:extension]
    images.delete_if { |image| image.file_name !~ /\.[a-z]{0,5}$/i }
  end
  if h[:image_extension]
    images.delete_if { |image| image.file_name !~ /\.(?:#{Images::IMAGE_EXTENSIONS.join('|')})$/i }
  end
end

#open_url ⇒ Object



125
126
127
# File 'lib/image_downloader/parser.rb', line 125

# Opens argument_url, sending user_agent as the 'User-Agent' request option.
#
# Prefers URI.open, which open-uri provides on Ruby 2.5 and later. Since
# Ruby 3.0 Kernel#open no longer opens URLs, and it also treats a string
# starting with "|" as a command to run. Kernel#open is kept only as a
# fallback for runtimes where URI.open is not available.
#
# @return the handle produced by open-uri; callers read the page from it
def open_url
  return URI.open(argument_url, 'User-Agent' => user_agent) if URI.respond_to?(:open)

  open(argument_url, 'User-Agent' => user_agent)
end

#push_to_images(path, src, h = {}) ⇒ Object



102
103
104
105
106
107
108
109
# File 'lib/image_downloader/parser.rb', line 102

# Queues one image address unless it has been queued before.
#
# A new +src+ is recorded in images_hash and an Images instance is appended
# to images, built from the page host, the escaped +src+ and an options Hash
# holding :catalog_path and :file_name_prefix. A nil +src+ is ignored because
# there is nothing to escape or download.
#
# URI.escape was removed in Ruby 3.0, so the escaping is done through the
# RFC 2396 parser object, which provides the same escape method.
#
# @param path stored as :catalog_path
# @param src [String, nil] image address as found in the page
# @param h [Hash] optional :file_name_prefix (defaults to '')
# @return [Array, nil] images after the append, or nil when nothing was added
def push_to_images(path, src, h = {})
  return if src.nil? || images_hash.key?(src)

  images_hash[src] = 1
  # RFC2396_PARSER exists on newer uri versions; DEFAULT_PARSER offers the
  # same escape on older ones.
  escaper = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
  images.push Images.new(url.host, escaper.escape(src), {
    :catalog_path => path,
    :file_name_prefix => (h[:file_name_prefix] || '')
  })
end