Class: StaticImageDownloader::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/static_image_download/parser.rb

Constant Summary collapse

# Lookup table from a parse-option name to the instance method that implements it.
# Frozen so the table cannot be modified by accident at runtime.
PARSER_OPTIONS = {
	'URI_EXTRACT'		=>	:img_parse_uri_extract,
	'NOKOGIRI'			=>	:img_parse_nokogiri,
	'HPRICOT'			=>	:img_parse_hpricot
}.freeze
@@DEFAULTPARSEOPTION =

Default parse option; 'NOKOGIRI' or 'HPRICOT' can be used instead.

'URI_EXTRACT'
# User agent used by initialize when the user_agent argument is nil;
# get_url passes it as the 'User-Agent' option when opening the page.
@@DEFAULTUSERAGENT =
'Mozilla/5.0'
# Path used by initialize when the path argument is nil; push_image hands it
# to Images.new (presumably the download directory -- confirm in Images).
@@DEFAULTPATH =
"./"
# Page URL used by initialize when the url argument is nil.
@@DEFAULTSITE =
'http://feed.informer.com'
# Timeout used by initialize when the timeout argument is nil; parse_images
# passes it to Timeout::timeout.
@@DEFAULTTIMEOUT =
15

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url = @@DEFAULTSITE, path = @@DEFAULTPATH, parse_option = @@DEFAULTPARSEOPTION, timeout = @@DEFAULTTIMEOUT, user_agent = @@DEFAULTUSERAGENT, h = {}) ⇒ Parser

Returns a new instance of Parser.



21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/static_image_download/parser.rb', line 21

# Builds a parser for one page and prepares the regular expression that
# recognises absolute image URIs.
#
# Every argument that is passed as nil falls back to the matching class-level
# default, exactly as if it had been omitted.
#
# @param url [String, nil] address of the page to scan
# @param path [String, nil] path later handed to Images.new by push_image
# @param parse_option [String, nil] key of PARSER_OPTIONS selecting the parser
# @param timeout [Numeric, nil] limit passed to Timeout::timeout in parse_images
# @param user_agent [String, nil] value sent as the User-Agent option by get_url
# @param h [Hash] accepted for interface compatibility; not used by the constructor
# @raise [URI::InvalidURIError] if the resolved URL cannot be parsed
def initialize(url=@@DEFAULTSITE, path=@@DEFAULTPATH, parse_option=@@DEFAULTPARSEOPTION, timeout=@@DEFAULTTIMEOUT, user_agent=@@DEFAULTUSERAGENT, h={})
	@url 				= url.nil? ? @@DEFAULTSITE : url
	@user_agent 		= user_agent.nil? ? @@DEFAULTUSERAGENT : user_agent
	@path 				= path.nil? ? @@DEFAULTPATH : path
	@timeout 			= timeout.nil? ? @@DEFAULTTIMEOUT : timeout
	@parse_option 		= parse_option.nil? ? @@DEFAULTPARSEOPTION : parse_option
	@images 			= []
	@extracted_links 	= []
	# Matches an absolute http/https/ftp URI (IPv4 or host name, optional
	# credentials and port) whose path ends in one of Images::IMAGE_EXT.
	@rgxp_img_uri 		= Regexp.new(/^(http|https|ftp)\:\/\/([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)?((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.[a-zA-Z]{2,4})(\:[0-9]+)?(\/[^\/][a-zA-Z0-9\.\,\?\'\\\/\+&%\$#\=~_\-@]*)\.(#{Images::IMAGE_EXT.join('|')})/i)
	# Disabled variant that would also accept "www" and protocol-relative ("//") prefixes:
	#@rgxp_img_uri 		= Regexp.new(/^(((http|https|ftp)\:\/\/)|www|(\/\/))([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)?((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.[a-zA-Z]{2,4})(\:[0-9]+)?(\/[^\/][a-zA-Z0-9\.\,\?\'\\\/\+&%\$#\=~_\-@]*)\.(#{Images::IMAGE_EXT.join('|')})/i)
	# Parse the resolved @url, not the raw argument: an explicit nil url has
	# already been replaced by the default above and must not reach URI.parse.
	@domain 			= URI.parse(@url).host
	@content			= nil
end

Instance Attribute Details

#content ⇒ Object

Returns the value of attribute content.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @content: nil after initialize, assigned the fetched page text
# by get_content_raw.
def content
  @content
end

#extracted_links ⇒ Object

Returns the value of attribute extracted_links.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @extracted_links: the array of unique image URIs that
# get_extracted_links has accumulated so far (empty after initialize).
def extracted_links
  @extracted_links
end

#images ⇒ Object

Returns the value of attribute images.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @images: the array that push_image appends Images instances to
# (empty after initialize).
def images
  @images
end

#parse_option ⇒ Object

Returns the value of attribute parse_option.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @parse_option: the parser key that parse_images hands to
# method_to_value.
def parse_option
  @parse_option
end

#url ⇒ Object

Returns the value of attribute url.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @url: the page address that get_url opens.
def url
  @url
end

#user_agent ⇒ Object

Returns the value of attribute user_agent.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @user_agent: the value get_url sends as the 'User-Agent' option.
def user_agent
  @user_agent
end

Class Method Details

.default_parse_option ⇒ Object



36
37
38
# File 'lib/static_image_download/parser.rb', line 36

# Returns the class-wide default parse option (@@DEFAULTPARSEOPTION), the
# value initialize falls back to when parse_option is nil.
def default_parse_option
	@@DEFAULTPARSEOPTION
end

.default_path ⇒ Object



44
45
46
# File 'lib/static_image_download/parser.rb', line 44

# Returns the class-wide default path (@@DEFAULTPATH), the value initialize
# falls back to when path is nil.
def default_path
	@@DEFAULTPATH
end

.default_timeout ⇒ Object



48
49
50
# File 'lib/static_image_download/parser.rb', line 48

# Returns the class-wide default timeout (@@DEFAULTTIMEOUT), the value
# initialize falls back to when timeout is nil.
def default_timeout
	@@DEFAULTTIMEOUT
end

.default_user_agent ⇒ Object



40
41
42
# File 'lib/static_image_download/parser.rb', line 40

# Returns the class-wide default user agent (@@DEFAULTUSERAGENT), the value
# initialize falls back to when user_agent is nil.
def default_user_agent
	@@DEFAULTUSERAGENT
end

Instance Method Details

#collect_images ⇒ Object



116
117
118
119
120
# File 'lib/static_image_download/parser.rb', line 116

# Registers an image (via push_image) for every URI gathered in
# @extracted_links, in order.
#
# @return [Array] the @extracted_links array itself
def collect_images
	@extracted_links.each { |image_uri| push_image(image_uri) }
end

#get_content_raw ⇒ Object



69
70
71
72
73
# File 'lib/static_image_download/parser.rb', line 69

# Downloads the page through get_url and stores it in @content with every run
# of newlines, carriage returns and tabs collapsed into a single space.
#
# The handle returned by get_url is always closed, even when reading fails.
#
# @return [String] the normalised content (also when nothing had to be replaced)
def get_content_raw
	page = get_url
	begin
		raw = page.read
	ensure
		# Release the socket/tempfile backing the download.
		page.close if page.respond_to?(:close)
	end
	@content = raw.gsub(/[\n\r\t]+/, ' ')
end


93
94
95
96
97
98
99
100
101
# File 'lib/static_image_download/parser.rb', line 93

# Appends to @extracted_links the image URI found in each element of +links+,
# skipping URIs that are already recorded.
#
# Elements that are not Strings are read through their :src key and converted
# with to_s before matching against @rgxp_img_uri.
#
# @param links [Enumerable, nil, false] Strings or objects answering [:src]
# @return [false] when +links+ is nil or false
# @return [Enumerable] +links+ itself otherwise
def get_extracted_links(links)
	return false unless links
	links.each do |link|
		p "link= #{link}" if $debug_option
		text = link.is_a?(String) ? link : link[:src].to_s
		found = text.match(@rgxp_img_uri)
		next unless found
		image_uri = found[0]
		next if @extracted_links.include?(image_uri)
		@extracted_links << image_uri
	end
end

#get_url ⇒ Object



75
76
77
# File 'lib/static_image_download/parser.rb', line 75

# Opens the page at #url, passing #user_agent as the 'User-Agent' option.
#
# The address is parsed and opened through the resulting URI object (open-uri's
# OpenURI::OpenRead#open) instead of Kernel#open: Kernel#open no longer opens
# URLs from Ruby 3.0 on, and it treats strings beginning with "|" as shell
# commands.
#
# @return [Object] the handle returned by open-uri; the caller should close it
# @raise [URI::InvalidURIError] if #url is not a valid URI
def get_url
	URI.parse(url).open('User-Agent' => user_agent)
end

#img_parse_hpricot(h = {}) ⇒ Object



84
85
86
87
# File 'lib/static_image_download/parser.rb', line 84

# Parses @content with Hpricot and passes every <img> element to
# get_extracted_links.
#
# @param h [Hash] accepted for interface compatibility; not used here
# @return whatever get_extracted_links returns
def img_parse_hpricot(h={})
	img_nodes = Hpricot(@content).search("//img")
	get_extracted_links(img_nodes)
end

#img_parse_nokogiri(h = {}) ⇒ Object



79
80
81
82
# File 'lib/static_image_download/parser.rb', line 79

# Parses @content with Nokogiri::HTML and passes every <img> element to
# get_extracted_links.
#
# @param h [Hash] accepted for interface compatibility; not used here
# @return whatever get_extracted_links returns
def img_parse_nokogiri(h={})
	img_nodes = Nokogiri::HTML(@content).search("//img")
	get_extracted_links(img_nodes)
end

#img_parse_uri_extract(h = {}) ⇒ Object



89
90
91
# File 'lib/static_image_download/parser.rb', line 89

# Pulls every URI out of @content with URI.extract, keeps those matching
# @rgxp_img_uri and passes them to get_extracted_links.
#
# The candidates are tested against the already compiled @rgxp_img_uri; the
# pattern is not re-interpolated into a new Regexp for each candidate.
#
# @param h [Hash] accepted for interface compatibility; not used here
# @return whatever get_extracted_links returns
def img_parse_uri_extract(h={})
	image_uris = URI.extract(@content).select { |candidate| candidate =~ @rgxp_img_uri }
	get_extracted_links(image_uris)
end

#method_to_value(option, h = {}) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
# File 'lib/static_image_download/parser.rb', line 57

# Resolves +option+ to a parser method through option_to_method and invokes it
# with +h+.
#
# @param option [Object] key understood by option_to_method
# @param h [Hash] options forwarded to the parser method
# @return [Object] the parser's result, or "" when that result is nil/false
# @return [nil] when the call raises a StandardError (the error is printed)
def method_to_value(option, h={})
	method = option_to_method(option)
	p "method= #{method}" if $debug_option
	begin
		send(method, h) || ""
	rescue => error
		p "method_to_value.error = #{error}"
		nil
	end
end

#option_to_method(option) ⇒ Object



53
54
55
# File 'lib/static_image_download/parser.rb', line 53

# Looks up the parser method registered for +option+ in PARSER_OPTIONS.
#
# The option is converted to an upper-case String before the lookup, so
# 'NOKOGIRI', 'nokogiri' and :nokogiri all resolve to the same entry.
#
# @param option [String, Symbol, nil] parse-option name
# @return [Symbol, nil] the method name, or nil when the option is unknown
def option_to_method(option)
	PARSER_OPTIONS[option.to_s.upcase]
end

#parse_images(h = {}) ⇒ Object



103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/static_image_download/parser.rb', line 103

# Runs the parser selected by #parse_option and then collect_images, both
# inside a single Timeout block limited by @timeout.
#
# @param h [Hash] options forwarded to method_to_value
# @return [Object] the result of collect_images on success
# @return [nil] when a StandardError is raised (the error is printed)
def parse_images(h={})
	Timeout.timeout(@timeout) do
		method_to_value(parse_option, h)
		collect_images
	end
rescue => error
	p "#{error}"
	nil
end

#push_image(src) ⇒ Object



122
123
124
# File 'lib/static_image_download/parser.rb', line 122

# Wraps +src+ in a new Images object (built with @path and the Images default
# download option) and appends it to #images.
#
# @param src [String] image URI
# @return [Array] the #images array after the append
def push_image(src)
	image = Images.new(src, @path, Images.default_download_option)
	images << image
end