Class: StaticImageDownloader::Parser
- Inherits:
-
Object
- Object
- StaticImageDownloader::Parser
- Defined in:
- lib/static_image_download/parser.rb
Constant Summary collapse
- PARSER_OPTIONS =
{ 'URI_EXTRACT' => :img_parse_uri_extract, 'NOKOGIRI' => :img_parse_nokogiri, 'HPRICOT' => :img_parse_hpricot }
- @@DEFAULTPARSEOPTION =
You can also use 'NOKOGIRI' or 'HPRICOT'.
'URI_EXTRACT'
- @@DEFAULTUSERAGENT =
'Mozilla/5.0'
- @@DEFAULTPATH =
"./"
- @@DEFAULTSITE =
'http://feed.informer.com'
- @@DEFAULTTIMEOUT =
15
Instance Attribute Summary collapse
-
#content ⇒ Object
Returns the value of attribute content.
-
#extracted_links ⇒ Object
Returns the value of attribute extracted_links.
-
#images ⇒ Object
Returns the value of attribute images.
-
#parse_option ⇒ Object
Returns the value of attribute parse_option.
-
#url ⇒ Object
Returns the value of attribute url.
-
#user_agent ⇒ Object
Returns the value of attribute user_agent.
Class Method Summary collapse
- .default_parse_option ⇒ Object
- .default_path ⇒ Object
- .default_timeout ⇒ Object
- .default_user_agent ⇒ Object
Instance Method Summary collapse
- #collect_images ⇒ Object
- #get_content_raw ⇒ Object
- #get_extracted_links(links) ⇒ Object
- #get_url ⇒ Object
- #img_parse_hpricot(h = {}) ⇒ Object
- #img_parse_nokogiri(h = {}) ⇒ Object
- #img_parse_uri_extract(h = {}) ⇒ Object
-
#initialize(url = @@DEFAULTSITE, path = @@DEFAULTPATH, parse_option = @@DEFAULTPARSEOPTION, timeout = @@DEFAULTTIMEOUT, user_agent = @@DEFAULTUSERAGENT, h = {}) ⇒ Parser
constructor
A new instance of Parser.
- #method_to_value(option, h = {}) ⇒ Object
- #option_to_method(option) ⇒ Object
- #parse_images(h = {}) ⇒ Object
- #push_image(src) ⇒ Object
Constructor Details
#initialize(url = @@DEFAULTSITE, path = @@DEFAULTPATH, parse_option = @@DEFAULTPARSEOPTION, timeout = @@DEFAULTTIMEOUT, user_agent = @@DEFAULTUSERAGENT, h = {}) ⇒ Parser
Returns a new instance of Parser.
21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/static_image_download/parser.rb', line 21

# Creates a parser for one page and prepares the image-URI pattern.
#
# Every positional argument falls back to its class-level default when nil
# (or false) is passed.
#
# @param url [String] page address; also used to derive @domain
# @param path [String] stored in @path and later handed to Images.new
# @param parse_option [String] a key of PARSER_OPTIONS
# @param timeout [Numeric] limit passed to Timeout.timeout by #parse_images
# @param user_agent [String] value sent as the User-Agent header by #get_url
# @param h [Hash] accepted for interface compatibility; not used here
def initialize(url=@@DEFAULTSITE, path=@@DEFAULTPATH, parse_option=@@DEFAULTPARSEOPTION, timeout=@@DEFAULTTIMEOUT, user_agent=@@DEFAULTUSERAGENT, h={})
  @url          = url || @@DEFAULTSITE
  @user_agent   = user_agent || @@DEFAULTUSERAGENT
  @path         = path || @@DEFAULTPATH
  @timeout      = timeout || @@DEFAULTTIMEOUT
  @parse_option = parse_option || @@DEFAULTPARSEOPTION

  @images          = []
  @extracted_links = []

  # Absolute http/https/ftp URI (optional credentials, IPv4 or host name,
  # optional port) whose path ends in one of Images::IMAGE_EXT.
  # An earlier, commented-out variant of this pattern also accepted `www`
  # and protocol-relative `//` prefixes.
  @rgxp_img_uri = /^(http|https|ftp)\:\/\/([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)?((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.[a-zA-Z]{2,4})(\:[0-9]+)?(\/[^\/][a-zA-Z0-9\.\,\?\'\\\/\+&%\$#\=~_\-@]*)\.(#{Images::IMAGE_EXT.join('|')})/i

  # Parse the defaulted @url, not the raw argument: a nil `url` would
  # otherwise raise here even though a default site has been substituted.
  @domain  = URI.parse(@url).host
  @content = nil
end
Instance Attribute Details
#content ⇒ Object
Returns the value of attribute content.
6 7 8 |
# File 'lib/static_image_download/parser.rb', line 6

# Reader for the page body held in @content (nil until it has been assigned).
#
# @return [Object] the current value of @content
def content
  @content
end
#extracted_links ⇒ Object
Returns the value of attribute extracted_links.
6 7 8 |
# File 'lib/static_image_download/parser.rb', line 6

# Reader for the list of image URIs collected so far.
#
# @return [Object] the current value of @extracted_links
def extracted_links
  @extracted_links
end
#images ⇒ Object
Returns the value of attribute images.
6 7 8 |
# File 'lib/static_image_download/parser.rb', line 6

# Reader for the collection that #push_image appends to.
#
# @return [Object] the current value of @images
def images
  @images
end
#parse_option ⇒ Object
Returns the value of attribute parse_option.
6 7 8 |
# File 'lib/static_image_download/parser.rb', line 6

# Reader for the configured parser key (looked up in PARSER_OPTIONS).
#
# @return [Object] the current value of @parse_option
def parse_option
  @parse_option
end
#url ⇒ Object
Returns the value of attribute url.
6 7 8 |
# File 'lib/static_image_download/parser.rb', line 6

# Reader for the page address this parser works on.
#
# @return [Object] the current value of @url
def url
  @url
end
#user_agent ⇒ Object
Returns the value of attribute user_agent.
6 7 8 |
# File 'lib/static_image_download/parser.rb', line 6

# Reader for the User-Agent string sent by #get_url.
#
# @return [Object] the current value of @user_agent
def user_agent
  @user_agent
end
Class Method Details
.default_parse_option ⇒ Object
36 37 38 |
# File 'lib/static_image_download/parser.rb', line 36

# Class-level accessor for the default parser key.
#
# @return [Object] the value of @@DEFAULTPARSEOPTION
def default_parse_option
  @@DEFAULTPARSEOPTION
end
.default_path ⇒ Object
44 45 46 |
# File 'lib/static_image_download/parser.rb', line 44

# Class-level accessor for the default path.
#
# @return [Object] the value of @@DEFAULTPATH
def default_path
  @@DEFAULTPATH
end
.default_timeout ⇒ Object
48 49 50 |
# File 'lib/static_image_download/parser.rb', line 48

# Class-level accessor for the default timeout.
#
# @return [Object] the value of @@DEFAULTTIMEOUT
def default_timeout
  @@DEFAULTTIMEOUT
end
.default_user_agent ⇒ Object
40 41 42 |
# File 'lib/static_image_download/parser.rb', line 40

# Class-level accessor for the default User-Agent string.
#
# @return [Object] the value of @@DEFAULTUSERAGENT
def default_user_agent
  @@DEFAULTUSERAGENT
end
Instance Method Details
#collect_images ⇒ Object
116 117 118 119 120 |
# File 'lib/static_image_download/parser.rb', line 116

# Hands every entry of @extracted_links to #push_image, in order.
#
# @return [Array] @extracted_links itself (the result of `each`)
def collect_images
  @extracted_links.each { |image_link| push_image(image_link) }
end
#get_content_raw ⇒ Object
69 70 71 72 73 |
# File 'lib/static_image_download/parser.rb', line 69

# Downloads the page via #get_url and stores it in @content with every run
# of newline, carriage-return and tab characters collapsed to one space.
#
# Uses the non-bang `gsub`: `gsub!` returns nil when nothing was replaced,
# which made this method return nil for pages without such characters.
#
# @return [String] the normalised page content (also stored in @content)
def get_content_raw
  @content = get_url.read.gsub(/[\n\r\t]+/, ' ')
end
#get_extracted_links(links) ⇒ Object
93 94 95 96 97 98 99 100 101 |
# File 'lib/static_image_download/parser.rb', line 93

# Filters candidate links down to image URIs and appends the new ones to
# @extracted_links.
#
# Each entry may be a String or any object answering `[:src]` (its value is
# converted with `to_s`). Only the portion matched by @rgxp_img_uri is
# stored, and a URI already present in @extracted_links is skipped.
#
# @param links [Enumerable, nil] candidate links
# @return [false] when `links` is nil or false
# @return [Enumerable] `links` itself otherwise (the result of `each`)
def get_extracted_links(links)
  return false unless links

  links.each do |link|
    p "link= #{link}" if $debug_option
    link = link[:src].to_s unless link.is_a?(String)

    # Run the pattern once; String#[] yields the matched text or nil.
    image_uri = link[@rgxp_img_uri]
    next if image_uri.nil? || @extracted_links.include?(image_uri)

    @extracted_links << image_uri
  end
end
#get_url ⇒ Object
75 76 77 |
# File 'lib/static_image_download/parser.rb', line 75

# Opens the configured URL through open-uri, sending the configured
# User-Agent header.
#
# The URL is parsed first and opened through the resulting URI object
# rather than through bare `Kernel#open`: on Ruby 3.0+ `Kernel#open` no
# longer fetches URLs, and on older versions it would run a command for a
# string starting with `|`.
#
# @return [Object] whatever open-uri returns for the URI; #get_content_raw
#   calls `read` on it
def get_url
  URI.parse(url).open('User-Agent' => user_agent)
end
#img_parse_hpricot(h = {}) ⇒ Object
84 85 86 87 |
# File 'lib/static_image_download/parser.rb', line 84

# Parses @content with Hpricot and passes every node found by the `//img`
# search to #get_extracted_links.
#
# @param h [Hash] accepted for interface compatibility; not used here
# @return [Object] the result of #get_extracted_links
def img_parse_hpricot(h={})
  img_nodes = Hpricot(@content).search("//img")
  get_extracted_links(img_nodes)
end
#img_parse_nokogiri(h = {}) ⇒ Object
79 80 81 82 |
# File 'lib/static_image_download/parser.rb', line 79

# Parses @content with Nokogiri::HTML and passes every node found by the
# `//img` search to #get_extracted_links.
#
# @param h [Hash] accepted for interface compatibility; not used here
# @return [Object] the result of #get_extracted_links
def img_parse_nokogiri(h={})
  img_nodes = Nokogiri::HTML(@content).search("//img")
  get_extracted_links(img_nodes)
end
#img_parse_uri_extract(h = {}) ⇒ Object
89 90 91 |
# File 'lib/static_image_download/parser.rb', line 89

# Pulls every URI out of @content with URI.extract, keeps those matching
# @rgxp_img_uri and passes them to #get_extracted_links.
#
# The stored Regexp is used directly; interpolating it into a new literal
# recompiled an equivalent pattern for every candidate.
#
# @param h [Hash] accepted for interface compatibility; not used here
# @return [Object] the result of #get_extracted_links
def img_parse_uri_extract(h={})
  candidates = URI.extract(@content).select { |link| link =~ @rgxp_img_uri }
  get_extracted_links(candidates)
end
#method_to_value(option, h = {}) ⇒ Object
57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/static_image_download/parser.rb', line 57

# Resolves `option` to a method name via #option_to_method and invokes it
# with `h`.
#
# @param option [Object] key handed to #option_to_method
# @param h [Hash] argument forwarded to the resolved method
# @return [Object] the method's result, or "" when that result is nil/false
# @return [nil] when the call raises a StandardError (the error is printed)
def method_to_value(option, h={})
  handler = option_to_method(option)
  p "method= #{handler}" if $debug_option

  begin
    send(handler, h) || ""
  rescue => failure
    p "method_to_value.error = #{failure}"
    nil
  end
end
#option_to_method(option) ⇒ Object
53 54 55 |
# File 'lib/static_image_download/parser.rb', line 53

# Looks up the parser method registered for `option` in PARSER_OPTIONS.
#
# @param option [String] a PARSER_OPTIONS key
# @return [Symbol, nil] the method name, or nil for an unknown key
def option_to_method(option)
  PARSER_OPTIONS[option]
end
#parse_images(h = {}) ⇒ Object
103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/static_image_download/parser.rb', line 103

# Runs the configured parser and then #collect_images, both inside a single
# Timeout window of @timeout.
#
# @param h [Hash] forwarded to #method_to_value
# @return [Object] the result of #collect_images on success
# @return [nil] when a StandardError is raised inside the block (the error
#   is printed)
def parse_images(h={})
  Timeout::timeout(@timeout) do
    method_to_value(parse_option, h)
    collect_images
  end
rescue => error
  p "#{error}"
  nil
end
#push_image(src) ⇒ Object
122 123 124 |
# File 'lib/static_image_download/parser.rb', line 122

# Wraps `src` in an Images object (built with @path and
# Images.default_download_option) and appends it to #images.
#
# @param src [Object] passed as the first argument to Images.new
# @return [Array] the images collection (the result of `push`)
def push_image(src)
  image = Images.new(src, @path, Images.default_download_option)
  images.push(image)
end