Class: MechanizeContent
- Inherits:
-
Object
- Object
- MechanizeContent
- Defined in:
- lib/mechanize-content.rb,
lib/mechanize-content/util.rb
Defined Under Namespace
Classes: Util
Instance Attribute Summary collapse
-
#urls ⇒ Object
Returns the value of attribute urls.
Instance Method Summary collapse
- #best_image ⇒ Object
- #best_text ⇒ Object
- #best_title ⇒ Object
- #fetch_content(page) ⇒ Object
- #fetch_image(page) ⇒ Object
- #fetch_images ⇒ Object
- #fetch_page(url) ⇒ Object
- #fetch_pages ⇒ Object
- #fetch_text(page) ⇒ Object
- #fetch_texts ⇒ Object
- #fetch_titles ⇒ Object
- #find_best_image(all_images, url) ⇒ Object
- #init_agent ⇒ Object
-
#initialize(*args) ⇒ MechanizeContent
constructor
A new instance of MechanizeContent.
Constructor Details
#initialize(*args) ⇒ MechanizeContent
Returns a new instance of MechanizeContent.
11 12 13 |
# File 'lib/mechanize-content.rb', line 11

# Stores the URLs that the fetch_* methods will work through.
#
# URLs may be passed as separate arguments or as a single array; nested
# arrays are flattened so that @urls is always a flat list.
#
# @param args [Array] one or more URLs, or an array of URLs
def initialize(*args)
  # Without the flatten, new([a, b]) would store [[a, b]] and every later
  # iteration over @urls would hand a whole array to the agent.
  @urls = args.flatten
end
Instance Attribute Details
#urls ⇒ Object
Returns the value of attribute urls.
9 10 11 |
# File 'lib/mechanize-content.rb', line 9

# Reader for the URL list held in @urls.
#
# @return [Object] whatever is currently stored in @urls
def urls
  @urls
end
Instance Method Details
#best_image ⇒ Object
23 24 25 |
# File 'lib/mechanize-content.rb', line 23

# Returns the remembered best image, fetching one when none is stored yet.
#
# @return [Object, nil] the value of @best_image if truthy, otherwise the
#   result of fetch_images
def best_image
  return @best_image if @best_image

  fetch_images
end
#best_text ⇒ Object
19 20 21 |
# File 'lib/mechanize-content.rb', line 19

# Returns the remembered best text, fetching it when none is stored yet.
#
# @return [Object, nil] the value of @best_text if truthy, otherwise the
#   result of fetch_texts
def best_text
  return @best_text if @best_text

  fetch_texts
end
#best_title ⇒ Object
15 16 17 |
# File 'lib/mechanize-content.rb', line 15

# Returns the remembered best title, fetching it when none is stored yet.
#
# @return [Object, nil] the value of @best_title if truthy, otherwise the
#   result of fetch_titles
def best_title
  return @best_title if @best_title

  fetch_titles
end
#fetch_content(page) ⇒ Object
103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# File 'lib/mechanize-content.rb', line 103

# Picks the element of the page that most likely holds the main content.
#
# Every parent of a <p> element accumulates a score:
# * its class and its id are each checked: a name hinting at comments, meta,
#   footers or footnotes subtracts 50, otherwise a name hinting at a post,
#   entry or article adds 25;
# * a paragraph whose text is longer than 10 characters adds 1;
# * every comma in the paragraph text adds 1.
#
# The highest scoring parent is returned after its script, iframe, h1 and h2
# descendants have been unlinked, so the parsed document is modified.
#
# @param page [#parser] page whose parser responds to css
# @return [Object, nil] the top scoring parent node; nil when the page has no
#   paragraphs or when the Flash-player link check below finds a match
def fetch_content(page)
  negative = Regexp.new('(comment|meta|footer|footnote)')
  positive = Regexp.new('((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))')
  scores = {}

  page.parser.css('p').each do |paragraph|
    container = paragraph.parent
    scores[container] = 0 if scores[container].nil?

    # The class attribute and the id attribute are scored by the same rules.
    [container['class'] || "", container['id'] || ""].each do |label|
      if label =~ negative
        scores[container] -= 50
      elsif label =~ positive
        scores[container] += 25
      end
    end

    body = paragraph.inner_text
    scores[container] += 1 if body.length > 10

    # Stops the whole scan (not only this paragraph) as soon as a parent's
    # first attribute value mentions "comment".
    first_attribute = container.attributes.values.first
    break if !first_attribute.nil? && first_attribute.value.include?("comment")

    scores[container] += body.count(',')
  end

  ranked = scores.sort_by { |_node, score| -score }
  return nil if ranked.empty?

  winner = ranked.first.first
  # NOTE(review): the expression starts with "//", which in XPath selects from
  # the document root, so this presumably inspects the whole page rather than
  # only the winning node - confirm that this is intended.
  return nil unless winner.xpath("//a[@href='http://get.adobe.com/flashplayer/']").empty?

  %w[script iframe h1 h2].each { |tag| winner.css(tag).unlink }
  winner
end
#fetch_image(page) ⇒ Object
151 152 153 154 155 156 157 158 |
# File 'lib/mechanize-content.rb', line 151

# Finds the best image inside the main content of a page.
#
# @param page [#parser, #uri] page to inspect
# @return [Object, nil] the result of find_best_image for the img elements of
#   the page's top content, or nil when fetch_content found nothing
def fetch_image(page)
  top_content = fetch_content(page)
  return nil unless top_content

  candidates = top_content.css('img')
  base_url = Util.get_base_url(page.parser, page.uri)
  find_best_image(candidates, base_url)
end
#fetch_images ⇒ Object
27 28 29 30 31 32 33 |
# File 'lib/mechanize-content.rb', line 27

# Walks the pages in order and remembers the first image that can be found.
#
# Uses @pages when it is already set, otherwise calls fetch_pages.
#
# @return [Object, nil] the first non-nil fetch_image result (also stored in
#   @best_image), or nil when no page yields an image
def fetch_images
  pages = @pages || fetch_pages
  pages.each do |page|
    candidate = fetch_image(page)
    next if candidate.nil?

    @best_image = candidate
    return candidate
  end
  nil
end
#fetch_page(url) ⇒ Object
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/mechanize-content.rb', line 65

# Retrieves a single URL through the shared agent (created on first use).
#
# Failures are reported on stdout and swallowed, so callers only ever see a
# page or nil.
#
# @param url [String, #to_s] address to fetch
# @return [Mechanize::Page, nil] the fetched page; nil when the request
#   failed or the response is not a Mechanize::Page
def fetch_page(url)
  page = (@agent || init_agent).get(url)
  page.is_a?(Mechanize::Page) ? page : nil
# Each handler ends with puts, whose nil return value becomes the result.
# Messages use interpolation so that a non-String url (for example a URI
# object) cannot raise TypeError from inside the handler.
rescue Timeout::Error
  puts "Timeout - #{url}"
rescue Errno::ECONNRESET
  puts "Connection reset by peer - #{url}"
rescue Mechanize::ResponseCodeError
  puts "Invalid url"
rescue Mechanize::UnsupportedSchemeError
  puts "Unsupported Scheme"
rescue
  puts "There was a problem connecting - #{url}"
end
#fetch_pages ⇒ Object
56 57 58 59 60 61 62 63 |
# File 'lib/mechanize-content.rb', line 56

# Fetches every URL in @urls and keeps the pages that could be retrieved.
#
# @pages is reset before fetching; URLs for which fetch_page returns nil are
# skipped.
#
# @return [Array] the pages collected in @pages, in URL order
def fetch_pages
  @pages = []
  @urls.each do |url|
    fetched = fetch_page(url)
    next if fetched.nil?

    @pages.push(fetched)
  end
  @pages
end
#fetch_text(page) ⇒ Object
92 93 94 95 96 97 98 99 100 101 |
# File 'lib/mechanize-content.rb', line 92 def fetch_text(page) top_content = fetch_content(page) if top_content text = top_content.text.delete("\t").delete("\n").strip ic = Iconv.new('UTF-8//IGNORE', 'UTF-8') text = ic.iconv(text + ' ')[0..-2] else return nil end end |
#fetch_texts ⇒ Object
35 36 37 38 39 40 41 |
# File 'lib/mechanize-content.rb', line 35

# Walks the pages in order and remembers the first usable text.
#
# Uses @pages when it is already set, otherwise calls fetch_pages.
#
# @return [String, nil] the first fetch_text result that is neither nil nor
#   empty (also stored in @best_text), or nil when there is none
def fetch_texts
  pages = @pages || fetch_pages
  pages.each do |page|
    candidate = fetch_text(page)
    next if candidate.nil? || candidate.empty?

    @best_text = candidate
    return candidate
  end
  nil
end
#fetch_titles ⇒ Object
43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/mechanize-content.rb', line 43

# Walks the pages in order and remembers the first title that is present.
#
# Uses @pages when it is already set, otherwise calls fetch_pages. Invalid
# byte sequences are dropped from the title.
#
# @return [Object] the cleaned title of the first page that has one (also
#   stored in @best_title); when no page has a title, the first entry of
#   @urls, which is not stored
def fetch_titles
  (@pages || fetch_pages).each do |page|
    title = page.title
    next if title.nil?

    @best_title =
      if title.respond_to?(:scrub)
        # Iconv is not in the standard library from Ruby 2.0 on;
        # String#scrub drops invalid bytes without it.
        title.scrub('')
      else
        # Older Rubies: pad, convert, then cut the padding off again.
        Iconv.new('UTF-8//IGNORE', 'UTF-8').iconv(title + ' ')[0..-2]
      end
    return @best_title
  end
  @urls.first
end
#find_best_image(all_images, url) ⇒ Object
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
# File 'lib/mechanize-content.rb', line 160

# Chooses the first acceptable image from a list of img elements.
#
# Pass 1 trusts the width/height attributes of each element. Pass 2 is only
# reached when pass 1 finds nothing: each image is opened and measured with
# ImageSize. Any error aborts the whole search; it is reported on stdout and
# nil is returned.
#
# @param all_images [Enumerable] elements responding to [] for "src",
#   "width" and "height"
# @param url [Object] base URL handed to Util.build_absolute_url
# @return [Object, nil] the absolute URL of the first image accepted by
#   Util.valid_image?, or nil when none qualifies or an error occurred
def find_best_image(all_images, url)
  current_src = nil

  all_images.each do |img|
    current_src = img["src"]
    if Util.valid_image?(img['width'].to_i, img['height'].to_i, current_src)
      return Util.build_absolute_url(current_src, url)
    end
  end

  # Bare Kernel#open stopped fetching URLs in Ruby 3.0, so prefer URI.open
  # when open-uri provides it; for non-URL strings it behaves like open.
  # NOTE(review): the src values come from fetched pages; URI.open falls back
  # to Kernel#open for non-URL strings, and that treats a leading "|" as a
  # command - confirm Util.build_absolute_url always returns an absolute URL.
  opener = if defined?(URI) && URI.respond_to?(:open)
             URI
           else
             Kernel
           end

  all_images.each do |img|
    current_src = img["src"]
    current_src = Util.build_absolute_url(current_src, url)
    opener.open(current_src, "rb") do |fh|
      dimensions = ImageSize.new(fh.read)
      return current_src if Util.valid_image?(dimensions.width, dimensions.height, current_src)
    end
  end

  nil
# current_src may still be nil here (e.g. an img without src), so the
# messages are interpolated instead of concatenated to avoid raising
# TypeError from inside the handler. puts returns nil, which is the result.
rescue Errno::ENOENT
  puts "No such file - #{current_src}"
rescue
  puts "There was a problem connecting - #{current_src}"
end
#init_agent ⇒ Object
86 87 88 89 90 |
# File 'lib/mechanize-content.rb', line 86

# Creates the Mechanize agent, sets its user agent alias to 'Mac Safari' and
# stores it in @agent.
#
# @return [Mechanize] the newly created agent
def init_agent
  @agent = Mechanize.new.tap do |browser|
    browser.user_agent_alias = 'Mac Safari'
  end
end