Class: Readable::Webpage
- Inherits:
-
Object
- Object
- Readable::Webpage
- Defined in:
- lib/readable/webpage.rb
Instance Method Summary
- #content ⇒ Object
- #doc ⇒ Object
- #html ⇒ Object
- #initialize(url) ⇒ Webpage (constructor)
  A new instance of Webpage.
- #title ⇒ Object
Constructor Details
#initialize(url) ⇒ Webpage
Returns a new instance of Webpage.
8 9 10 |
# File 'lib/readable/webpage.rb', line 8
# Creates a Webpage for the given address.
#
# The address is stored in @url as given; nothing is validated or fetched
# at construction time.
#
# @param url [Object] address of the page (presumably a String URL — confirm
#   against callers)
def initialize(url)
  @url = url
end
Instance Method Details
#content ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/readable/webpage.rb', line 46
# Extracts the main content of the page.
#
# Every rule in Rule.rules is asked first; the first non-nil answer wins.
# Failing that, the parent element of each <p> in the document becomes a
# candidate and the candidate whose sanitized text is longest is chosen.
# The result is memoized in @content.
#
# Side effect: <script> descendants are removed from every parent of a <p>
# that is visited in the parsed document.
#
# @return [Object] the first non-nil rule result, otherwise the inner HTML
#   (String) of the winning candidate, or '' when no candidate has any
#   sanitized text
def content
  return @content unless @content.nil?

  Rule.rules.each do |rule|
    @content = rule.content(doc)
    return @content unless @content.nil?
  end

  candidates = {}
  doc.css('p').each do |paragraph|
    container = paragraph.parent
    container.css('script').remove

    # Key by tag name plus id/class so that sibling paragraphs (and
    # identically labelled containers) contribute a single candidate;
    # the first container seen for a key is the one kept.
    key = container.name.dup
    key << "##{container.attributes['id'].value}" if container.attributes['id']
    key << ".#{container.attributes['class'].value}" if container.attributes['class']
    candidates[key] ||= container.inner_html
  end

  @content = ''
  # Track the best length so the current winner is not re-sanitized on
  # every comparison.
  best_length = Sanitize.clean(@content).length
  candidates.each_value do |markup|
    length = Sanitize.clean(markup).length
    next unless length > best_length

    @content = markup
    best_length = length
  end
  @content
end
#doc ⇒ Object
27 28 29 |
# File 'lib/readable/webpage.rb', line 27
# Parsed form of the page, built on first use and cached in @doc.
#
# @return [Object] whatever Nokogiri::HTML produces for the markup
#   returned by #html
def doc
  return @doc if @doc

  @doc = Nokogiri::HTML(html)
end
#html ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/readable/webpage.rb', line 12
# Downloads the page at @url and returns its markup.
#
# The charset is sniffed from the body itself (for example a <meta> tag).
# A body declared as anything other than UTF-8 is transcoded to UTF-8, with
# bytes that cannot be converted replaced rather than raising. A body that
# is declared as UTF-8, declares nothing, or names an encoding Ruby does not
# know is returned exactly as received.
#
# @return [String, nil] the response body
def html
  uri = URI.parse(@url)
  # Net::HTTP only negotiates TLS when asked to, so https URLs must opt in.
  response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
    http.get(uri.request_uri)
  end

  body = response.body
  encoding = declared_encoding(body)
  return body if encoding.nil? || encoding == Encoding::UTF_8

  body.force_encoding(encoding).encode('utf-8', invalid: :replace, undef: :replace)
end

# Looks up the encoding named by the first charset declaration in +body+.
# Accepts both `charset=name` (inside a content attribute) and the quoted
# `charset="name"` / `charset='name'` forms.
#
# @param body [String, nil] raw response body
# @return [Encoding, nil] nil when nothing is declared or the name is unknown
def declared_encoding(body)
  match = /charset\s*=\s*["']?([\w.:-]+)/i.match(body.to_s)
  match && Encoding.find(match[1])
rescue ArgumentError
  # Encoding.find raises for names Ruby does not recognise.
  nil
end
private :declared_encoding
#title ⇒ Object
31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/readable/webpage.rb', line 31
# Works out a title for the page.
#
# Every rule in Rule.rules is asked first; the first answer that is neither
# nil nor blank wins. Failing that, the text of the first <h1>, then <h2>,
# and so on down to <h5>, is used, again skipping blank text. A title that
# is found is memoized in @title.
#
# @return [String, nil] the title, or nil when no non-blank candidate exists
def title
  return @title unless @title.nil?

  Rule.rules.each do |rule|
    candidate = rule.title(doc)
    return @title = candidate unless candidate.nil? || candidate.strip == ''
  end

  %w[h1 h2 h3 h4 h5].each do |tag|
    heading = doc.css(tag).first
    next if heading.nil?

    candidate = heading.text
    return @title = candidate unless candidate.nil? || candidate.strip == ''
  end

  # Explicit nil: otherwise the value of the `each` above (the tag list)
  # would leak out as the "title".
  nil
end