Class: Readable::Webpage

Inherits:
Object
  • Object
show all
Defined in:
lib/readable/webpage.rb

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Webpage

Returns a new instance of Webpage.



8
9
10
# File 'lib/readable/webpage.rb', line 8

# Builds a Webpage for the given address. Nothing is fetched here; the URL
# is only stored for later use.
#
# @param url the page address, kept as-is in @url (the html method later
#   hands it to URI.parse, so it should be a parseable URL string)
def initialize(url)
  @url = url
end

Instance Method Details

#content ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/readable/webpage.rb', line 46

# Returns the HTML of the page's main content, memoized in @content.
#
# Every rule in Rule.rules is asked first; the first non-nil answer wins.
# When no rule answers, a heuristic is used: the parent element of each <p>
# is a candidate, and the candidate whose sanitized text is longest is
# returned (the first one wins a tie).
#
# Side effect: <script> elements under every candidate parent are removed
# from the parsed document.
#
# @return [String] the content HTML, or '' when the page has no <p> elements
#   (or no candidate has any text after sanitizing)
def content
  return @content unless @content.nil?

  Rule.rules.each do |rule|
    @content = rule.content(doc)
    return @content unless @content.nil?
  end

  candidates = {}
  doc.css('p').each do |paragraph|
    parent = paragraph.parent
    # Scripts are stripped before the markup is captured below.
    parent.css('script').remove

    id_attr    = parent.attributes['id']
    class_attr = parent.attributes['class']
    key  = parent.name
    key += "##{id_attr.value}" if id_attr
    key += ".#{class_attr.value}" if class_attr

    # NOTE(review): parents sharing the same tag/id/class signature collapse
    # onto one key, so only the first such parent is considered -- confirm
    # this is intended.
    candidates[key] ||= parent.inner_html
  end

  # Sanitize each candidate exactly once and remember the winning length,
  # instead of re-sanitizing the current winner on every comparison.
  @content = ''
  longest  = 0
  candidates.each_value do |fragment|
    length = Sanitize.clean(fragment).length
    next unless length > longest

    @content = fragment
    longest  = length
  end
  @content
end

#doc ⇒ Object



27
28
29
# File 'lib/readable/webpage.rb', line 27

# Parsed document for this page. The markup returned by html is handed to
# Nokogiri::HTML on first use and the result is cached in @doc.
def doc
  return @doc if @doc

  @doc = Nokogiri::HTML(html)
end

#html ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/readable/webpage.rb', line 12

# Fetches the page at @url and returns its markup, transcoded to UTF-8 when
# the document declares another charset.
#
# The charset is sniffed from the body itself (e.g. <meta charset="gbk"> or
# content="text/html; charset=gbk"). When no charset is declared, the label
# is not a known encoding, or it is already UTF-8, the body is returned
# exactly as received.
#
# @return [String, nil] the page markup (whatever the response body is)
# @raise [EncodingError] when the body cannot be converted from the declared
#   charset; network errors from Net::HTTP propagate as well
def html
  uri = URI.parse(@url)
  # TLS has to be requested explicitly, otherwise https:// URLs are spoken
  # to in plain HTTP on port 443.
  response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
    http.get(uri.request_uri)
  end

  body = response.body
  # Accepts quoted and unquoted values, so both the HTML5 form
  # (charset="utf-8") and the http-equiv form (...; charset=utf-8") match.
  declared = body && body[/charset=["']?([\w.:-]+)/i, 1]
  return body if declared.nil?

  source = begin
    Encoding.find(declared)
  rescue ArgumentError
    nil # unknown charset label: fall back to the raw body
  end
  return body if source.nil? || source == Encoding::UTF_8

  # encode(dst, src) reinterprets the bytes as +source+ without mutating
  # the response body.
  body.encode(Encoding::UTF_8, source)
end

#title ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/readable/webpage.rb', line 31

# Returns the page title, memoized in @title.
#
# Every rule in Rule.rules is asked first; the first non-blank answer wins.
# Otherwise the text of the first h1..h5 heading (checked in that order)
# that is not blank is used.
#
# @return [String, nil] the title, or nil when neither a rule nor a heading
#   yields non-blank text
def title
  # Guard on @title (not @content) so a previously computed body is never
  # handed back as the title.
  return @title unless @title.nil?

  Rule.rules.each do |rule|
    @title = rule.title(doc)
    return @title unless @title.nil? || @title.strip.empty?
  end

  %w(h1 h2 h3 h4 h5).each do |tag|
    headings = doc.css(tag)
    next if headings.empty?

    @title = headings.first.text
    return @title unless @title.nil? || @title.strip.empty?
  end

  # Nothing usable was found: drop any blank leftover so it is not memoized,
  # and return nil rather than the tag list that #each evaluates to.
  @title = nil
end