Class: MechanizeContent

Inherits:
Object
  • Object
show all
Defined in:
lib/mechanize-content.rb,
lib/mechanize-content/util.rb

Defined Under Namespace

Classes: Util

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ MechanizeContent

Returns a new instance of MechanizeContent.



11
12
13
# File 'lib/mechanize-content.rb', line 11

# Builds an extractor for one or more URLs.
#
# URLs may be given as separate arguments or as a single Array; nested
# arrays are flattened so that @urls is always a flat list. Previously
# +new(["a", "b"])+ stored a one-element list containing the Array itself.
#
# @param args [Array] URLs (or arrays of URLs), in the order they should be tried
def initialize(*args)
  @urls = args.flatten
end

Instance Attribute Details

#urls ⇒ Object

Returns the value of attribute urls.



9
10
11
# File 'lib/mechanize-content.rb', line 9

# Reader for the @urls instance variable (assigned by the constructor
# from its arguments).
#
# @return [Object] whatever is currently stored in @urls
def urls
  @urls
end

Instance Method Details

#best_image ⇒ Object



23
24
25
# File 'lib/mechanize-content.rb', line 23

# Returns the cached @best_image when it is set (truthy); otherwise
# delegates to #fetch_images and returns whatever that call yields.
#
# @return [Object, nil] the cached value or the result of #fetch_images
def best_image
  return @best_image if @best_image

  fetch_images
end

#best_text ⇒ Object



19
20
21
# File 'lib/mechanize-content.rb', line 19

# Returns the cached @best_text when it is set (truthy); otherwise
# delegates to #fetch_texts and returns whatever that call yields.
#
# @return [Object, nil] the cached value or the result of #fetch_texts
def best_text
  return @best_text if @best_text

  fetch_texts
end

#best_title ⇒ Object



15
16
17
# File 'lib/mechanize-content.rb', line 15

# Returns the cached @best_title when it is set (truthy); otherwise
# delegates to #fetch_titles and returns whatever that call yields.
#
# @return [Object, nil] the cached value or the result of #fetch_titles
def best_title
  return @best_title if @best_title

  fetch_titles
end

#fetch_content(page) ⇒ Object



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/mechanize-content.rb', line 103

# Picks the element most likely to hold the page's main content.
#
# Every <p> contributes to a score kept per parent element:
# * -50 when the parent's class (and again for its id) looks like
#   comment/meta/footer/footnote, otherwise +25 when it looks like a
#   post/entry/article container;
# * +1 when the paragraph text is longer than 10 characters;
# * +1 per comma in the paragraph text.
#
# The highest-scoring parent is returned after its script, iframe, h1 and
# h2 descendants are unlinked (this mutates the parsed document).
#
# @param page [#parser] object whose #parser responds to #css
#   (presumably a Mechanize::Page backed by Nokogiri — confirm)
# @return [Object, nil] the winning parent node; nil when the page has no
#   paragraphs or the winning node contains a "get Flash Player" link
def fetch_content(page)
  negative = /(comment|meta|footer|footnote)/
  positive = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\s|$))/
  scores = {}

  page.parser.css('p').each do |paragraph|
    parent = paragraph.parent
    # Register the parent even if it ends up with a zero score.
    scores[parent] ||= 0

    # The class and id attributes are scored independently with the same rules.
    [parent['class'], parent['id']].each do |label|
      label ||= ''
      if label =~ negative
        scores[parent] -= 50
      elsif label =~ positive
        scores[parent] += 25
      end
    end

    text = paragraph.inner_text
    scores[parent] += 1 if text.length > 10

    # NOTE(review): this stops scoring ALL remaining paragraphs as soon as a
    # parent's first attribute mentions "comment". Kept as in the original;
    # confirm whether `next` was intended.
    first_attribute = parent.attributes.values.first
    break if first_attribute && first_attribute.value.include?('comment')

    scores[parent] += text.count(',')
  end

  winner = scores.max_by { |_node, score| score }
  return nil if winner.nil?

  top_result = winner.first
  # A leading "." scopes the query to top_result. The previous "//a[...]"
  # form is an absolute path, so a Flash link anywhere in the document
  # discarded the result.
  return nil unless top_result.xpath(".//a[@href='http://get.adobe.com/flashplayer/']").empty?

  %w[script iframe h1 h2].each { |tag| top_result.css(tag).unlink }
  top_result
end

#fetch_image(page) ⇒ Object



151
152
153
154
155
156
157
158
# File 'lib/mechanize-content.rb', line 151

# Finds the best image inside the main content of +page+.
#
# Runs #fetch_content first; when that yields nothing, there is nothing to
# search and nil is returned. Otherwise the content's <img> elements and the
# base URL computed by Util.get_base_url are handed to #find_best_image.
#
# @param page [Object] page responding to #parser and #uri
# @return [Object, nil] the result of #find_best_image, or nil without content
def fetch_image(page)
  top_content = fetch_content(page)
  return nil unless top_content

  candidates = top_content.css('img')
  base_url = Util.get_base_url(page.parser, page.uri)
  find_best_image(candidates, base_url)
end

#fetch_images ⇒ Object



27
28
29
30
31
32
33
# File 'lib/mechanize-content.rb', line 27

# Walks the fetched pages (loading them via #fetch_pages when @pages is not
# set yet) and returns the first non-nil result of #fetch_image, caching it
# in @best_image.
#
# @return [Object, nil] the first image found, or nil when no page yields one
def fetch_images
  candidates = @pages || fetch_pages
  candidates.each do |candidate|
    found = fetch_image(candidate)
    next if found.nil?

    @best_image = found
    return found
  end
  nil
end

#fetch_page(url) ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/mechanize-content.rb', line 65

# Fetches +url+ with the shared agent (created on first use by #init_agent).
#
# Failures are reported on stdout and swallowed: every rescue branch ends
# with +puts+, whose nil return value becomes the method's result.
#
# Messages use interpolation rather than String#+ so that a non-String url
# (for example a URI object) cannot raise TypeError from inside a handler.
#
# @param url [#to_s] address to fetch
# @return [Mechanize::Page, nil] the page, or nil when the response is not a
#   Mechanize::Page (or subclass) or the request failed
def fetch_page(url)
  page = (@agent || init_agent).get(url)
  page.is_a?(Mechanize::Page) ? page : nil
rescue Timeout::Error
  puts "Timeout - #{url}"
rescue Errno::ECONNRESET
  puts "Connection reset by peer - #{url}"
rescue Mechanize::ResponseCodeError
  puts "Invalid url"
rescue Mechanize::UnsupportedSchemeError
  puts "Unsupported Scheme"
rescue StandardError
  puts "There was a problem connecting - #{url}"
end

#fetch_pages ⇒ Object



56
57
58
59
60
61
62
63
# File 'lib/mechanize-content.rb', line 56

# Resets @pages and fills it with the result of #fetch_page for every entry
# of @urls, skipping URLs for which #fetch_page returned nil.
#
# @return [Array] the freshly populated @pages array
def fetch_pages
  @pages = []
  @urls.each_with_object(@pages) do |address, collected|
    fetched = fetch_page(address)
    collected.push(fetched) unless fetched.nil?
  end
end

#fetch_text(page) ⇒ Object



92
93
94
95
96
97
98
99
100
101
# File 'lib/mechanize-content.rb', line 92

# Extracts the plain text of the page's main content.
#
# Invalid byte sequences are dropped with String#scrub, then tabs and
# newlines are removed and surrounding whitespace is stripped. This replaces
# the former Iconv('UTF-8//IGNORE') round trip: Iconv was removed from the
# standard library in Ruby 2.0. Scrubbing runs first so that String#delete
# never sees an invalid byte sequence.
#
# NOTE(review): assumes the text is tagged UTF-8 (scrub validates against the
# string's own encoding) — confirm for the parser in use.
#
# @param page [Object] page accepted by #fetch_content
# @return [String, nil] cleaned text, or nil when #fetch_content finds nothing
def fetch_text(page)
  top_content = fetch_content(page)
  return nil unless top_content

  top_content.text.scrub('').delete("\t\n").strip
end

#fetch_texts ⇒ Object



35
36
37
38
39
40
41
# File 'lib/mechanize-content.rb', line 35

# Walks the fetched pages (loading them via #fetch_pages when @pages is not
# set yet) and returns the first #fetch_text result that is neither nil nor
# empty, caching it in @best_text.
#
# @return [Object, nil] the first usable text, or nil when no page yields one
def fetch_texts
  candidates = @pages || fetch_pages
  candidates.each do |candidate|
    extracted = fetch_text(candidate)
    next if extracted.nil? || extracted.empty?

    @best_text = extracted
    return extracted
  end
  nil
end

#fetch_titles ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/mechanize-content.rb', line 43

# Returns the title of the first fetched page that has one, with invalid
# byte sequences removed, and caches it in @best_title. Pages are loaded via
# #fetch_pages when @pages is not set yet.
#
# When no page has a title the first URL is returned instead (and is not
# cached).
#
# String#scrub replaces the former Iconv('UTF-8//IGNORE') round trip: Iconv
# was removed from the standard library in Ruby 2.0.
#
# @return [String, Object, nil] the cleaned title, or @urls.first as fallback
def fetch_titles
  (@pages || fetch_pages).each do |page|
    title = page.title
    next if title.nil?

    return @best_title = title.scrub('')
  end
  @urls.first
end

#find_best_image(all_images, url) ⇒ Object



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/mechanize-content.rb', line 160

# Chooses the first acceptable image among +all_images+.
#
# Pass 1 trusts the width/height attributes and needs no download. Pass 2
# downloads each image and measures it with ImageSize. In both passes
# Util.valid_image? decides whether an image qualifies.
#
# Changes from the earlier version:
# * images without a src attribute are skipped instead of being passed on
#   as nil;
# * a failing download is reported and the search continues with the next
#   image instead of aborting the whole lookup;
# * messages use interpolation, so a nil src cannot raise TypeError inside
#   a rescue handler;
# * URI.open is used because a bare +open+ no longer fetches URLs on
#   Ruby 3.0+. Requires open-uri to be loaded (presumably already required
#   by this file, since the old code relied on it — confirm).
#
# NOTE(review): URI.open still falls back to Kernel#open for strings that
# are not URLs, so src values must keep passing through
# Util.build_absolute_url before reaching it.
#
# @param all_images [Enumerable] image elements responding to #[] for
#   'src', 'width' and 'height'
# @param url [Object] base URL handed to Util.build_absolute_url
# @return [Object, nil] absolute URL of the chosen image, or nil when none
#   qualifies or an unexpected error occurs
def find_best_image(all_images, url)
  current_src = nil

  # Pass 1: declared dimensions only.
  all_images.each do |img|
    current_src = img['src']
    next if current_src.nil?

    if Util.valid_image?(img['width'].to_i, img['height'].to_i, current_src)
      return Util.build_absolute_url(current_src, url)
    end
  end

  # Pass 2: download and measure; one bad image must not stop the search.
  all_images.each do |img|
    current_src = img['src']
    next if current_src.nil?

    begin
      current_src = Util.build_absolute_url(current_src, url)
      URI.open(current_src, 'rb') do |fh|
        size = ImageSize.new(fh.read)
        return current_src if Util.valid_image?(size.width, size.height, current_src)
      end
    rescue Errno::ENOENT
      puts "No such file - #{current_src}"
    rescue StandardError
      puts "There was a problem connecting - #{current_src}"
    end
  end

  nil
rescue Errno::ENOENT
  puts "No such file - #{current_src}"
rescue StandardError
  puts "There was a problem connecting - #{current_src}"
end

#init_agent ⇒ Object



86
87
88
89
90
# File 'lib/mechanize-content.rb', line 86

# Creates a Mechanize agent that identifies itself with the 'Mac Safari'
# user-agent alias, stores it in @agent and returns it.
#
# @return [Mechanize] the newly created agent
def init_agent
  @agent = Mechanize.new.tap do |browser|
    browser.user_agent_alias = 'Mac Safari'
  end
end