Class: Sumitup::Parser
- Inherits:
-
Object
- Object
- Sumitup::Parser
- Defined in:
- lib/sumitup/parser.rb
Instance Attribute Summary collapse
-
#attributes ⇒ Object
Returns the value of attribute attributes.
-
#elements ⇒ Object
Returns the value of attribute elements.
-
#image_count ⇒ Object
Returns the value of attribute image_count.
-
#image_width_limit ⇒ Object
Returns the value of attribute image_width_limit.
-
#max_images ⇒ Object
Returns the value of attribute max_images.
-
#max_words ⇒ Object
Returns the value of attribute max_words.
-
#min_image_size ⇒ Object
Returns the value of attribute min_image_size.
-
#omission ⇒ Object
Returns the value of attribute omission.
-
#protocols ⇒ Object
Returns the value of attribute protocols.
-
#remove_contents ⇒ Object
Returns the value of attribute remove_contents.
-
#word_count ⇒ Object
Returns the value of attribute word_count.
Instance Method Summary collapse
- #image_height(existing_height, existing_width, image_width_limit) ⇒ Object
- #image_transformer ⇒ Object
-
#initialize(options = {}) ⇒ Parser
constructor
A new instance of Parser.
- #is_blank?(text) ⇒ Boolean
- #request_image_size(image_url) ⇒ Object
-
#snippet(text, max) ⇒ Object
Truncates text at a word boundary. Parameters: text - The text to truncate; max - The maximum number of words to keep.
-
#summarize(html, max = nil) ⇒ Object
Removes HTML and generates a summary.
- #summarize_fragment(node, max = nil) ⇒ Object
- #summarize_node(node, max = nil) ⇒ Object
- #word_transformer ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Parser
Returns a new instance of Parser.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/sumitup/parser.rb', line 9
#
# Builds a parser. Every setting is read from +options+ and falls back to the
# default shown below when the key is absent (or its value is nil/false).
#
# options - Hash of settings (all optional):
#           :omission          - stored as-is (default: '')
#           :max_words         - default word limit for a summary (default: 100)
#           :min_image_size    - images must be wider than this to be kept (default: 40)
#           :image_width_limit - width assigned to kept images (default: 230)
#           :max_images        - maximum number of images kept (default: 1)
#           :elements          - element names handed to Sanitize as :elements
#           :attributes        - per-element attribute names handed to Sanitize as :attributes
#           :protocols         - per-attribute protocols handed to Sanitize as :protocols
#           :remove_contents   - element names handed to Sanitize as :remove_contents
def initialize(options = {})
  self.omission = options[:omission] || ''

  # Running counters; summarize_fragment resets them before every run.
  self.word_count = 0
  self.image_count = 0

  self.max_words = options[:max_words] || 100
  self.min_image_size = options[:min_image_size] || 40
  self.image_width_limit = options[:image_width_limit] || 230
  self.max_images = options[:max_images] || 1

  # White listed elements
  self.elements = options[:elements] || %w(
    a abbr b blockquote cite code dfn em i kbd mark q samp small s strike strong sub sup time u var
    br dd dl dt li ol p pre ul img span
  )

  self.attributes = options[:attributes] || {
    'a'          => ['href', 'title'],
    'blockquote' => ['cite'],
    'img'        => ['alt', 'src', 'title', 'width', 'height']
  }

  self.protocols = options[:protocols] || {
    'a' => { 'href' => ['http', 'https', 'mailto'] }
  }

  self.remove_contents = options[:remove_contents] || %w(
    style script
  )
end
Instance Attribute Details
#attributes ⇒ Object
Returns the value of attribute attributes.
6 7 8 |
# Returns @attributes: the hash of element name => permitted attribute names
# that summarize_fragment passes to Sanitize.clean_node! as :attributes.
# File 'lib/sumitup/parser.rb', line 6 def attributes @attributes end |
#elements ⇒ Object
Returns the value of attribute elements.
6 7 8 |
# Returns @elements: the list of white-listed element names that
# summarize_fragment passes to Sanitize.clean_node! as :elements.
# File 'lib/sumitup/parser.rb', line 6 def elements @elements end |
#image_count ⇒ Object
Returns the value of attribute image_count.
5 6 7 |
# Returns @image_count: the number of images kept so far. It is reset to 0 by
# summarize_fragment and incremented by image_transformer for each kept image.
# File 'lib/sumitup/parser.rb', line 5 def image_count @image_count end |
#image_width_limit ⇒ Object
Returns the value of attribute image_width_limit.
5 6 7 |
# Returns @image_width_limit: the width image_transformer assigns to every kept
# image and the target width used when scaling its height (default 230).
# File 'lib/sumitup/parser.rb', line 5 def image_width_limit @image_width_limit end |
#max_images ⇒ Object
Returns the value of attribute max_images.
5 6 7 |
# Returns @max_images: the maximum number of images image_transformer keeps
# before it starts removing further <img> nodes (default 1).
# File 'lib/sumitup/parser.rb', line 5 def max_images @max_images end |
#max_words ⇒ Object
Returns the value of attribute max_words.
4 5 6 |
# Returns @max_words: the word limit summarize_node falls back to when it is
# called without an explicit max (default 100).
# File 'lib/sumitup/parser.rb', line 4 def max_words @max_words end |
#min_image_size ⇒ Object
Returns the value of attribute min_image_size.
5 6 7 |
# Returns @min_image_size: image_transformer keeps an image only when its width
# is greater than this value (default 40).
# File 'lib/sumitup/parser.rb', line 5 def min_image_size @min_image_size end |
#omission ⇒ Object
Returns the value of attribute omission.
7 8 9 |
# Returns @omission, set by the constructor from the :omission option
# (default ''). NOTE(review): no method shown on this page reads it.
# File 'lib/sumitup/parser.rb', line 7 def omission @omission end |
#protocols ⇒ Object
Returns the value of attribute protocols.
6 7 8 |
# Returns @protocols: the hash of element => attribute => allowed protocols
# that summarize_fragment passes to Sanitize.clean_node! as :protocols.
# File 'lib/sumitup/parser.rb', line 6 def protocols @protocols end |
#remove_contents ⇒ Object
Returns the value of attribute remove_contents.
6 7 8 |
# Returns @remove_contents: the element names (default: style, script) that
# summarize_fragment passes to Sanitize.clean_node! as :remove_contents.
# File 'lib/sumitup/parser.rb', line 6 def remove_contents @remove_contents end |
#word_count ⇒ Object
Returns the value of attribute word_count.
4 5 6 |
# Returns @word_count: the number of words kept so far. It is reset to 0 by
# summarize_fragment and increased by summarize_node for each kept text node.
# File 'lib/sumitup/parser.rb', line 4 def word_count @word_count end |
Instance Method Details
#image_height(existing_height, existing_width, image_width_limit) ⇒ Object
127 128 129 130 131 132 133 134 135 |
# File 'lib/sumitup/parser.rb', line 127
#
# Computes the height an image gets once its width is scaled to
# +image_width_limit+, keeping the aspect ratio.
#
# existing_height   - Current height, or nil when unknown.
# existing_width    - Current width, or nil when unknown.
# image_width_limit - Target width.
#
# Returns the scaled height as an Integer (fractional part truncated).
def image_height(existing_height, existing_width, image_width_limit)
  # Scaling to a zero width leaves no height; this also avoids 0.0 / 0.0 (NaN) below.
  return 0 if image_width_limit.to_f.zero?

  # A missing or zero width cannot serve as a divisor, so assume the image
  # already has the target width (ratio 1).
  existing_width = image_width_limit if existing_width.nil? || existing_width.to_f.zero?

  # With no height to go on, treat the image as square.
  existing_height ||= existing_width

  ratio = image_width_limit.to_f / existing_width.to_f
  (existing_height.to_f * ratio).to_i
end
#image_transformer ⇒ Object
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
# File 'lib/sumitup/parser.rb', line 160
#
# Builds the transformer that limits, filters and resizes <img> nodes.
#
# The returned lambda takes the transformer +env+ hash (reads :node and
# :node_name) and, for img nodes only:
#   * removes the node once max_images images have already been kept;
#   * reads width/height from the node's attributes, asking
#     request_image_size(src) when either is missing;
#   * removes the node unless its width is greater than min_image_size;
#   * otherwise counts it and rewrites its width to image_width_limit and its
#     height to the proportionally scaled value.
#
# Returns a lambda that always evaluates to nil.
def image_transformer
  me = self
  lambda do |env|
    next unless env[:node_name] == 'img'

    node = env[:node]

    # Keeping this image would take us over the limit.
    if (me.image_count + 1) > me.max_images
      node.remove
      next
    end

    width  = node['width'] && node['width'].to_i
    height = node['height'] && node['height'].to_i

    if width.nil? || height.nil?
      # node['src'] is the attribute's string value; the attribute object
      # itself cannot be opened as a URL.
      src = node['src']
      unless src.nil? || src.empty?
        fetched_width, fetched_height =
          begin
            me.request_image_size(src)
          rescue StandardError
            # Best effort: an unreachable or unreadable image just means we
            # keep whatever the markup told us.
            [nil, nil]
          end
        # Only replace the declared dimensions when the lookup gave a full pair.
        width, height = fetched_width, fetched_height if fetched_width && fetched_height
      end
    end

    if width.to_i > me.min_image_size
      me.image_count += 1
      node['height'] = me.image_height(height, width, me.image_width_limit).to_s
      node['width'] = me.image_width_limit.to_s
    else
      node.remove
    end

    nil
  end
end
#is_blank?(text) ⇒ Boolean
111 112 113 |
# File 'lib/sumitup/parser.rb', line 111
#
# Tells whether +text+ is nil or empty. Whitespace-only text is NOT
# considered blank.
#
# Returns true or false.
def is_blank?(text)
  return true if text.nil?

  text.empty?
end
#request_image_size(image_url) ⇒ Object
115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/sumitup/parser.rb', line 115
#
# Downloads the image at +image_url+ and reads its pixel dimensions.
#
# image_url - The image location; only http:// and https:// URLs are fetched.
#
# Returns [width, height]; [nil, nil] when the URL is not an http(s) URL.
# Network and decoding errors propagate to the caller.
def request_image_size(image_url)
  url = image_url.to_s

  # The URL comes from the HTML being summarized, i.e. untrusted input.
  # Kernel#open would run "|command" strings and read local paths, so only
  # real web URLs are accepted and they are opened through URI.open.
  return [nil, nil] unless url =~ %r{\Ahttps?://}i

  require 'open-uri' # idempotent; guarantees URI.open is available

  width = nil
  height = nil
  URI.open(url, 'rb') do |f|
    # NOTE(review): Dimensions() is defined outside this file (presumably the
    # `dimensions` gem); it is used here exactly as the original code did.
    img = Dimensions(f)
    img.read
    width = img.width
    height = img.height
  end
  [width, height]
end
#snippet(text, max) ⇒ Object
Truncates text at a word boundary. Parameters:
text - The text to truncate
max - The maximum number of words to keep
98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/sumitup/parser.rb', line 98
#
# Truncates text at a word boundary.
#
# text - The text to truncate (nil is treated as empty).
# max  - The maximum number of words to keep; values below 1 keep nothing.
#
# Words are whitespace-separated and re-joined with single spaces.
#
# Returns [snippet, count]: the truncated String (never nil; '' when no word
# is kept) and the number of words it contains.
def snippet(text, max)
  # TODO figure out support for pre that contains code blocks..
  limit = [max.to_i, 0].max
  words = text.to_s.split.first(limit)
  [words.join(' '), words.size]
end
#summarize(html, max = nil) ⇒ Object
Removes HTML and generates a summary.
44 45 46 47 48 |
# File 'lib/sumitup/parser.rb', line 44
#
# Sanitizes an HTML string and shortens it to a summary.
#
# html - The HTML source; nil or '' yields ''.
# max  - Optional word limit, forwarded to summarize_fragment.
#
# Returns the summary as an HTML String.
def summarize(html, max = nil)
  if is_blank?(html)
    ''
  else
    # Parse a copy so the caller's string is never handed to the parser.
    fragment = Nokogiri::HTML::DocumentFragment.parse(html.dup)
    summary = summarize_fragment(fragment, max)
    summary.to_html
  end
end
#summarize_fragment(node, max = nil) ⇒ Object
50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/sumitup/parser.rb', line 50
#
# Sanitizes +node+ in place with the configured white lists and transformers,
# then trims the result via summarize_node.
#
# node - The parsed fragment/node to clean and summarize.
# max  - Optional word limit, forwarded to summarize_node.
#
# Returns whatever summarize_node returns for the cleaned node.
def summarize_fragment(node, max = nil)
  # Every run starts counting from scratch.
  self.image_count = 0
  self.word_count = 0

  transformers = [word_transformer, image_transformer]
  cleaned = Sanitize.clean_node!(
    node,
    elements: elements,
    attributes: attributes,
    protocols: protocols,
    remove_contents: remove_contents,
    transformers: transformers
  )

  summarize_node(cleaned, max)
end
#summarize_node(node, max = nil) ⇒ Object
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/sumitup/parser.rb', line 63
#
# Recursively trims +node+ so the text kept across the whole tree stays within
# +max+ words. Children are handled before the node itself.
#
# node - The node to trim (modified in place).
# max  - Word limit; defaults to max_words.
#
# Text nodes are truncated with snippet, or removed when the limit is already
# exceeded or nothing of them can be kept. Non-text nodes left with neither
# text nor children are removed, except img and br.
#
# Returns node.
def summarize_node(node, max = nil)
  max ||= self.max_words

  # summarize all children of the node
  node.children.each { |child| summarize_node(child, max) }

  unless node.text?
    # Remove empty nodes
    hollow = node.text.empty? && node.children.empty?
    node.remove if hollow && !['img', 'br'].include?(node.name)
    return node
  end

  if self.word_count > max
    node.remove
    return node
  end

  # if the text of the current node makes us go over then truncate it
  remaining = max - self.word_count
  result, count = snippet(node.inner_text, remaining)
  if count == 0 || is_blank?(result)
    node.remove
  else
    self.word_count += count
    node.content = result
  end

  node
end
#word_transformer ⇒ Object
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/sumitup/parser.rb', line 137
#
# Builds the transformer that drops elements which contribute nothing visible.
#
# The returned lambda takes the transformer +env+ hash (reads :node and
# :node_name). For element nodes it removes the node when its style attribute
# matches display:none, or when it has neither text nor children and is not
# an img or br.
#
# Returns a lambda that always evaluates to nil.
def word_transformer
  lambda do |env|
    element = env[:node]
    tag = env[:node_name]
    next unless element.element?

    inline_style = element['style']
    if inline_style && inline_style =~ /display\s*:\s*none/
      # Remove nodes with display none
      element.remove
    elsif element.text.empty? && element.children.empty? && !['img', 'br'].include?(tag)
      # Remove empty nodes
      element.remove
    end

    nil
  end
end