Class: Sumitup::Parser
- Inherits:
-
Object
- Object
- Sumitup::Parser
- Defined in:
- lib/sumitup/parser.rb
Instance Attribute Summary collapse
-
#attributes ⇒ Object
Returns the value of attribute attributes.
-
#elements ⇒ Object
Returns the value of attribute elements.
-
#image_count ⇒ Object
Returns the value of attribute image_count.
-
#image_width_limit ⇒ Object
Returns the value of attribute image_width_limit.
-
#max_images ⇒ Object
Returns the value of attribute max_images.
-
#max_words ⇒ Object
Returns the value of attribute max_words.
-
#min_image_size ⇒ Object
Returns the value of attribute min_image_size.
-
#omission ⇒ Object
Returns the value of attribute omission.
-
#protocols ⇒ Object
Returns the value of attribute protocols.
-
#remove_contents ⇒ Object
Returns the value of attribute remove_contents.
-
#word_count ⇒ Object
Returns the value of attribute word_count.
Instance Method Summary collapse
- #image_transformer ⇒ Object
-
#initialize(options = {}) ⇒ Parser
constructor
A new instance of Parser.
- #is_blank?(text) ⇒ Boolean
-
#snippet(text, max) ⇒ Object
Truncates text at a word boundary. Parameters: text - The text to truncate; max - The maximum number of words to keep.
-
#summarize(html, max = nil) ⇒ Object
Removes HTML and generates a summary.
- #summarize_fragment(node, max = nil) ⇒ Object
- #summarize_node(node, max = nil) ⇒ Object
- #word_transformer ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Parser
Returns a new instance of Parser.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/sumitup/parser.rb', line 9 def initialize( = {}) self.omission = [:omission] || '' self.word_count = 0 self.max_words = [:max_words] || 100 self.image_count = 0 self.min_image_size = [:min_image_size] || 40 self.image_width_limit = [:image_width_limit] || 230 self.max_images = [:max_images] || 1 # White listed elements self.elements = [:elements] || %w( a abbr b blockquote cite code dfn em i kbd mark q samp small s strike strong sub sup time u var br dd dl dt li ol p pre ul img span ) self.attributes = [:attributes] || { 'a' => ['href', 'title'], 'blockquote' => ['cite'], 'img' => ['alt', 'src', 'title', 'width', 'height'] } self.protocols = [:protocols] || { 'a' => {'href' => ['http', 'https', 'mailto']} } self.remove_contents = [:remove_contents] || %w( style script ) end |
Instance Attribute Details
#attributes ⇒ Object
Returns the value of attribute attributes.
6 7 8 |
# File 'lib/sumitup/parser.rb', line 6
#
# Returns the value of attribute attributes (the @attributes instance
# variable, passed to Sanitize as its :attributes setting).
def attributes
  @attributes
end
#elements ⇒ Object
Returns the value of attribute elements.
6 7 8 |
# File 'lib/sumitup/parser.rb', line 6
#
# Returns the value of attribute elements (the @elements instance variable,
# passed to Sanitize as its :elements setting).
def elements
  @elements
end
#image_count ⇒ Object
Returns the value of attribute image_count.
5 6 7 |
# File 'lib/sumitup/parser.rb', line 5
#
# Returns the value of attribute image_count (the @image_count instance
# variable, the running number of images kept so far).
def image_count
  @image_count
end
#image_width_limit ⇒ Object
Returns the value of attribute image_width_limit.
5 6 7 |
# File 'lib/sumitup/parser.rb', line 5
#
# Returns the value of attribute image_width_limit (the @image_width_limit
# instance variable).
def image_width_limit
  @image_width_limit
end
#max_images ⇒ Object
Returns the value of attribute max_images.
5 6 7 |
# File 'lib/sumitup/parser.rb', line 5
#
# Returns the value of attribute max_images (the @max_images instance
# variable).
def max_images
  @max_images
end
#max_words ⇒ Object
Returns the value of attribute max_words.
4 5 6 |
# File 'lib/sumitup/parser.rb', line 4
#
# Returns the value of attribute max_words (the @max_words instance
# variable).
def max_words
  @max_words
end
#min_image_size ⇒ Object
Returns the value of attribute min_image_size.
5 6 7 |
# File 'lib/sumitup/parser.rb', line 5
#
# Returns the value of attribute min_image_size (the @min_image_size
# instance variable).
def min_image_size
  @min_image_size
end
#omission ⇒ Object
Returns the value of attribute omission.
7 8 9 |
# File 'lib/sumitup/parser.rb', line 7
#
# Returns the value of attribute omission (the @omission instance variable).
def omission
  @omission
end
#protocols ⇒ Object
Returns the value of attribute protocols.
6 7 8 |
# File 'lib/sumitup/parser.rb', line 6
#
# Returns the value of attribute protocols (the @protocols instance
# variable, passed to Sanitize as its :protocols setting).
def protocols
  @protocols
end
#remove_contents ⇒ Object
Returns the value of attribute remove_contents.
6 7 8 |
# File 'lib/sumitup/parser.rb', line 6
#
# Returns the value of attribute remove_contents (the @remove_contents
# instance variable, passed to Sanitize as its :remove_contents setting).
def remove_contents
  @remove_contents
end
#word_count ⇒ Object
Returns the value of attribute word_count.
4 5 6 |
# File 'lib/sumitup/parser.rb', line 4
#
# Returns the value of attribute word_count (the @word_count instance
# variable, the running number of words kept so far).
def word_count
  @word_count
end
Instance Method Details
#image_transformer ⇒ Object
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/sumitup/parser.rb', line 134
#
# Builds the transformer lambda that limits and resizes images.
#
# The lambda receives the transformer +env+ hash and reads env[:node] and
# env[:node_name]. Nodes that are not 'img' are left untouched. For an img:
#   * it is removed when keeping it would push image_count past max_images;
#   * it is removed when it has a width attribute whose integer value is not
#     greater than min_image_size;
#   * otherwise it is kept and image_count is incremented. When it has no
#     width attribute, or its width exceeds image_width_limit, the width is
#     set to image_width_limit and any height attribute is removed.
#
# The lambda always returns nil.
def image_transformer
  me = self
  lambda do |env|
    next unless env[:node_name] == 'img'

    node = env[:node]

    # Keeping this image would add one to the count, so check the limit first.
    if me.image_count + 1 > me.max_images
      node.remove
      next
    end

    width_attr = node.attributes['width']
    # nil means "no width attribute"; a non-numeric value becomes 0 via to_i.
    width = width_attr && width_attr.value.to_i

    # Images with a declared width at or below the minimum are dropped.
    if width && width <= me.min_image_size
      node.remove
      next
    end

    me.image_count += 1
    if width.nil? || width > me.image_width_limit
      node['width'] = me.image_width_limit.to_s
      # The old height no longer matches the forced width, so drop it.
      height_attr = node.attributes['height']
      height_attr.remove if height_attr
    end

    nil
  end
end
#is_blank?(text) ⇒ Boolean
107 108 109 |
# File 'lib/sumitup/parser.rb', line 107
#
# Tells whether +text+ is nil or empty. Whitespace-only text is NOT
# considered blank by this check.
#
# Returns true or false.
def is_blank?(text)
  text.nil? || text.empty?
end
#snippet(text, max) ⇒ Object
Truncates text at a word boundary. Parameters:
text - The text to truncate
max - The maximum number of words to keep
95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/sumitup/parser.rb', line 95
#
# Truncates text at a word boundary.
#
# Parameters:
#   text - The text to truncate (nil is treated as empty)
#   max  - The maximum number of words to keep
#
# Returns a two-element array [result, count]: the kept words joined by
# single spaces, and how many words were kept. The result is always a
# String; when nothing is kept it is '' (never nil), which also covers
# whitespace-only text and a +max+ of zero or less.
def snippet(text, max)
  # A negative limit keeps nothing; Array#first would raise on it.
  limit = [max, 0].max
  words = text.to_s.split.first(limit)
  [words.join(' '), words.size]
end
#summarize(html, max = nil) ⇒ Object
Removes HTML and generates a summary
44 45 46 47 48 |
# File 'lib/sumitup/parser.rb', line 44
#
# Removes html and generates a summary.
#
# Parameters:
#   html - The HTML string to summarize
#   max  - Word limit handed on to summarize_fragment (nil by default)
#
# Returns '' when +html+ is nil or empty; otherwise the result of calling
# to_html on the summarized fragment.
def summarize(html, max = nil)
  return '' if is_blank?(html)

  # The counters are running totals; reset them so that reusing one parser
  # for several documents does not start with an already spent budget.
  self.word_count = 0
  self.image_count = 0

  # Parse a copy so the caller's string is never handed to the parser.
  unclean = Nokogiri::HTML::DocumentFragment.parse(html.dup)
  summarize_fragment(unclean, max).to_html
end
#summarize_fragment(node, max = nil) ⇒ Object
50 51 52 53 54 55 56 57 58 |
# File 'lib/sumitup/parser.rb', line 50
#
# Sanitizes +node+ in place with Sanitize.clean_node!, using this parser's
# elements, attributes, protocols and remove_contents settings plus the
# word and image transformers, then summarizes the cleaned node.
#
# Parameters:
#   node - The node to clean and summarize
#   max  - Word limit handed on to summarize_node (nil by default)
#
# Returns whatever summarize_node returns for the cleaned node.
def summarize_fragment(node, max = nil)
  clean = Sanitize.clean_node!(
    node,
    :elements => elements,
    :attributes => attributes,
    :protocols => protocols,
    :remove_contents => remove_contents,
    :transformers => [word_transformer, image_transformer]
  )
  summarize_node(clean, max)
end
#summarize_node(node, max = nil) ⇒ Object
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/sumitup/parser.rb', line 60
#
# Recursively trims +node+ so that the running word_count stays within +max+.
#
# Parameters:
#   node - The node to summarize; it is modified in place
#   max  - Word limit; falls back to max_words when nil
#
# Children are summarized first. A text node is removed when the budget is
# already exceeded or when no words of it can be kept; otherwise its content
# is replaced by the kept words and word_count grows by that many words. Any
# other node is removed when it has no text and no children, unless it is an
# img or br.
#
# Returns +node+.
def summarize_node(node, max = nil)
  max ||= max_words

  # summarize all children of the node
  node.children.each { |child| summarize_node(child, max) }

  if node.text?
    if word_count > max
      node.remove
    else
      # if the text of the current node makes us go over then truncate it
      result, count = snippet(node.inner_text, max - word_count)
      if count == 0 || is_blank?(result)
        node.remove
      else
        self.word_count += count
        node.content = result
      end
    end
  elsif node.text.empty? && node.children.empty? && !['img', 'br'].include?(node.name)
    # Remove empty nodes
    node.remove
  end

  node
end
#word_transformer ⇒ Object
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/sumitup/parser.rb', line 111
#
# Builds the transformer lambda that drops hidden and empty elements.
#
# The lambda receives the transformer +env+ hash and reads env[:node] and
# env[:node_name]. Non-element nodes are ignored. An element is removed when
# its style attribute matches "display: none", or when it has no text and no
# children and is not an img or br.
#
# The lambda always returns nil.
def word_transformer
  lambda do |env|
    node = env[:node]
    next unless node.element?

    style = node['style']
    # Remove nodes with display none
    hidden = style && style =~ /display\s*:\s*none/
    # Remove empty nodes
    if hidden || (node.text.empty? && node.children.empty? && !['img', 'br'].include?(env[:node_name]))
      node.remove
    end

    nil
  end
end