Module: Liquor::HTMLTruncater

Extended by:
HTMLTruncater
Included in:
HTMLTruncater
Defined in:
lib/liquor/stdlib/html_truncater.rb

Overview

Thanks to gist.github.com/101410 and other sources

Instance Method Summary collapse

Instance Method Details

#truncate(input, number = 300, truncate_string = "...") ⇒ Object


6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/liquor/stdlib/html_truncater.rb', line 6

# Truncates an HTML string to at most +number+ characters of text content
# while keeping the surrounding markup intact.
#
# The input is parsed with Nokogiri, the tree is walked in document order and
# the character lengths of the text nodes are summed. The text node that pushes
# the total past the limit is cut (or, when nothing of it may be kept, the
# previous text node is used instead), +truncate_string+ is appended to it, and
# every node that follows it in the document is removed.
#
# @param input [#to_s] HTML to truncate; converted with +to_s+ before parsing
# @param number [Integer] maximum number of text characters to keep;
#   negative values are treated as 0
# @param truncate_string [String] marker appended where the text was cut
# @return [String] inner HTML of the first child of the parsed document's root
#   (or of the document's first child when no root element exists)
def truncate(input, number = 300, truncate_string = "...")
  # A negative limit cannot be satisfied; treat it as "keep no text at all".
  number = 0 if number < 0

  doc = Nokogiri::HTML(input.to_s, nil, "UTF-8")

  current = doc.children.first
  # Nothing was parsed at all, so there is nothing to truncate or render.
  return "" unless current

  # Declared up front so it stays visible after the loop block and is a
  # well-defined nil when the very first text node already exceeds the limit.
  previous = nil
  count = 0

  loop do
    # we found a text node
    if current.is_a?(Nokogiri::XML::Text)
      count += current.text.mb_chars.length
      # we reached our limit, let's get outta here!
      break if count > number
      previous = current
    end

    if current.children.length > 0
      # this node has children, can't be a text node,
      # lets descend and look for text nodes
      current = current.children.first
    elsif !current.next.nil?
      # this has no children, but has a sibling, let's check it out
      current = current.next
    else
      # we are the last child, we need to ascend until we are
      # either done or find a sibling to continue on to
      n = current
      while !n.is_a?(Nokogiri::HTML::Document) && n.parent.next.nil?
        n = n.parent
      end

      # we've reached the top and found no more text nodes, break
      break if n.is_a?(Nokogiri::HTML::Document)

      current = n.parent.next
    end
  end

  if count >= number
    unless count == number
      # count > number only happens via the break above, so current is the
      # text node that overflowed the limit.
      new_content = current.text.mb_chars

      # Characters still allowed from this node, minus one to get the last index.
      index = number - (count - new_content.length) - 1
      # NOTE(review): native_content= is invoked through send, presumably because
      # it is not public in the Nokogiri version in use -- confirm.
      if index >= 0
        current.send(:native_content=, new_content[0..index] + truncate_string)
      elsif previous
        # Nothing of this node fits; end the output at the previous text node.
        current = previous
        current.send(:native_content=, current.content + truncate_string)
      else
        # Nothing fits and no earlier text node exists (limit of 0):
        # replace this node's text with just the truncation marker.
        current.send(:native_content=, truncate_string)
      end
    end

    # remove everything else: drop all following siblings at every level
    # on the way up to the document
    until current.is_a?(Nokogiri::HTML::Document)
      current.next.remove until current.next.nil?
      current = current.parent
    end
  end

  # now we grab the html and not the text.
  # we do first because nokogiri adds html and body tags
  # which we don't want
  root = doc.root
  if root
    root.children.first.inner_html
  else # no root element present (probably empty?), will return the first element
    doc.children.first.inner_html
  end
end

#truncate_words(input, num_words, truncate_string = "...") ⇒ Object


78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/liquor/stdlib/html_truncater.rb', line 78

# Truncates an HTML string to at most +num_words+ whitespace-separated words
# of text content while keeping the surrounding markup intact.
#
# The input is parsed with Nokogiri, the tree is walked in document order and
# the word counts (String#split) of the text nodes are summed. The text node
# that pushes the total past the limit is cut down to the words that still fit
# (re-joined with single spaces), +truncate_string+ is appended, and every node
# that follows it in the document is removed.
#
# @param input [#to_s] HTML to truncate; converted with +to_s+ before parsing
# @param num_words [Integer] maximum number of words to keep;
#   negative values are treated as 0
# @param truncate_string [String] marker appended where the text was cut
# @return [String] inner HTML of the first child of the parsed document's root
#   (or of the document's first child when no root element exists)
def truncate_words(input, num_words, truncate_string = "...")
  # A negative limit cannot be satisfied; treat it as "keep no words at all".
  num_words = 0 if num_words < 0

  doc = Nokogiri::HTML(input.to_s)

  current = doc.children.first
  # Nothing was parsed at all, so there is nothing to truncate or render.
  return "" unless current

  # Declared up front so it stays visible after the loop block and is a
  # well-defined nil when the very first text node already exceeds the limit.
  previous = nil
  count = 0

  loop do
    # we found a text node
    if current.is_a?(Nokogiri::XML::Text)
      count += current.text.split.length
      # we reached our limit, let's get outta here!
      break if count > num_words
      previous = current
    end

    if current.children.length > 0
      # this node has children, can't be a text node,
      # lets descend and look for text nodes
      current = current.children.first
    elsif !current.next.nil?
      # this has no children, but has a sibling, let's check it out
      current = current.next
    else
      # we are the last child, we need to ascend until we are
      # either done or find a sibling to continue on to
      n = current
      while !n.is_a?(Nokogiri::HTML::Document) && n.parent.next.nil?
        n = n.parent
      end

      # we've reached the top and found no more text nodes, break
      break if n.is_a?(Nokogiri::HTML::Document)

      current = n.parent.next
    end
  end

  if count >= num_words
    unless count == num_words
      # count > num_words only happens via the break above, so current is the
      # text node that overflowed the limit.
      new_content = current.text.split

      # The last text node we counted pushed us past the limit. Without it we
      # would be under the limit, so we keep all earlier words plus as many
      # words from this node as still fit, then subtract one to turn that
      # word count into the index of the last word to keep.
      #
      # For example, given:
      #   <p>Testing this HTML truncater.</p><p>To see if its working.</p>
      # and a limit of 6 words, the expected result is:
      #   <p>Testing this HTML truncater.</p><p>To see...</p>
      # The first paragraph contributes 4 words, so 6 - 4 = 2 words are taken
      # from the second one; words #1-2 are indices #0-1, hence the -1.
      # An index of -1 means nothing from this node fits, so we fall back to
      # the previous text node instead.
      index = num_words - (count - new_content.length) - 1
      if index >= 0
        current.content = new_content[0..index].join(' ') + truncate_string
      elsif previous
        current = previous
        current.content = current.content + truncate_string
      else
        # Nothing fits and no earlier text node exists (limit of 0):
        # replace this node's text with just the truncation marker.
        current.content = truncate_string
      end
    end

    # remove everything else: drop all following siblings at every level
    # on the way up to the document
    until current.is_a?(Nokogiri::HTML::Document)
      current.next.remove until current.next.nil?
      current = current.parent
    end
  end

  # now we grab the html and not the text.
  # we do first because nokogiri adds html and body tags
  # which we don't want
  root = doc.root
  if root
    root.children.first.inner_html
  else # no root element present (probably empty input), fall back to the first node
    doc.children.first.inner_html
  end
end