Class: TruncatedSaxDocument

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
lib/abbreviato/truncated_sax_document.rb

Constant Summary collapse

IGNORABLE_TAGS =
%w[html head body].freeze
VOID_TAGS =

These don’t have to be closed (which also impacts ongoing length calculations). See www.456bereastreet.com/archive/201005/void_empty_elements_and_self-closing_start_tags_in_html/

%w[area base br col command hr img input keygen link meta param source wbr].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ TruncatedSaxDocument

FIXME: Call super to initialize state of the parent class.



20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/abbreviato/truncated_sax_document.rb', line 20

# Sets up the state for one truncation pass.
#
# @param options [Hash] configuration; the keys read here are :max_length,
#   :tail (replaced by '' when nil or false) and :fragment
def initialize(options)
  # Explicit empty parentheses: forward no arguments to the parent class,
  # because `options` is consumed entirely by this class.
  super()

  @html_coder = HTMLEntities.new

  @max_length = options[:max_length]
  @tail = options[:tail] || ''
  @fragment_mode = options[:fragment]

  # Mutable parsing state, reset for every new document
  @truncated_string = ''
  @closing_tags = []
  @estimated_length = 0
  @ignored_levels = 0
  @truncated = false
end

Instance Attribute Details

#ignored_levelsObject (readonly)

Returns the value of attribute ignored_levels.



13
14
15
# File 'lib/abbreviato/truncated_sax_document.rb', line 13

# Reader for @ignored_levels, which the constructor sets to 0.
# NOTE(review): presumably the nesting depth of elements currently being
# skipped - confirm against enter_ignored_level / exit_ignored_level.
def ignored_levels
  @ignored_levels
end

#max_lengthObject (readonly)

Returns the value of attribute max_length.



13
14
15
# File 'lib/abbreviato/truncated_sax_document.rb', line 13

# Reader for @max_length, which the constructor copies from options[:max_length].
# NOTE(review): the callbacks in this class compare sizes with `bytesize`, so
# this is presumably a byte budget - confirm against remaining_length.
def max_length
  @max_length
end

#tailObject (readonly)

Returns the value of attribute tail.



13
14
15
# File 'lib/abbreviato/truncated_sax_document.rb', line 13

# Reader for @tail: the options[:tail] value given to the constructor, or ''
# when that option is absent. `characters` appends it to text it had to
# shorten, provided there is still room for it.
def tail
  @tail
end

#truncatedObject (readonly)

Returns the value of attribute truncated.



13
14
15
# File 'lib/abbreviato/truncated_sax_document.rb', line 13

# Reader for @truncated: false right after construction, switched to true by
# the parser callbacks whenever they drop or shorten content.
def truncated
  @truncated
end

#truncated_stringObject (readonly)

Returns the value of attribute truncated_string.



13
14
15
# File 'lib/abbreviato/truncated_sax_document.rb', line 13

# Reader for @truncated_string, which the constructor initializes to ''.
# NOTE(review): presumably the output accumulated by
# append_to_truncated_string - confirm against that helper.
def truncated_string
  @truncated_string
end

Instance Method Details

#cdata_block(string) ⇒ Object

This method is called when the parser encounters cdata. In practice, this also gets called for this style of comment inside an element:

<style><!--
  /* Font Definitions */
  @font-face
    {font-family:Wingdings;
    panose-1:5 0 0 0 0 0 0 0 0 0;}
--></style>


103
104
105
106
107
108
109
# File 'lib/abbreviato/truncated_sax_document.rb', line 103

# SAX callback invoked when the parser encounters CDATA. In practice it is
# also invoked for comment-wrapped content inside an element such as <style>.
#
# The text is appended as-is when it fits in the remaining budget. It is
# dropped, and the result flagged as truncated, when it does not fit or when
# the parser is currently in ignore mode: the element that owns this content
# was not emitted, so appending its body would leak it into the output as
# bare text outside any element. This mirrors what `characters` does.
#
# @param string [String] raw CDATA contents
def cdata_block(string)
  if ignore_mode? || string.bytesize > remaining_length
    @truncated = true
  else
    append_to_truncated_string(string)
  end
end

#characters(decoded_string) ⇒ Object

This method is called when the parser encounters characters between tags



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/abbreviato/truncated_sax_document.rb', line 64

# SAX callback invoked for text found between tags.
#
# Text that arrives once the limit is reached, or while inside a skipped
# element, is discarded and the result is flagged as truncated. Otherwise the
# entity-encoded text is appended whole when it fits; when it does not, a
# shortened version is appended instead, followed by the tail if there is
# still room for it.
#
# @param decoded_string [String] text with entities already decoded by the parser
def characters(decoded_string)
  blocked = max_length_reached? || ignore_mode?
  if blocked
    @truncated = true
    return
  end

  # Measure the encoded form, so that &gt; costs 4 bytes rather than the 1 byte of '>'
  encoded = @html_coder.encode(decoded_string, :named)
  return append_to_truncated_string(encoded) if encoded.bytesize <= remaining_length

  # Shorten the decoded text (not the encoded one) so that an HTML entity is
  # handled as a single character and never cut in half
  shortened = truncate_string(decoded_string)
  room_left = remaining_length - shortened.bytesize
  shortened << tail if room_left >= tail.bytesize
  append_to_truncated_string(shortened)
end

#comment(string) ⇒ Object

This method is called when the parser encounters a comment



84
85
86
87
88
89
90
91
# File 'lib/abbreviato/truncated_sax_document.rb', line 84

# SAX callback invoked when the parser encounters a comment.
#
# The comment markup is appended when it fits in the remaining budget. It is
# dropped, and the result flagged as truncated, when it does not fit or when
# the parser is currently in ignore mode: the enclosing element was not
# emitted, so its comments should not be emitted either (and appending here
# would grow the output while ignore mode is active). This mirrors what
# `characters` does.
#
# @param string [String] comment text, without the surrounding markers
def comment(string)
  markup = comment_tag(string)
  if ignore_mode? || markup.bytesize > remaining_length
    @truncated = true
  else
    append_to_truncated_string(markup)
  end
end

#end_documentObject



130
131
132
# File 'lib/abbreviato/truncated_sax_document.rb', line 130

# SAX callback invoked once the whole document has been parsed.
#
# Emits a closing tag for every name still held in @closing_tags, most
# recently opened first. Each one is appended with a length of 0.
def end_document
  @closing_tags.reverse_each do |pending_name|
    markup = closing_tag(pending_name)
    append_to_truncated_string(markup, 0)
  end
end

#end_element(name) ⇒ Object

This method is called when the parser encounters a closing tag



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/abbreviato/truncated_sax_document.rb', line 112

# SAX callback invoked when the parser encounters a closing tag.
#
# @param name [String] name of the element being closed
def end_element(name)
  # While in ignore mode the closing tag only goes to exit_ignored_level;
  # nothing is emitted for it
  if ignore_mode?
    exit_ignored_level(name)
    return
  end

  # Note that any remaining end tags get added automatically (in `end_document`) as the document is closed
  return if max_length_reached? || ignorable_tag?(name)

  # Single-tag elements were never pushed onto @closing_tags, so there is nothing to close
  return if single_tag_element?(name)

  @closing_tags.pop
  # Don't count the length when closing a tag - it was accommodated when
  # the tag was opened
  append_to_truncated_string(closing_tag(name), 0)
end

#start_element(name, attributes) ⇒ Object

This method is called when the parser encounters an open tag



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/abbreviato/truncated_sax_document.rb', line 35

# SAX callback invoked when the parser encounters an opening tag.
#
# @param name [String] name of the element being opened
# @param attributes [Object] attributes as supplied by the parser; handed to opening_tag unchanged
def start_element(name, attributes)
  # Limit already hit: flag the truncation and emit nothing more
  if max_length_reached?
    @truncated = true
    return
  end

  # Ignorable wrapper tags are never emitted
  return if ignorable_tag?(name)

  # Nested inside an element that is already being skipped: go one level deeper
  if ignore_mode?
    enter_ignored_level(name)
    return
  end

  markup = opening_tag(name, attributes)

  # The cost covers the opening tag and (potentially) its closing tag; when
  # the pair does not fit, skip this element altogether
  cost = overridden_tag_length(name, markup)
  if cost > remaining_length
    @truncated = true
    enter_ignored_level(name)
    return
  end

  # Remember the tag so that its closing counterpart can be emitted later
  @closing_tags << name unless single_tag_element?(name)

  append_to_truncated_string(markup, cost)
end