Class: Tabula::TextElement
- Inherits:
-
ZoneEntity
- Object
- java.awt.geom.Rectangle2D::Float
- ZoneEntity
- Tabula::TextElement
- Defined in:
- lib/tabula/entities/text_element.rb
Overview
a Glyph
Constant Summary collapse
- TOLERANCE_FACTOR =
0.25
- EMPTY =
TextElement.new(0, 0, 0, 0, nil, 0, '', 0)
Instance Attribute Summary collapse
-
#direction ⇒ Object
Returns the value of attribute direction.
-
#font ⇒ Object
Returns the value of attribute font.
-
#font_size ⇒ Object
Returns the value of attribute font_size.
-
#text ⇒ Object
Returns the value of attribute text.
-
#width_of_space ⇒ Object
Returns the value of attribute width_of_space.
Attributes inherited from ZoneEntity
Class Method Summary collapse
-
.merge_words(text_elements, options = {}) ⇒ Object
Heuristically merges an iterable of TextElement into a list of TextChunk. Lots of ideas taken from PDFBox’s PDFTextStripper.writePage. Here be dragons.
- .overlap(y1, height1, y2, height2, variance = 0.1) ⇒ Object
- .within(first, second, variance) ⇒ Object
Instance Method Summary collapse
- #==(other) ⇒ Object
-
#initialize(top, left, width, height, font, font_size, text, width_of_space, direction = 0) ⇒ TextElement
constructor
A new instance of TextElement.
- #inspect ⇒ Object
-
#merge!(other) ⇒ Object
merge this TextElement with another (adjust size and text content accordingly).
- #to_h ⇒ Object
Methods inherited from ZoneEntity
#<=>, #points, #tlbr, #tlwh, #to_json
Constructor Details
#initialize(top, left, width, height, font, font_size, text, width_of_space, direction = 0) ⇒ TextElement
Returns a new instance of TextElement.
11 12 13 14 15 16 17 18 |
# File 'lib/tabula/entities/text_element.rb', line 11
#
# Builds a text element. The four geometry values are handed to the
# superclass constructor; the remaining values are stored through this
# class's attribute writers.
#
# @param top [Object] forwarded to the superclass constructor
# @param left [Object] forwarded to the superclass constructor
# @param width [Object] forwarded to the superclass constructor
# @param height [Object] forwarded to the superclass constructor
# @param font [Object] stored in the +font+ attribute
# @param font_size [Object] stored in the +font_size+ attribute
# @param text [Object] stored in the +text+ attribute
# @param width_of_space [Object] stored in the +width_of_space+ attribute
# @param direction [Object] stored in the +direction+ attribute (defaults to 0)
def initialize(top, left, width, height, font, font_size, text, width_of_space, direction=0)
  super(top, left, width, height)

  self.font           = font
  self.font_size      = font_size
  self.text           = text
  self.width_of_space = width_of_space
  self.direction      = direction
end
Instance Attribute Details
#direction ⇒ Object
Returns the value of attribute direction.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7
#
# Reader for the +direction+ attribute.
#
# @return [Object] whatever is currently held in @direction
def direction
  @direction
end
#font ⇒ Object
Returns the value of attribute font.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7
#
# Reader for the +font+ attribute.
#
# @return [Object] whatever is currently held in @font
def font
  @font
end
#font_size ⇒ Object
Returns the value of attribute font_size.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7
#
# Reader for the +font_size+ attribute.
#
# @return [Object] whatever is currently held in @font_size
def font_size
  @font_size
end
#text ⇒ Object
Returns the value of attribute text.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7
#
# Reader for the +text+ attribute.
#
# @return [Object] whatever is currently held in @text
def text
  @text
end
#width_of_space ⇒ Object
Returns the value of attribute width_of_space.
7 8 9 |
# File 'lib/tabula/entities/text_element.rb', line 7
#
# Reader for the +width_of_space+ attribute.
#
# @return [Object] whatever is currently held in @width_of_space
def width_of_space
  @width_of_space
end
Class Method Details
.merge_words(text_elements, options = {}) ⇒ Object
Heuristically merges an iterable of TextElement into a list of TextChunk. Lots of ideas taken from PDFBox’s PDFTextStripper.writePage. Here be dragons.
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# File 'lib/tabula/entities/text_element.rb', line 36
#
# Heuristically merges a sequence of text elements (characters) into a list
# of TextChunk objects, inserting synthetic space elements where the gap
# between neighbouring characters is wide enough.
# Lots of ideas taken from PDFBox's PDFTextStripper.writePage.
# Here be dragons.
#
# NOTE: the first element is removed from +text_elements+ with +shift+, so
# the caller's collection is mutated.
#
# @param text_elements [Array] elements to merge; each one is asked for
#   font, font_size, text, left, right, top, bottom, width, height,
#   width_of_space and overlaps_with_ratio?
# @param options [Hash] recognised key:
#   :vertical_rulings - objects responding to +left+; two characters that sit
#   on opposite sides of a ruling are never put in the same chunk. A missing
#   or nil value is treated as "no rulings".
# @return [Array] the chunks built by TextChunk.create_from_text_element,
#   each with its text_elements sorted by (left + right); [] for empty input
def self.merge_words(text_elements, options={})
  default_options = {:vertical_rulings => []}
  options = default_options.merge(options)
  # An explicit nil for :vertical_rulings must behave like "no rulings";
  # otherwise the +any?+ call further down would be sent to nil.
  vertical_ruling_locations = (options[:vertical_rulings] || []).map(&:left)

  return [] if text_elements.empty?

  text_chunks = [TextChunk.create_from_text_element(text_elements.shift)]
  endOfLastTextX = text_chunks.first.right
  maxYForLine = text_chunks.first.bottom
  maxHeightForLine = text_chunks.first.height
  minYTopForLine = text_chunks.first.top

  sp = nil
  char_widths_so_far = []
  word_spacings_so_far = []

  text_elements.inject(text_chunks) do |chunks, char|
    current_chunk = chunks.last
    prev_char = current_chunk.text_elements.last

    # Resets the character/spacing widths (used for averages) when we see a change in font
    # or a change in the font size
    if (char.font != prev_char.font) || (char.font_size != prev_char.font_size)
      char_widths_so_far = []
      word_spacings_so_far = []
    end

    # if same char AND overlapped, skip
    if (prev_char.text == char.text) && prev_char.overlaps_with_ratio?(char, 0.5)
      next chunks
    end

    # if char is a space that overlaps with the prev_char, skip
    if char.text == ' ' && prev_char.left == char.left && prev_char.top == char.top
      next chunks
    end

    # any vertical ruling goes across prev_char and char?
    across_vertical_ruling = vertical_ruling_locations.any? { |loc|
      prev_char.left < loc && char.left > loc
    }

    # Estimate the expected width of the space based on the
    # average width of the space character with some margin
    wordSpacing = char.width_of_space
    deltaSpace = if (wordSpacing.nan? || wordSpacing == 0)
                   ::Float::MAX
                 elsif word_spacings_so_far.empty?
                   wordSpacing * 0.5 # 0.5 == spacingTolerance
                 else
                   (word_spacings_so_far.reduce(&:+).to_f / word_spacings_so_far.size) * 0.5
                 end

    word_spacings_so_far << wordSpacing
    char_widths_so_far << (char.width / char.text.size)

    # Estimate the expected width of the space based on the
    # average character width with some margin. Based on experiments we also found that
    # .3 worked well.
    averageCharWidth = char_widths_so_far.reduce(&:+).to_f / char_widths_so_far.size
    deltaCharWidth = averageCharWidth * 0.3 # 0.3 == averageCharTolerance

    # Compares the values obtained by the average method and the wordSpacing method and picks
    # the smaller number.
    expectedStartOfNextWordX = -::Float::MAX

    if endOfLastTextX != -1
      expectedStartOfNextWordX = endOfLastTextX + [deltaCharWidth, deltaSpace].min
    end

    sameLine = true
    if !overlap(char.bottom, char.height, maxYForLine, maxHeightForLine)
      # char starts a new line: reset the per-line trackers
      endOfLastTextX = -1
      expectedStartOfNextWordX = -::Float::MAX
      maxYForLine = -::Float::MAX
      maxHeightForLine = -1
      minYTopForLine = ::Float::MAX
      sameLine = false
    end

    # characters tend to be ordered by their left location
    # in determining whether to add a space, we need to know the distance
    # between the current character's left and the nearest character's
    # right. The nearest character may not be the previous character, so we
    # need to keep track of the character with the greatest right x-axis
    # location -- that's endOfLastTextX
    # (in some fonts, one character may be completely "on top of"
    # another character, with the wider character starting to the left and
    # ending to the right of the narrower character, e.g. ANSI
    # representations of some South Asian languages, see
    # https://github.com/tabulapdf/tabula/issues/303)
    endOfLastTextX = [char.right, endOfLastTextX].max

    # should we add a space?
    if !across_vertical_ruling &&
       sameLine &&
       expectedStartOfNextWordX < char.left &&
       !prev_char.text.end_with?(' ')

      sp = self.new(prev_char.top,
                    prev_char.right,
                    expectedStartOfNextWordX - prev_char.right,
                    prev_char.height,
                    prev_char.font,
                    prev_char.font_size,
                    ' ',
                    prev_char.width_of_space)
      current_chunk << sp
    else
      sp = nil
    end

    maxYForLine = [char.bottom, maxYForLine].max
    maxHeightForLine = [maxHeightForLine, char.height].max
    minYTopForLine = [minYTopForLine, char.top].min

    # if sameLine
    #   puts "prev: #{prev_char.text} - char: #{char.text} - diff: #{char.left - prev_char.right} - space: #{[deltaCharWidth, deltaSpace].min} - spacing: #{wordSpacing} - sp: #{!sp.nil?}"
    # else
    #   puts
    # end

    # gap between char and whatever now ends the chunk (the synthetic space, if one was added)
    dist = (char.left - (sp ? sp.right : prev_char.right))

    if !across_vertical_ruling &&
       sameLine &&
       (dist < 0 ? current_chunk.vertically_overlaps?(char) : dist < wordSpacing)
      current_chunk << char
    else
      # create a new chunk
      chunks << TextChunk.create_from_text_element(char)
    end

    chunks
  end.each { |chunk| chunk.text_elements.sort_by! { |char| char.left + char.right } }
end
.overlap(y1, height1, y2, height2, variance = 0.1) ⇒ Object
26 27 28 29 |
# File 'lib/tabula/entities/text_element.rb', line 26
#
# Tells whether two vertical extents overlap. The result is true when any of
# the following holds:
#   * y1 and y2 are within +variance+ of each other,
#   * y2 lies in the closed interval [y1 - height1, y1],
#   * y1 lies in the closed interval [y2 - height2, y2].
#
# @param y1 [Numeric] reference coordinate of the first extent
# @param height1 [Numeric] height of the first extent
# @param y2 [Numeric] reference coordinate of the second extent
# @param height2 [Numeric] height of the second extent
# @param variance [Numeric] tolerance handed to +within+ (defaults to 0.1)
# @return [Boolean]
def self.overlap(y1, height1, y2, height2, variance=0.1)
  return true if within(y1, y2, variance)
  return true if y2 <= y1 && y2 >= y1 - height1

  y1 <= y2 && y1 >= y2 - height2
end
.within(first, second, variance) ⇒ Object
22 23 24 |
# File 'lib/tabula/entities/text_element.rb', line 22
#
# Tells whether +second+ lies strictly inside the open interval
# (first - variance, first + variance).
#
# @param first [Numeric] centre of the interval
# @param second [Numeric] value being tested
# @param variance [Numeric] half-width of the interval
# @return [Boolean]
def self.within(first, second, variance)
  lower_bound = first - variance
  upper_bound = first + variance

  second < upper_bound && second > lower_bound
end
Instance Method Details
#==(other) ⇒ Object
201 202 203 |
# File 'lib/tabula/entities/text_element.rb', line 201 def ==(other) self.text.strip == other.text.strip end |
#inspect ⇒ Object
197 198 199 |
# File 'lib/tabula/entities/text_element.rb', line 197
#
# Compact debugging representation: the top, left, bottom and right
# coordinates (each rounded to two decimals, comma separated) followed by
# the quoted text.
#
# @return [String]
def inspect
  coords = [top, left, bottom, right].map { |edge| edge.round(2) }.join(',')

  "#<TextElement: #{coords} '#{text}'>"
end
#merge!(other) ⇒ Object
merge this TextElement with another (adjust size and text content accordingly)
182 183 184 185 186 187 188 189 190 191 |
# File 'lib/tabula/entities/text_element.rb', line 182
#
# Merges another TextElement into this one: the texts are combined here and
# the superclass +merge!+ is then invoked with +other+.
#
# NOTE(review): when (self <=> other) is negative the other element's text is
# placed *before* this element's text; otherwise it is appended in place with
# <<. Whether that matches the intended reading order depends on how
# ZoneEntity#<=> is defined -- confirm.
#
# @param other [TextElement] must be an instance of exactly TextElement
# @raise [TypeError] if +other+ is not an instance of TextElement
# @return [Object] whatever the superclass +merge!+ returns
def merge!(other)
  unless other.instance_of?(TextElement)
    raise TypeError, "argument is not a TextElement"
  end

  ordering = (self <=> other)
  if ordering < 0
    self.text = other.text + self.text
  else
    self.text << other.text
  end

  super(other)
end
#to_h ⇒ Object
193 194 195 |
# File 'lib/tabula/entities/text_element.rb', line 193
#
# Hash representation: the superclass's +to_h+ result with :font and :text
# entries merged in.
#
# @return [Object] the result of calling +merge+ on the superclass hash
def to_h
  base = super
  base.merge({ :font => font, :text => text })
end