Class: Tabula::TextChunk
- Inherits:
-
ZoneEntity
- Object
- java.awt.geom.Rectangle2D::Float
- ZoneEntity
- Tabula::TextChunk
- Defined in:
- lib/tabula/entities/text_chunk.rb
Overview
a “collection” of TextElements
Instance Attribute Summary collapse
-
#font ⇒ Object
Returns the value of attribute font.
-
#font_size ⇒ Object
Returns the value of attribute font_size.
-
#text_elements ⇒ Object
Returns the value of attribute text_elements.
-
#width_of_space ⇒ Object
Returns the value of attribute width_of_space.
Attributes inherited from ZoneEntity
Class Method Summary collapse
-
.column_positions(lines) ⇒ Object
returns a list of column boundaries (x axis)
lines
must be an array of lines sorted by their top
attribute. -
.create_from_text_element(text_element) ⇒ Object
initialize a new TextChunk from a TextElement.
- .group_by_lines(text_chunks) ⇒ Object
Instance Method Summary collapse
-
#<<(text_element) ⇒ Object
add a TextElement to this TextChunk.
- #inspect ⇒ Object
- #merge!(other) ⇒ Object
-
#split_vertically!(y) ⇒ Object
split this TextChunk vertically (in place, returns the remaining chunk).
- #text ⇒ Object
- #to_h ⇒ Object
Methods inherited from ZoneEntity
#<=>, #initialize, #points, #tlbr, #tlwh, #to_json
Constructor Details
This class inherits a constructor from Tabula::ZoneEntity
Instance Attribute Details
#font ⇒ Object
Returns the value of attribute font.
5 6 7 |
# File 'lib/tabula/entities/text_chunk.rb', line 5
#
# Reader for the +font+ attribute.
#
# @return [Object] whatever is currently stored in @font (nil when unset)
def font
  @font
end
#font_size ⇒ Object
Returns the value of attribute font_size.
5 6 7 |
# File 'lib/tabula/entities/text_chunk.rb', line 5
#
# Reader for the +font_size+ attribute.
#
# @return [Object] whatever is currently stored in @font_size (nil when unset)
def font_size
  @font_size
end
#text_elements ⇒ Object
Returns the value of attribute text_elements.
5 6 7 |
# File 'lib/tabula/entities/text_chunk.rb', line 5
#
# Reader for the +text_elements+ attribute.
#
# @return [Object] whatever is currently stored in @text_elements (nil when unset)
def text_elements
  @text_elements
end
#width_of_space ⇒ Object
Returns the value of attribute width_of_space.
5 6 7 |
# File 'lib/tabula/entities/text_chunk.rb', line 5
#
# Reader for the +width_of_space+ attribute.
#
# @return [Object] whatever is currently stored in @width_of_space (nil when unset)
def width_of_space
  @width_of_space
end
Class Method Details
.column_positions(lines) ⇒ Object
Returns a list of column boundaries (x axis). The lines argument
must be an array of lines sorted by their top
attribute.
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/tabula/entities/text_chunk.rb', line 48
#
# Returns a list of column boundaries (x axis).
#
# Column regions are seeded from the non-blank elements of the first line.
# Every following line then either widens the regions its elements
# horizontally overlap (via merge!) or contributes brand-new regions for the
# elements that overlapped nothing.
#
# @param lines [Array] lines responding to #text_elements; per the original
#   documentation they must be sorted by their top attribute
# @return [Array] the unique right edges of the resulting regions, each
#   rounded to two decimals; [] when lines is empty
def self.column_positions(lines)
  return [] if lines.empty?

  # Seed one region per non-blank element of the first line.
  regions = lines.first.text_elements
                 .reject { |element| element.text =~ ONLY_SPACES_RE }
                 .map { |element| Tabula::ZoneEntity.new(*element.tlwh) }

  lines[1..-1].each do |line|
    unclaimed = line.text_elements.select { |element| element.text !~ ONLY_SPACES_RE }

    regions.each do |region|
      # Overlaps are computed before merging, so a region that grows while
      # merging does not pull in extra elements during the same pass.
      overlapping = unclaimed.select { |element| region.horizontally_overlaps?(element) }
      overlapping.each { |element| region.merge!(element) }
      unclaimed -= overlapping
    end

    # Elements that matched no existing region start regions of their own.
    regions += unclaimed.map { |element| Tabula::ZoneEntity.new(*element.tlwh) }
  end

  regions.map { |region| region.right.round(2) }.uniq
end
.create_from_text_element(text_element) ⇒ Object
initialize a new TextChunk from a TextElement
9 10 11 12 13 14 |
# File 'lib/tabula/entities/text_chunk.rb', line 9
#
# Initialize a new TextChunk from a TextElement.
#
# The chunk is constructed from the element's tlwh values and starts out
# holding just that one element.
#
# @param text_element [TextElement] must be exactly a TextElement
#   (checked with instance_of?, so subclasses are rejected)
# @return [Object] the newly built chunk
# @raise [TypeError] when the argument is not a TextElement
def self.create_from_text_element(text_element)
  unless text_element.instance_of?(TextElement)
    raise TypeError, "argument is not a TextElement"
  end

  new(*text_element.tlwh).tap do |chunk|
    chunk.text_elements = [text_element]
  end
end
.group_by_lines(text_chunks) ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/tabula/entities/text_chunk.rb', line 16
#
# Groups text chunks into Line objects.
#
# Chunks are consumed in the order given. A new Line is started whenever the
# current line's horizontal_overlap_ratio with the next chunk is below 0.01.
# A finished line is discarded when it looks like a separator rule, i.e. it
#   - is wider than 90% of the bounding box of all text_chunks, and
#   - consists only of chunks whose text matches SAME_CHAR_RE.
#
# @param text_chunks [Array] chunks responding to #left, #right and #text
# @return [Array] the result of calling remove_sequential_spaces! on every
#   kept line; [] when text_chunks is empty
def self.group_by_lines(text_chunks)
  # Nothing to group; also avoids calling #right / #left on nil below.
  return [] if text_chunks.empty?

  bbwidth = text_chunks.max_by(&:right).right - text_chunks.min_by(&:left).left

  # The check is applied to the line passed in (the one just completed),
  # not to the very first line that was created.
  separator = lambda do |line|
    line.width / bbwidth > 0.9 &&
      line.text_elements.all? { |chunk| chunk.text =~ SAME_CHAR_RE }
  end

  first_line = Line.new
  first_line << text_chunks.first

  lines = text_chunks[1..-1].each_with_object([first_line]) do |chunk, grouped|
    if grouped.last.horizontal_overlap_ratio(chunk) < 0.01
      grouped.pop if separator.call(grouped.last)
      grouped << Line.new
    end
    grouped.last << chunk
  end

  # The final line never triggers the in-loop check, so test it here.
  lines.pop if separator.call(lines.last)

  lines.map!(&:remove_sequential_spaces!)
end
Instance Method Details
#<<(text_element) ⇒ Object
add a TextElement to this TextChunk
81 82 83 84 |
# File 'lib/tabula/entities/text_chunk.rb', line 81
#
# Add a TextElement to this TextChunk.
#
# The element is appended to text_elements and then passed to merge!.
#
# @param text_element [Object] the element to append
# @return [Object] whatever merge! returns
def <<(text_element)
  text_elements << text_element
  merge!(text_element)
end
#inspect ⇒ Object
108 109 110 |
# File 'lib/tabula/entities/text_chunk.rb', line 108
#
# Debug representation: top, left, bottom and right (each rounded to two
# decimals, comma separated) followed by the chunk's text in single quotes.
#
# @return [String]
def inspect
  coords = [top, left, bottom, right].map { |edge| edge.round(2) }.join(",")
  "#<TextChunk: #{coords} '#{text}'>"
end
#merge!(other) ⇒ Object
86 87 88 89 90 91 92 93 94 95 |
# File 'lib/tabula/entities/text_chunk.rb', line 86
#
# Merges +other+ into this chunk.
#
# When +other+ is exactly a TextChunk, its text_elements are combined with
# this chunk's: the elements of whichever chunk compares lower under <=> come
# first. The merge is then delegated to the superclass implementation.
#
# @param other [Object] the entity to merge in
# @return [Object] whatever the superclass merge! returns
def merge!(other)
  if other.instance_of?(TextChunk)
    mine = text_elements
    theirs = other.text_elements
    self.text_elements = (self <=> other) < 0 ? mine + theirs : theirs + mine
  end

  super(other)
end
#split_vertically!(y) ⇒ Object
split this TextChunk vertically (in place, returns the remaining chunk)
100 101 102 |
# File 'lib/tabula/entities/text_chunk.rb', line 100
#
# Split this TextChunk vertically (in place, returns the remaining chunk).
# Placeholder only: the method always raises.
#
# @param y [Object] intended split coordinate (currently unused)
# @raise [RuntimeError] always, with the message "Not Implemented"
def split_vertically!(y)
  raise RuntimeError, "Not Implemented"
end
#text ⇒ Object
104 105 106 |
# File 'lib/tabula/entities/text_chunk.rb', line 104
#
# The chunk's text: the text of every element in text_elements, concatenated
# in order.
#
# @return [String]
def text
  text_elements.map { |element| element.text }.join
end
#to_h ⇒ Object
112 113 114 |
# File 'lib/tabula/entities/text_chunk.rb', line 112
#
# Hash representation: the superclass hash with a :text entry added.
#
# @return [Hash]
def to_h
  super.merge(text: text)
end