Class: Tabula::TextChunk

Inherits:
ZoneEntity
  • Object
show all
Defined in:
lib/tabula/entities/text_chunk.rb

Overview

A “collection” of TextElements.

Instance Attribute Summary collapse

Attributes inherited from ZoneEntity

#texts

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from ZoneEntity

#<=>, #initialize, #points, #tlbr, #to_json

Constructor Details

This class inherits a constructor from Tabula::ZoneEntity

Instance Attribute Details

#font ⇒ Object

Returns the value of attribute font.



5
6
7
# File 'lib/tabula/entities/text_chunk.rb', line 5

# Reader for the @font instance variable.
# Returns whatever was last assigned to it (nil if it was never set).
def font
  @font
end

#font_size ⇒ Object

Returns the value of attribute font_size.



5
6
7
# File 'lib/tabula/entities/text_chunk.rb', line 5

# Reader for the @font_size instance variable.
# Returns whatever was last assigned to it (nil if it was never set).
def font_size
  @font_size
end

#text_elements ⇒ Object

Returns the value of attribute text_elements.



5
6
7
# File 'lib/tabula/entities/text_chunk.rb', line 5

# Reader for the @text_elements instance variable: the elements that make up
# this chunk. Returns the stored object itself, not a copy (nil if never set).
def text_elements
  @text_elements
end

#width_of_space ⇒ Object

Returns the value of attribute width_of_space.



5
6
7
# File 'lib/tabula/entities/text_chunk.rb', line 5

# Reader for the @width_of_space instance variable.
# Returns whatever was last assigned to it (nil if it was never set).
def width_of_space
  @width_of_space
end

Class Method Details

.column_positions(text_chunks) ⇒ Object

Calculate estimated column positions from an iterable of TextChunk.



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/tabula/entities/text_chunk.rb', line 34

# Estimate column boundaries from an iterable of TextChunk.
#
# The chunks are scanned in the order given while tracking the right-most
# edge seen so far (starting at 0). Whenever a chunk begins to the right of
# that edge, the edge is recorded as a boundary and the chunk's right edge
# becomes the new tracked edge; otherwise the tracked edge is only widened.
#
# Chunks whose text matches ONLY_SPACES_RE are ignored, as are chunks whose
# top is smaller than the smallest top found in the first grouped line.
#
# text_chunks - iterable of objects responding to text, top, left and right.
#
# Returns an Array of boundary positions; it is empty when grouping yields
# no lines (no chunks, or only blank ones).
def self.column_positions(text_chunks)
  lines = TextChunk.group_by_lines(text_chunks)
  # Without any line there is no reference top and nothing to scan.
  return [] if lines.empty?

  top = lines.first.text_elements.map(&:top).min
  right = 0
  columns = []

  text_chunks.each do |chunk|
    next if chunk.text =~ ONLY_SPACES_RE
    next unless chunk.top >= top

    if chunk.left > right
      # A gap between the tracked edge and this chunk marks a boundary.
      columns << right
      right = chunk.right
    elsif chunk.right > right
      right = chunk.right
    end
  end
  columns
end

.create_from_text_element(text_element) ⇒ Object

initialize a new TextChunk from a TextElement

Raises:

  • (TypeError)


9
10
11
12
13
14
# File 'lib/tabula/entities/text_chunk.rb', line 9

# Build a new chunk that wraps a single TextElement.
#
# The chunk is constructed from the element's top, left, width and height,
# and its text_elements list is set to contain only that element.
#
# text_element - must be an instance of exactly TextElement.
#
# Returns the newly created chunk.
# Raises TypeError when text_element is not a TextElement instance.
def self.create_from_text_element(text_element)
  unless text_element.instance_of?(TextElement)
    raise TypeError, "argument is not a TextElement"
  end

  new(text_element.top, text_element.left, text_element.width, text_element.height).tap do |chunk|
    chunk.text_elements = [text_element]
  end
end

.group_by_lines(text_chunks) ⇒ Object

group an iterable of TextChunk into a list of Line



18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/tabula/entities/text_chunk.rb', line 18

# Group an iterable of TextChunk into a list of Line.
#
# Chunks are visited in the order given. Each chunk is added to the first
# existing line whose horizontal_overlap_ratio with it is at least 0.01;
# if no line qualifies, a new Line is started. Chunks whose text matches
# ONLY_SPACES_RE are skipped entirely.
#
# text_chunks - iterable of objects responding to text.
#
# Returns an Array of Line, in order of creation (empty for empty input).
def self.group_by_lines(text_chunks)
  lines = []
  text_chunks.each do |chunk|
    next if chunk.text =~ ONLY_SPACES_RE

    line = lines.find { |candidate| candidate.horizontal_overlap_ratio(chunk) >= 0.01 }
    if line.nil?
      line = Line.new
      lines << line
    end
    line << chunk
  end
  lines
end

Instance Method Details

#<<(text_element) ⇒ Object

add a TextElement to this TextChunk



57
58
59
60
# File 'lib/tabula/entities/text_chunk.rb', line 57

# Append a TextElement to this chunk.
#
# The element is pushed onto text_elements and then merged into the chunk
# via merge!. Returns whatever merge! returns.
def <<(text_element)
  text_elements << text_element
  merge!(text_element)
end

#inspect ⇒ Object



106
107
108
# File 'lib/tabula/entities/text_chunk.rb', line 106

# Human-readable summary of the chunk: its top, left, bottom and right
# coordinates (each rounded to two decimals, comma separated) followed by
# the chunk's text in single quotes.
def inspect
  bounds = [top, left, bottom, right].map { |coord| coord.round(2) }.join(',')
  "#<TextChunk: #{bounds} '#{text}'>"
end

#merge!(other) ⇒ Object



62
63
64
65
66
67
68
69
70
71
# File 'lib/tabula/entities/text_chunk.rb', line 62

# Merge another entity into this chunk.
#
# When other is exactly a TextChunk, the two element lists are combined
# first: other's elements are placed in front if the chunks horizontally
# overlap and other has a smaller top; otherwise they are appended.
# For any other kind of argument the element list is left untouched.
# In every case the merge is then delegated to the superclass.
#
# Returns the result of the superclass merge!.
def merge!(other)
  if other.instance_of?(TextChunk)
    other_goes_first = horizontally_overlaps?(other) && other.top < top
    self.text_elements =
      if other_goes_first
        other.text_elements + text_elements
      else
        text_elements + other.text_elements
      end
  end
  super(other)
end

#split_vertically!(y) ⇒ Object

Split this TextChunk vertically (in place, returning the remaining chunk). Not implemented yet: the method currently always raises "Not Implemented".



76
77
78
# File 'lib/tabula/entities/text_chunk.rb', line 76

# Placeholder for splitting this chunk vertically at y.
#
# Not implemented: always raises a RuntimeError with the message
# "Not Implemented"; the y argument is not used.
def split_vertically!(y)
  raise RuntimeError, "Not Implemented"
end

#strip! ⇒ Object

Remove leading and trailing whitespace, adjusting the chunk's geometry accordingly. TODO: horrible implementation; fix.



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/tabula/entities/text_chunk.rb', line 84

# Trim leading and trailing single-space elements from this chunk, in place.
#
# Only elements whose text is exactly ' ' count as whitespace. The combined
# width of the leading run is added to left, and the combined width of the
# trailing run is subtracted from right, before the corresponding elements
# are removed from text_elements.
#
# Returns self.
def strip!
  leading = text_elements.take_while { |element| element.text == ' ' }
  self.left += leading.inject(0) { |total, element| total + element.width }
  self.text_elements = text_elements.drop(leading.size)

  # Walk from the end so widths are summed last-to-first.
  trailing = text_elements.reverse.take_while { |element| element.text == ' ' }
  self.right -= trailing.inject(0) { |total, element| total + element.width }
  self.text_elements = text_elements.take(text_elements.size - trailing.size)
  self
end

#text ⇒ Object



102
103
104
# File 'lib/tabula/entities/text_chunk.rb', line 102

# The chunk's text: the text of every element in text_elements,
# concatenated in order with no separator.
def text
  pieces = text_elements.map { |element| element.text }
  pieces.join
end

#to_h ⇒ Object



110
111
112
# File 'lib/tabula/entities/text_chunk.rb', line 110

# Hash representation of the chunk: the superclass's to_h result with a
# :text entry holding this chunk's text merged in. merge returns a new
# object, so the hash produced by the superclass is not modified.
def to_h
  base = super
  base.merge(text: text)
end