Class: Tabula::Page
- Inherits:
-
ZoneEntity
- Object
- java.awt.geom.Rectangle2D::Float
- ZoneEntity
- Tabula::Page
- Includes:
- HasCells
- Defined in:
- lib/tabula/entities/page.rb
Direct Known Subclasses
Constant Summary
Constants included from HasCells
HasCells::ANOTHER_MAGIC_NUMBER
Instance Attribute Summary collapse
-
#cells ⇒ Object
Returns the value of attribute cells.
-
#file_path ⇒ Object
readonly
Returns the value of attribute file_path.
- #min_char_height ⇒ Object
- #min_char_width ⇒ Object
-
#number_one_indexed ⇒ Object
readonly
Returns the value of attribute number_one_indexed.
-
#rotation ⇒ Object
readonly
Returns the value of attribute rotation.
Attributes inherited from ZoneEntity
Instance Method Summary collapse
- #collapse_oriented_rulings(lines) ⇒ Object
- #fill_in_cells!(options = {}) ⇒ Object
- #get_area(area) ⇒ Object
- #get_cell_text(area = nil) ⇒ Object
-
#get_ruling_lines!(options = {}) ⇒ Object
Returns the ruling lines, memoizing the collapsed vertical and horizontal rulings in instance variables.
-
#get_table(options = {}) ⇒ Object
returns a Table object.
-
#get_text(area = nil) ⇒ Object
Gets the text inside an area. The area can be an Array ([top, left, bottom, right]) or a Rectangle2D.
- #horizontal_ruling_lines ⇒ Object
-
#initialize(file_path, width, height, rotation, number, texts = [], ruling_lines = [], min_char_width = nil, min_char_height = nil) ⇒ Page
constructor
A new instance of Page.
-
#make_table(options = {}) ⇒ Object
for API backwards-compatibility reasons, this returns an array of arrays.
- #number(indexing_base = :one_indexed) ⇒ Object
-
#ruling_lines ⇒ Object
TODO no need for this, let’s choose one name.
- #snap_points! ⇒ Object
-
#spreadsheets(options = {}) ⇒ Object
returns the Spreadsheets; creating them if they’re not memoized.
- #to_json(options = {}) ⇒ Object
- #vertical_ruling_lines ⇒ Object
Methods included from HasCells
#add_spanning_cells!, #find_cells!, #find_spreadsheets_from_cells, #is_tabular?
Methods inherited from ZoneEntity
#<=>, #inspect, #merge!, #points, #tlbr
Constructor Details
#initialize(file_path, width, height, rotation, number, texts = [], ruling_lines = [], min_char_width = nil, min_char_height = nil) ⇒ Page
Returns a new instance of Page.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/tabula/entities/page.rb', line 9
# Builds a page whose zone is anchored at (0, 0) with the given size.
#
# file_path        - stored as-is; exposed through #file_path.
# width, height    - forwarded to the superclass constructor.
# rotation         - stored as-is; exposed through #rotation.
# number           - one-indexed page number; must be >= 1.
# texts            - assigned through the inherited +texts=+ writer.
# ruling_lines     - raw rulings, kept in @ruling_lines.
# min_char_width,
# min_char_height  - optional precomputed values; when nil they are
#                    derived lazily by the readers of the same name.
#
# Raises ArgumentError when +number+ is smaller than 1.
def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil)
  super(0, 0, width, height)
  @rotation = rotation
  raise ArgumentError, "Tabula::Page numbers are one-indexed; numbers < 1 are invalid." if number < 1

  @number_one_indexed = number
  @file_path = file_path
  @ruling_lines = ruling_lines
  self.texts = texts

  # Cells and spreadsheets start empty/unset and are filled in later.
  @cells = []
  @spreadsheets = nil
  @min_char_width = min_char_width
  @min_char_height = min_char_height
end
Instance Attribute Details
#cells ⇒ Object
Returns the value of attribute cells.
7 8 9 |
# File 'lib/tabula/entities/page.rb', line 7
# Reader for the @cells instance variable (set to an empty array by the
# constructor).
def cells
  @cells
end
#file_path ⇒ Object (readonly)
Returns the value of attribute file_path.
5 6 7 |
# File 'lib/tabula/entities/page.rb', line 5
# Read-only accessor for the @file_path value given to the constructor.
def file_path
  @file_path
end
#min_char_height ⇒ Object
29 30 31 |
# File 'lib/tabula/entities/page.rb', line 29
# Returns the memoized minimum character height. When no value has been
# stored yet, it is computed as the smallest +height+ among +texts+ and
# cached in @min_char_height (nil when there are no texts).
def min_char_height
  return @min_char_height if @min_char_height

  @min_char_height = texts.map(&:height).min
end
#min_char_width ⇒ Object
25 26 27 |
# File 'lib/tabula/entities/page.rb', line 25
# Returns the memoized minimum character width. When no value has been
# stored yet, it is computed as the smallest +width+ among +texts+ and
# cached in @min_char_width (nil when there are no texts).
def min_char_width
  return @min_char_width if @min_char_width

  @min_char_width = texts.map(&:width).min
end
#number_one_indexed ⇒ Object (readonly)
Returns the value of attribute number_one_indexed.
5 6 7 |
# File 'lib/tabula/entities/page.rb', line 5
# Read-only accessor for the one-indexed page number stored by the
# constructor in @number_one_indexed.
def number_one_indexed
  @number_one_indexed
end
#rotation ⇒ Object (readonly)
Returns the value of attribute rotation.
5 6 7 |
# File 'lib/tabula/entities/page.rb', line 5
# Read-only accessor for the @rotation value given to the constructor.
def rotation
  @rotation
end
Instance Method Details
#collapse_oriented_rulings(lines) ⇒ Object
244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 |
# File 'lib/tabula/entities/page.rb', line 244
# Merges rulings that share a position and nearly intersect.
#
# lines - rulings that must all have the same orientation (all horizontal
#         or all vertical). Each must respond to #position, #start, #end,
#         #start=, #end=, #length and #nearlyIntersects?.
#
# The rulings are ordered by position, then by start. A ruling that has the
# same position as the previously kept one and nearly intersects it is
# absorbed by widening the kept ruling's start/end (the kept ruling object
# itself is modified). Other rulings are kept unless their length is zero.
# The first ruling in sorted order is always kept.
#
# The array passed in is no longer sorted or shifted in place; only a
# sorted copy is consumed.
#
# Returns an Array of the surviving rulings ([] for empty input).
def collapse_oriented_rulings(lines)
  return [] if lines.empty?

  first, *rest = lines.sort_by { |line| [line.position, line.start] }

  rest.each_with_object([first]) do |candidate, merged|
    tail = merged.last
    if candidate.position == tail.position && tail.nearlyIntersects?(candidate)
      # Widen the kept ruling so that it covers both segments.
      tail.start = candidate.start < tail.start ? candidate.start : tail.start
      tail.end = candidate.end < tail.end ? tail.end : candidate.end
    elsif candidate.length != 0
      merged << candidate
    end
  end
end
#fill_in_cells!(options = {}) ⇒ Object
122 123 124 125 126 127 128 129 |
# File 'lib/tabula/entities/page.rb', line 122
# Populates the text elements of every cell in every spreadsheet of this
# page and marks each spreadsheet as resolved.
#
# options - forwarded unchanged to #spreadsheets.
#
# The cell text is looked up with this page's own #get_cell_text; the
# previous `page.get_cell_text` referred to a `page` method that is not
# part of this class's documented interface. `cells_resolved` is set once
# per spreadsheet, so a spreadsheet without cells is marked as well.
#
# Returns whatever #spreadsheets returned.
def fill_in_cells!(options={})
  spreadsheets(options).each do |spreadsheet|
    spreadsheet.cells.each do |cell|
      cell.text_elements = get_cell_text(cell)
    end
    spreadsheet.cells_resolved = true
  end
end
#get_area(area) ⇒ Object
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/tabula/entities/page.rb', line 33
# Builds a PageArea describing a sub-region of this page.
#
# area - either an Array laid out as [top, left, bottom, right], which is
#        converted to a Tabula::ZoneEntity, or an object that already
#        responds to #width and #height and is accepted by #get_text.
#
# The PageArea receives this page's file path, rotation and number, the
# texts found inside the area, the page rulings cropped to the area, and
# the minimum width/height of those texts.
def get_area(area)
  if area.is_a?(Array)
    top, left, bottom, right = area
    area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
  end

  area_texts = get_text(area)
  cropped_rulings = Ruling.crop_rulings_to_area(@ruling_lines, area)

  PageArea.new(file_path,
               area.width,
               area.height,
               rotation,
               number,
               area_texts,
               cropped_rulings,
               area_texts.map(&:width).min,
               area_texts.map(&:height).min)
end
#get_cell_text(area = nil) ⇒ Object
185 186 187 |
# File 'lib/tabula/entities/page.rb', line 185
# Returns the text elements inside +area+ (as selected by #get_text)
# after passing them through TextElement.merge_words.
#
# area - anything #get_text accepts; nil selects all texts of the page.
def get_cell_text(area=nil)
  elements_in_area = get_text(area)
  TextElement.merge_words(elements_in_area)
end
#get_ruling_lines!(options = {}) ⇒ Object
Returns the ruling lines, memoizing the collapsed vertical and horizontal rulings in instance variables.
155 156 157 158 159 160 161 162 163 164 |
# File 'lib/tabula/entities/page.rb', line 155
# Returns the collapsed ruling lines of the page, vertical ones first.
#
# options - accepted for interface compatibility; not read by this method.
#
# When @ruling_lines is nil or empty, [] is returned and nothing else
# happens. Otherwise the ruling end points are snapped (#snap_points! runs
# on every call, including calls made after the results were memoized),
# and the collapsed vertical and horizontal rulings are memoized in
# @vertical_ruling_lines and @horizontal_ruling_lines.
def get_ruling_lines!(options={})
  return [] if @ruling_lines.nil? || @ruling_lines.empty?

  snap_points!
  @vertical_ruling_lines ||= collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
  @horizontal_ruling_lines ||= collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
  @vertical_ruling_lines + @horizontal_ruling_lines
end
#get_table(options = {}) ⇒ Object
returns a Table object
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/tabula/entities/page.rb', line 54
# Builds a Table from the page's texts.
#
# options - Hash; :vertical_rulings (default []) supplies column
#           separators explicitly. The merged options are also passed to
#           TextElement.merge_words.
#
# Returns [] (not a Table) when the page has no texts. Otherwise the texts
# are merged into chunks, grouped into lines, and every text element is
# placed into the column determined by the separators. Separators come
# from the +left+ of the given vertical rulings when any are supplied,
# otherwise from TextChunk.column_positions without its first entry.
def get_table(options={})
  options = {:vertical_rulings => []}.merge(options)
  return [] if texts.empty?

  text_chunks = TextElement.merge_words(texts, options).sort
  lines = TextChunk.group_by_lines(text_chunks)

  if options[:vertical_rulings].empty?
    columns = TextChunk.column_positions(text_chunks)
    # drop(1) yields [] for an empty list, where [1..-1] would yield nil.
    separators = columns.drop(1).sort.reverse
  else
    # pixel locations, not entities
    separators = options[:vertical_rulings].map(&:left).sort.reverse
  end

  table = Table.new(lines.count, separators)
  lines.each_with_index do |line, i|
    line.text_elements.each do |te|
      # Separators are in descending order: j is the index of the first
      # separator lying to the left of the element.
      j = separators.find_index { |s| te.left > s } || separators.count
      table.add_text_element(te, i, separators.count - j)
    end
  end

  table.lstrip_lines!
  table
end
#get_text(area = nil) ⇒ Object
Gets the text inside an area. The area can be an Array ([top, left, bottom, right]) or a Rectangle2D.
170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
# File 'lib/tabula/entities/page.rb', line 170
# Returns the text elements of the page that lie inside +area+.
#
# area - nil (all texts are returned), an Array laid out as
#        [top, left, bottom, right], or an object responding to
#        #contains (for example a Rectangle2D).
#
# Arrays are detected with is_a?, matching #get_area, so Array subclasses
# are converted to a Tabula::ZoneEntity as well instead of being treated
# as a rectangle.
def get_text(area=nil)
  if area.is_a?(Array)
    top, left, bottom, right = area
    area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
  end

  return texts if area.nil?

  texts.select { |t| area.contains(t) }
end
#horizontal_ruling_lines ⇒ Object
144 145 146 147 |
# File 'lib/tabula/entities/page.rb', line 144
# Returns the memoized horizontal rulings, or [] when none have been
# computed. #get_ruling_lines! is invoked first so that the memoized value
# is populated whenever the page has rulings.
def horizontal_ruling_lines
  get_ruling_lines!
  return [] if @horizontal_ruling_lines.nil?

  @horizontal_ruling_lines
end
#make_table(options = {}) ⇒ Object
for API backwards-compatibility reasons, this returns an array of arrays.
85 86 87 |
# File 'lib/tabula/entities/page.rb', line 85
# For API backwards-compatibility reasons, this returns an array of arrays:
# the rows of the table built by #get_table.
#
# options - forwarded unchanged to #get_table.
#
# #get_table returns a plain [] for a page without texts; that value has
# no #rows, so it is returned as-is (an empty list of rows) instead of
# raising NoMethodError.
def make_table(options={})
  table = get_table(options)
  table.respond_to?(:rows) ? table.rows : table
end
#number(indexing_base = :one_indexed) ⇒ Object
131 132 133 134 135 136 137 |
# File 'lib/tabula/entities/page.rb', line 131
# Returns the page number in the requested indexing scheme.
#
# indexing_base - :zero_indexed yields the stored one-indexed number minus
#                 one; any other value (default :one_indexed) yields the
#                 stored number unchanged.
def number(indexing_base=:one_indexed)
  indexing_base == :zero_indexed ? @number_one_indexed - 1 : @number_one_indexed
end
#ruling_lines ⇒ Object
TODO no need for this, let’s choose one name
140 141 142 |
# File 'lib/tabula/entities/page.rb', line 140
# Alias-style wrapper: returns exactly what #get_ruling_lines! returns
# when called without arguments.
# TODO: no need for this, let's choose one name.
def ruling_lines
  get_ruling_lines!
end
#snap_points! ⇒ Object
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
# File 'lib/tabula/entities/page.rb', line 198
# Snaps the end points of the page's rulings together so that points lying
# closer than one minimum character cell share the same coordinate.
#
# For each axis (x using #min_char_width, y using #min_char_height) the
# end points are sorted by that coordinate and grouped: a point joins the
# current group when its distance to the group's FIRST point is smaller
# than the cell size. Every point in a group is then moved to the average
# of the group's distinct coordinates. Finally each ruling is updated from
# its two (possibly moved) points via Java's setLine.
#
# Robustness: nil or empty @ruling_lines is a no-op, and an axis whose
# cell size is nil (no texts on the page) is skipped instead of raising.
#
# Returns the Hash mapping each ruling to its [p1, p2] pair.
def snap_points!
  lines_to_points = {}
  points = []
  (@ruling_lines || []).each do |line|
    # For a given line, each call to #p1 and #p2 creates a new
    # Point2D::Float object rather than returning the same one, so both
    # are fetched once and the same objects are stored in the hash and in
    # the flat list; modifying a point through one is visible via the other.
    endpoints = [line.p1, line.p2]
    lines_to_points[line] = endpoints
    points.concat(endpoints)
  end
  return lines_to_points if points.empty?

  # Lines are stored separately from their constituent points, so the
  # points are adjusted first and the lines are rebuilt from them below.
  [[:x, :x=, min_char_width], [:y, :y=, min_char_height]].each do |getter, setter, cell_size|
    next if cell_size.nil?

    first_point, *remaining = points.sort_by(&getter)
    grouped_points = remaining.each_with_object([[first_point]]) do |point, groups|
      anchor = groups.last.first
      if (point.send(getter) - anchor.send(getter)).abs < cell_size
        groups.last << point
      else
        groups << [point]
      end
    end

    grouped_points.each do |group|
      uniq_locs = group.map(&getter).uniq
      avg_loc = uniq_locs.sum / uniq_locs.size
      group.each { |p| p.send(setter, avg_loc) }
    end
  end

  point_type = java.awt.geom.Point2D
  lines_to_points.each do |l, p1_p2|
    l.java_send :setLine, [point_type, point_type], p1_p2[0], p1_p2[1]
  end
end
#spreadsheets(options = {}) ⇒ Object
returns the Spreadsheets; creating them if they’re not memoized
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/tabula/entities/page.rb', line 90
# Returns the Spreadsheets of the page, creating them if they are not
# memoized yet.
#
# options - forwarded to #get_ruling_lines! and #find_cells!; when
#           options[:fill_in_cells] is truthy, #fill_in_cells! is called
#           after the spreadsheets have been built.
#
# A memoized @spreadsheets value is returned immediately and +options+ is
# then ignored.
def spreadsheets(options={})
  return @spreadsheets unless @spreadsheets.nil?

  get_ruling_lines!(options)
  find_cells!(options)

  # Literally java.awt.geom.Area objects, i.e. polygons.
  spreadsheet_areas = find_spreadsheets_from_cells

  # Transform each spreadsheet area into a rectangle and get the cells
  # contained within it. getBounds2D is theoretically better, but returns
  # a Rectangle2D.Double, which doesn't have our Ruby sugar on it.
  spreadsheet_rectangle_areas = spreadsheet_areas.map { |a| a.getBounds }

  @spreadsheets = spreadsheet_rectangle_areas.map do |rect|
    spr = Spreadsheet.new(rect.y, rect.x,
                          rect.width, rect.height,
                          self,
                          # TODO: keep track of the cells, instead of getting them again inefficiently.
                          [],
                          vertical_ruling_lines.select { |vl| rect.intersectsLine(vl) },
                          horizontal_ruling_lines.select { |hl| rect.intersectsLine(hl) })
    spr.cells = @cells.select { |c| spr.overlaps?(c) }
    spr.add_spanning_cells!
    spr
  end

  fill_in_cells! if options[:fill_in_cells]

  @spreadsheets
end
#to_json(options = {}) ⇒ Object
189 190 191 192 193 194 195 196 |
# File 'lib/tabula/entities/page.rb', line 189
# Serializes the page to a JSON string.
#
# options - forwarded unchanged to Hash#to_json.
#
# The JSON object has the keys width, height, number (as returned by
# #number with its default indexing), rotation and texts, in that order.
def to_json(options={})
  {
    :width => width,
    :height => height,
    :number => number,
    :rotation => rotation,
    :texts => texts
  }.to_json(options)
end
#vertical_ruling_lines ⇒ Object
149 150 151 152 |
# File 'lib/tabula/entities/page.rb', line 149
# Returns the memoized vertical rulings, or [] when none have been
# computed. #get_ruling_lines! is invoked first so that the memoized value
# is populated whenever the page has rulings.
def vertical_ruling_lines
  get_ruling_lines!
  return [] if @vertical_ruling_lines.nil?

  @vertical_ruling_lines
end