Class: Tabula::Page
- Inherits:
-
ZoneEntity
- Object
- java.awt.geom.Rectangle2D::Float
- ZoneEntity
- Tabula::Page
- Includes:
- HasCells
- Defined in:
- lib/tabula/entities/page.rb
Direct Known Subclasses
Constant Summary
Constants included from HasCells
HasCells::ARBITRARY_MAGIC_HEURISTIC_NUMBER
Instance Attribute Summary collapse
-
#cells ⇒ Object
Returns the value of attribute cells.
-
#file_path ⇒ Object
readonly
Returns the value of attribute file_path.
-
#min_char_height ⇒ Object
writeonly
Sets the attribute min_char_height.
-
#min_char_width ⇒ Object
writeonly
Sets the attribute min_char_width.
-
#number_one_indexed ⇒ Object
readonly
Returns the value of attribute number_one_indexed.
-
#rotation ⇒ Object
readonly
Returns the value of attribute rotation.
Attributes inherited from ZoneEntity
Instance Method Summary collapse
- #fill_in_cell_texts!(areas) ⇒ Object
- #fill_in_cells!(options = {}) ⇒ Object
- #get_area(area) ⇒ Object
- #get_cell_text(area = nil) ⇒ Object
- #get_min_char_height ⇒ Object
- #get_min_char_width ⇒ Object
-
#get_ruling_lines!(options = {}) ⇒ Object
Returns the ruling lines, memoizing them in @vertical_ruling_lines and @horizontal_ruling_lines.
-
#get_table(options = {}) ⇒ Object
returns a Table object.
-
#get_text(area = nil) ⇒ Object
Gets the text inside an area. The area can be an Array ([top, left, bottom, right]) or a Rectangle2D.
- #has_text? ⇒ Boolean
- #horizontal_ruling_lines ⇒ Object
-
#initialize(file_path, width, height, rotation, number, texts = [], ruling_lines = [], min_char_width = nil, min_char_height = nil, spatial_index = nil) ⇒ Page
constructor
A new instance of Page.
-
#make_table(options = {}) ⇒ Object
for API backwards-compatibility reasons, this returns an array of arrays.
- #minimal_bounding_box_of_ruling_lines ⇒ Object
-
#minimal_bounding_box_of_text_elements ⇒ Object
Is there a scenario under which we’d prefer to use this over `minimal_bounding_box_of_ruling_lines`? If so, what is it? If there are no ruling lines on the page _at all_, then adding this bounding box is useless.
- #number(indexing_base = :one_indexed) ⇒ Object
-
#ruling_lines ⇒ Object
TODO no need for this, let’s choose one name.
- #snap_points! ⇒ Object
- #spreadsheet_areas(options = {}) ⇒ Object
-
#spreadsheets(options = {}) ⇒ Object
returns the Spreadsheets; creating them if they’re not memoized.
- #to_json(options = {}) ⇒ Object
- #vertical_ruling_lines ⇒ Object
Methods included from HasCells
#find_cells!, #find_spreadsheets_from_cells, #heuristic_ratio, #is_tabular?
Methods inherited from ZoneEntity
#<=>, #inspect, #merge!, #points, #tlbr, #tlwh
Constructor Details
#initialize(file_path, width, height, rotation, number, texts = [], ruling_lines = [], min_char_width = nil, min_char_height = nil, spatial_index = nil) ⇒ Page
Returns a new instance of Page.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/tabula/entities/page.rb', line 9
# Builds a page whose zone starts at (0, 0) and spans +width+ x +height+.
#
# @param file_path [Object] stored in @file_path
# @param width [Numeric] forwarded to the superclass constructor
# @param height [Numeric] forwarded to the superclass constructor
# @param rotation [Object] stored in @rotation
# @param number [Integer] one-indexed page number; must be >= 1
# @param texts [Array] text elements assigned through +texts=+
# @param ruling_lines [Array] initial ruling lines
# @param min_char_width [Numeric, nil] optional precomputed minimum width
# @param min_char_height [Numeric, nil] optional precomputed minimum height
# @param spatial_index [Object, nil] existing index to reuse; when nil a
#   new TextElementIndex is filled with this page's texts
# @raise [ArgumentError] when +number+ is less than 1
def initialize(file_path, width, height, rotation, number, texts = [], ruling_lines = [], min_char_width = nil, min_char_height = nil, spatial_index = nil)
  super(0, 0, width, height)
  @rotation = rotation
  raise ArgumentError, "Tabula::Page numbers are one-indexed; numbers < 1 are invalid." if number < 1

  @ruling_lines = ruling_lines
  @file_path = file_path
  @number_one_indexed = number
  @cells = []
  @spreadsheets = nil
  @min_char_width = min_char_width
  @min_char_height = min_char_height

  self.texts = texts

  # Append the edges of the rulings' bounding box as extra rulings,
  # keeping only the ones that report themselves as finite.
  bounding_box = minimal_bounding_box_of_ruling_lines
  edge_rulings = bounding_box.to_lines.map do |edge|
    Ruling.new(edge.getY1, edge.getX1, edge.getX2 - edge.getX1, edge.getY2 - edge.getY1)
  end
  @ruling_lines += edge_rulings.select(&:finite?)

  @spatial_index = spatial_index
  if @spatial_index.nil?
    @spatial_index = TextElementIndex.new
    self.texts.each { |text_element| @spatial_index << text_element }
  end
end
Instance Attribute Details
#cells ⇒ Object
Returns the value of attribute cells.
7 8 9 |
# File 'lib/tabula/entities/page.rb', line 7
# Reader for the @cells instance variable.
#
# @return [Object] the current value of @cells
def cells
  @cells
end
#file_path ⇒ Object (readonly)
Returns the value of attribute file_path.
5 6 7 |
# File 'lib/tabula/entities/page.rb', line 5
# Reader for the @file_path instance variable.
#
# @return [Object] the current value of @file_path
def file_path
  @file_path
end
#min_char_height=(value) ⇒ Object (writeonly)
Sets the attribute min_char_height
6 7 8 |
# File 'lib/tabula/entities/page.rb', line 6
# Writer for the @min_char_height instance variable.
#
# @param value [Object] the new value
def min_char_height=(value)
  @min_char_height = value
end
#min_char_width=(value) ⇒ Object (writeonly)
Sets the attribute min_char_width
6 7 8 |
# File 'lib/tabula/entities/page.rb', line 6
# Writer for the @min_char_width instance variable.
#
# @param value [Object] the new value
def min_char_width=(value)
  @min_char_width = value
end
#number_one_indexed ⇒ Object (readonly)
Returns the value of attribute number_one_indexed.
5 6 7 |
# File 'lib/tabula/entities/page.rb', line 5
# Reader for the @number_one_indexed instance variable.
#
# @return [Object] the current value of @number_one_indexed
def number_one_indexed
  @number_one_indexed
end
#rotation ⇒ Object (readonly)
Returns the value of attribute rotation.
5 6 7 |
# File 'lib/tabula/entities/page.rb', line 5
# Reader for the @rotation instance variable.
#
# @return [Object] the current value of @rotation
def rotation
  @rotation
end
Instance Method Details
#fill_in_cell_texts!(areas) ⇒ Object
246 247 248 249 250 251 252 253 254 |
# File 'lib/tabula/entities/page.rb', line 246
# Distributes this page's text elements over +areas+, then merges the
# elements collected in each area with TextElement.merge_words.
#
# Each text element goes to the first area whose +contains+ accepts it;
# elements that no area contains are skipped.
#
# @param areas [Array] objects responding to +contains+, +text_elements+
#   and +text_elements=+
# @return [Array] the +areas+ argument
def fill_in_cell_texts!(areas)
  texts.each do |text_element|
    enclosing = areas.find { |candidate| candidate.contains(text_element) }
    next if enclosing.nil?

    enclosing.text_elements << text_element
  end

  areas.each { |zone| zone.text_elements = TextElement.merge_words(zone.text_elements) }
end
#fill_in_cells!(options = {}) ⇒ Object
177 178 179 180 181 182 183 184 |
# File 'lib/tabula/entities/page.rb', line 177
# Resolves the text of every cell of every spreadsheet on this page.
#
# For each spreadsheet returned by +spreadsheets(options)+, every cell's
# +text_elements+ is set from +get_cell_text(cell)+ and the spreadsheet is
# then flagged with +cells_resolved = true+.
#
# NOTE(review): the snippet previously called +page.get_cell_text+, but no
# +page+ method is listed for this class, so the lookup now goes through
# this page itself — confirm against the repository source.
#
# @param options [Hash] forwarded unchanged to +spreadsheets+
# @return [Array] the spreadsheets that were processed
def fill_in_cells!(options = {})
  spreadsheets(options).each do |spreadsheet|
    spreadsheet.cells.each do |cell|
      cell.text_elements = get_cell_text(cell)
    end
    spreadsheet.cells_resolved = true
  end
end
#get_area(area) ⇒ Object
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/tabula/entities/page.rb', line 77
# Builds a PageArea for a sub-region of this page.
#
# An Array argument is read as [top, left, bottom, right] and converted to
# a Tabula::ZoneEntity first. The PageArea receives the texts found in the
# region by +get_text+, the ruling lines cropped to the region, the
# minimum width and height of those texts, and this page's spatial index.
#
# @param area [Array, Object] [top, left, bottom, right] or an object
#   responding to +width+ and +height+
# @return [PageArea]
def get_area(area)
  zone = area
  if zone.is_a?(Array)
    top, left, bottom, right = zone
    zone = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
  end

  area_texts = get_text(zone)

  PageArea.new(
    file_path,
    zone.width,
    zone.height,
    rotation,
    number,
    area_texts,
    Ruling.crop_rulings_to_area(@ruling_lines, zone),
    area_texts.map(&:width).min,
    area_texts.map(&:height).min,
    @spatial_index
  )
end
#get_cell_text(area = nil) ⇒ Object
256 257 258 |
# File 'lib/tabula/entities/page.rb', line 256
# Fetches the text elements inside +area+ via +get_text+ and merges them
# with TextElement.merge_words.
#
# @param area [Object, nil] passed straight to +get_text+
# @return [Object] whatever TextElement.merge_words returns
def get_cell_text(area = nil)
  elements_in_area = get_text(area)
  TextElement.merge_words(elements_in_area)
end
#get_min_char_height ⇒ Object
73 74 75 |
# File 'lib/tabula/entities/page.rb', line 73
# Smallest height among this page's text elements, memoized in
# @min_char_height. Falls back to Float::INFINITY when there are no texts.
#
# @return [Numeric]
def get_min_char_height
  return @min_char_height if @min_char_height

  smallest = texts.map { |text_element| text_element.height }.min
  @min_char_height = smallest || ::Float::INFINITY
end
#get_min_char_width ⇒ Object
69 70 71 |
# File 'lib/tabula/entities/page.rb', line 69
# Smallest width among this page's text elements, memoized in
# @min_char_width. Falls back to Float::INFINITY when there are no texts.
#
# @return [Numeric]
def get_min_char_width
  return @min_char_width if @min_char_width

  smallest = texts.map { |text_element| text_element.width }.min
  @min_char_width = smallest || ::Float::INFINITY
end
#get_ruling_lines!(options = {}) ⇒ Object
Returns the ruling lines, memoizing them in @vertical_ruling_lines and @horizontal_ruling_lines.
214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
# File 'lib/tabula/entities/page.rb', line 214
# Returns this page's ruling lines, vertical ones first, then horizontal.
#
# When @ruling_lines is nil or empty an empty array is returned and
# nothing else happens. Otherwise the line endpoints are snapped with
# +snap_points!+, rulings whose width and height are both zero are removed
# from @ruling_lines in place, and the collapsed vertical and horizontal
# rulings are memoized in @vertical_ruling_lines and
# @horizontal_ruling_lines.
#
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Array] vertical rulings followed by horizontal rulings
def get_ruling_lines!(options = {})
  return [] if @ruling_lines.nil? || @ruling_lines.empty?

  snap_points!

  # Drop degenerate rulings (zero width and zero height).
  @ruling_lines.reject! { |line| line.width == 0 && line.height == 0 }

  @vertical_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
  @horizontal_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
  @vertical_ruling_lines + @horizontal_ruling_lines
end
#get_table(options = {}) ⇒ Object
returns a Table object
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/tabula/entities/page.rb', line 99
# Builds a Table from this page's text elements.
#
# Texts are sorted, merged into chunks with TextElement.merge_words and
# grouped into lines ordered by +top+. Column boundaries come from the
# +left+ of each entry in +options[:vertical_rulings]+ when any are given,
# otherwise from TextChunk.column_positions. Each text element that is not
# only spaces is placed in the first column whose boundary is at or to the
# right of its +left+, or in the column after the last boundary.
#
# @param options [Hash] +:vertical_rulings+ (defaults to []) selects the
#   column boundaries; the merged hash is also passed to
#   TextElement.merge_words
# @return [Table] an empty +Tabula::Table.new(0, [])+ when the page has no
#   texts
def get_table(options = {})
  options = { :vertical_rulings => [] }.merge(options)
  return Tabula::Table.new(0, []) if texts.empty?

  sorted_texts = texts.sort
  text_chunks = TextElement.merge_words(sorted_texts, options)
  lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)

  columns = if options[:vertical_rulings].empty?
              TextChunk.column_positions(lines).sort
            else
              options[:vertical_rulings].map(&:left).sort # pixel locations, not entities
            end

  table = Table.new(lines.count, columns)
  lines.each_with_index do |line, i|
    line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
      j = columns.find_index { |s| te.left <= s } || columns.count
      table.add_text_element(te, i, j)
    end
  end

  # fixes up the table a little bit, replacing nils with empty TextElements
  # and sorting the lines.
  # table.rows.each do |l|
  #   l.text_elements = l.text_elements.map do |te|
  #     te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
  #   end
  # end
  # table.rows.sort_by!(&:top)
  table
end
#get_text(area = nil) ⇒ Object
Gets the text inside an area. The area can be an Array ([top, left, bottom, right]) or a Rectangle2D.
233 234 235 236 237 238 239 240 241 242 243 244 |
# File 'lib/tabula/entities/page.rb', line 233
# Returns the text elements inside +area+.
#
# With no area (nil) every text element of the page is returned. An Array
# is read as [top, left, bottom, right] and converted to a
# Tabula::ZoneEntity; any other value is handed to the spatial index as is.
#
# @param area [Array, Object, nil] region to query
# @return [Object] +texts+ for a nil area, otherwise the result of
#   +@spatial_index.contains+
def get_text(area = nil)
  return texts if area.nil?

  region = area
  if region.instance_of?(Array)
    top, left, bottom, right = region
    region = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
  end
  @spatial_index.contains(region)
end
#has_text? ⇒ Boolean
194 195 196 |
# File 'lib/tabula/entities/page.rb', line 194
# Tells whether this page has any text elements.
#
# @return [Boolean] false when +texts+ is empty, true otherwise
def has_text?
  texts.empty? ? false : true
end
#horizontal_ruling_lines ⇒ Object
203 204 205 206 |
# File 'lib/tabula/entities/page.rb', line 203
# Horizontal ruling lines of this page.
#
# Calls +get_ruling_lines!+ first, then returns @horizontal_ruling_lines,
# or an empty array when that is still nil.
#
# @return [Array]
def horizontal_ruling_lines
  get_ruling_lines!
  return [] if @horizontal_ruling_lines.nil?

  @horizontal_ruling_lines
end
#make_table(options = {}) ⇒ Object
for API backwards-compatibility reasons, this returns an array of arrays.
136 137 138 |
# File 'lib/tabula/entities/page.rb', line 136
# For API backwards-compatibility reasons, this returns the rows of the
# table built by +get_table+ rather than the Table object itself.
#
# @param options [Hash] forwarded unchanged to +get_table+
# @return [Object] the +rows+ of the resulting table
def make_table(options = {})
  get_table(options).rows
end
#minimal_bounding_box_of_ruling_lines ⇒ Object
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/tabula/entities/page.rb', line 36
# Bounding rectangle derived from this page's ruling lines.
#
# The horizontal extent comes only from the horizontal rulings (their
# +left+ and +right+); the vertical extent comes only from the vertical
# rulings (their +top+ and +bottom+). Minimums start at Float::INFINITY
# and maximums at 0, so with no rulings the rectangle is not finite.
#
# @return [java.awt.geom.Rectangle2D::Float]
def minimal_bounding_box_of_ruling_lines
  min_x = ::Float::INFINITY
  min_y = ::Float::INFINITY
  max_x = 0
  max_y = 0

  horizontal_ruling_lines.each do |ruling|
    min_x = ruling.left if ruling.left < min_x
    max_x = ruling.right if ruling.right > max_x
  end

  vertical_ruling_lines.each do |ruling|
    min_y = ruling.top if ruling.top < min_y
    max_y = ruling.bottom if ruling.bottom > max_y
  end

  box_width = max_x - min_x
  box_height = max_y - min_y
  java.awt.geom.Rectangle2D::Float.new(min_x, min_y, box_width, box_height)
end
#minimal_bounding_box_of_text_elements ⇒ Object
Is there a scenario under which we’d prefer to use this over `minimal_bounding_box_of_ruling_lines`? If so, what is it? If there are no ruling lines on the page _at all_, then adding this bounding box is useless.
55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/tabula/entities/page.rb', line 55
# Bounding rectangle of the +x+/+y+ positions of the elements in @texts.
#
# Minimums start at Float::INFINITY and maximums at 0.
# NOTE(review): only each element's x and y are considered, not its width
# or height — confirm this is intended.
#
# @return [java.awt.geom.Rectangle2D::Float]
def minimal_bounding_box_of_text_elements
  min_x = ::Float::INFINITY
  min_y = ::Float::INFINITY
  max_x = 0
  max_y = 0

  @texts.each do |element|
    min_x = element.x if element.x < min_x
    min_y = element.y if element.y < min_y
    max_x = element.x if element.x > max_x
    max_y = element.y if element.y > max_y
  end

  box_width = max_x - min_x
  box_height = max_y - min_y
  java.awt.geom.Rectangle2D::Float.new(min_x, min_y, box_width, box_height)
end
#number(indexing_base = :one_indexed) ⇒ Object
186 187 188 189 190 191 192 |
# File 'lib/tabula/entities/page.rb', line 186
# Page number in the requested indexing base.
#
# @param indexing_base [Symbol] +:zero_indexed+ yields the stored number
#   minus one; any other value yields the stored one-indexed number
# @return [Integer]
def number(indexing_base = :one_indexed)
  return @number_one_indexed unless indexing_base == :zero_indexed

  @number_one_indexed - 1
end
#ruling_lines ⇒ Object
TODO no need for this, let’s choose one name
199 200 201 |
# File 'lib/tabula/entities/page.rb', line 199
# Alias-style wrapper that returns whatever +get_ruling_lines!+ returns.
#
# TODO: no need for this, let's choose one name.
#
# @return [Array]
def ruling_lines
  get_ruling_lines!
end
#snap_points! ⇒ Object
269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 |
# File 'lib/tabula/entities/page.rb', line 269
# Snaps nearby ruling-line endpoints onto shared coordinates.
#
# Along x and then along y, endpoints are sorted and grouped: a point
# joins the current group while its coordinate is less than the minimum
# character width (for x) or height (for y) away from the group's first
# point. Every point in a group is then moved to the average of the
# group's distinct coordinates, and each line is rewritten from its two
# adjusted points.
#
# Returns an empty hash without doing anything when @ruling_lines is nil
# or empty; previously that case raised NoMethodError because the grouping
# was seeded with a nil point.
#
# @return [Hash] each ruling line mapped to its [p1, p2] pair
def snap_points!
  return {} if @ruling_lines.nil? || @ruling_lines.empty?

  lines_to_points = {}
  points = []
  @ruling_lines.each do |line|
    # Each call to #p1 / #p2 creates a new point object rather than
    # returning the same one, so fetch each once and share that reference
    # between the hash and the flat list: modifying the point through one
    # modifies it for the other.
    point1 = line.p1
    point2 = line.p2
    lines_to_points[line] = [point1, point2]
    points << point1 << point2
  end

  # Lines are stored separately from their constituent points, so the
  # points are adjusted here and written back to the lines afterwards.
  [[:x, :x=, get_min_char_width],
   [:y, :y=, get_min_char_height]].each do |getter, setter, cell_size|
    sorted_points = points.sort_by(&getter)
    first_point = sorted_points.shift
    grouped_points = sorted_points.inject([[first_point]]) do |memo, next_point|
      last = memo.last
      if (next_point.send(getter) - last.first.send(getter)).abs < cell_size
        memo[-1] << next_point
      else
        memo << [next_point]
      end
      memo
    end

    grouped_points.each do |group|
      uniq_locs = group.map(&getter).uniq
      avg_loc = uniq_locs.sum / uniq_locs.size
      group.each { |p| p.send(setter, avg_loc) }
    end
  end

  lines_to_points.each do |l, p1_p2|
    l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
  end
end
#spreadsheet_areas(options = {}) ⇒ Object
165 166 167 168 169 170 171 172 173 174 175 |
# File 'lib/tabula/entities/page.rb', line 165
# Bounding rectangles of the spreadsheet regions found on this page.
#
# Refreshes the ruling lines, finds cells from the horizontal and vertical
# rulings, asks +find_spreadsheets_from_cells+ for the spreadsheet areas
# (java.awt.geom.Area polygons) and maps each one to its bounds.
#
# getBounds2D is theoretically better than getBounds, but it returns a
# Rectangle2D.Double, which doesn't have our Ruby sugar on it.
#
# @param options [Hash] forwarded to +get_ruling_lines!+ and +find_cells!+
# @return [Array] one +getBounds+ result per spreadsheet area
def spreadsheet_areas(options = {})
  get_ruling_lines!(options)
  find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options)

  spreadsheet_java_areas = find_spreadsheets_from_cells
  spreadsheet_java_areas.map { |a| a.getBounds }
end
#spreadsheets(options = {}) ⇒ Object
returns the Spreadsheets; creating them if they’re not memoized
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
# File 'lib/tabula/entities/page.rb', line 141
# Returns the Spreadsheets on this page, creating them if they are not
# already memoized in @spreadsheets.
#
# One Spreadsheet is built per rectangle from +spreadsheet_areas+, given
# the vertical and horizontal rulings that intersect the rectangle. Its
# cells are the entries of @cells it overlaps, after which
# +add_spanning_cells!+ is called on it.
#
# @param options [Hash] forwarded to +spreadsheet_areas+; a truthy
#   +:fill_in_cells+ also triggers +fill_in_cells!+ after the spreadsheets
#   are first built
# @return [Array] the memoized spreadsheets
def spreadsheets(options = {})
  return @spreadsheets unless @spreadsheets.nil?

  @spreadsheets = spreadsheet_areas(options).map do |rect|
    spr = Spreadsheet.new(rect.y, rect.x,
                          rect.width, rect.height,
                          self,
                          # TODO: keep track of the cells, instead of getting them again inefficiently.
                          [],
                          vertical_ruling_lines.select { |vl| rect.intersectsLine(vl) },
                          horizontal_ruling_lines.select { |hl| rect.intersectsLine(hl) })
    spr.cells = @cells.select { |c| spr.overlaps?(c) }
    spr.add_spanning_cells!
    spr
  end

  fill_in_cells! if options[:fill_in_cells]
  @spreadsheets
end
#to_json(options = {}) ⇒ Object
260 261 262 263 264 265 266 267 |
# File 'lib/tabula/entities/page.rb', line 260
# Serializes a summary of this page to JSON.
#
# The summary hash has the keys +:width+, +:height+, +:number+,
# +:rotation+ and +:hasText+, filled from the methods of the same name
# (+has_text?+ for +:hasText+).
#
# @param options [Hash] forwarded to Hash#to_json
# @return [String] the JSON text
def to_json(options = {})
  summary = {
    :width => width,
    :height => height,
    :number => number,
    :rotation => rotation,
    :hasText => has_text?
  }
  summary.to_json(options)
end
#vertical_ruling_lines ⇒ Object
208 209 210 211 |
# File 'lib/tabula/entities/page.rb', line 208
# Vertical ruling lines of this page.
#
# Calls +get_ruling_lines!+ first, then returns @vertical_ruling_lines, or
# an empty array when that is still nil.
#
# @return [Array]
def vertical_ruling_lines
  get_ruling_lines!
  return [] if @vertical_ruling_lines.nil?

  @vertical_ruling_lines
end