Class: Tabula::Page
- Inherits:
-
ZoneEntity
- Object
- java.awt.geom.Rectangle2D::Float
- ZoneEntity
- Tabula::Page
- Includes:
- HasCells
- Defined in:
- lib/tabula/entities/page.rb
Direct Known Subclasses
Constant Summary
Constants included from HasCells
HasCells::ARBITRARY_MAGIC_HEURISTIC_NUMBER
Instance Attribute Summary collapse
-
#cells ⇒ Object
Returns the value of attribute cells.
-
#file_path ⇒ Object
readonly
Returns the value of attribute file_path.
- #min_char_height ⇒ Object
- #min_char_width ⇒ Object
-
#number_one_indexed ⇒ Object
readonly
Returns the value of attribute number_one_indexed.
-
#rotation ⇒ Object
readonly
Returns the value of attribute rotation.
Attributes inherited from ZoneEntity
Instance Method Summary collapse
- #fill_in_cell_texts!(areas) ⇒ Object
- #fill_in_cells!(options = {}) ⇒ Object
- #get_area(area) ⇒ Object
- #get_cell_text(area = nil) ⇒ Object
-
#get_ruling_lines!(options = {}) ⇒ Object
Returns the ruling lines, memoizing the collapsed vertical and horizontal sets in @vertical_ruling_lines and @horizontal_ruling_lines.
-
#get_table(options = {}) ⇒ Object
returns a Table object.
-
#get_text(area = nil) ⇒ Object
Get the text inside an area. The area can be an Array ([top, left, bottom, right]) or a Rectangle2D.
- #horizontal_ruling_lines ⇒ Object
-
#initialize(file_path, width, height, rotation, number, texts = [], ruling_lines = [], min_char_width = nil, min_char_height = nil, spatial_index = nil) ⇒ Page
constructor
A new instance of Page.
-
#make_table(options = {}) ⇒ Object
for API backwards-compatibility reasons, this returns an array of arrays.
- #number(indexing_base = :one_indexed) ⇒ Object
-
#ruling_lines ⇒ Object
TODO no need for this, let’s choose one name.
- #snap_points! ⇒ Object
-
#spreadsheets(options = {}) ⇒ Object
returns the Spreadsheets; creating them if they’re not memoized.
- #to_json(options = {}) ⇒ Object
- #vertical_ruling_lines ⇒ Object
Methods included from HasCells
#find_cells!, #find_spreadsheets_from_cells, #heuristic_ratio, #is_tabular?
Methods inherited from ZoneEntity
#<=>, #inspect, #merge!, #points, #tlbr, #tlwh
Constructor Details
#initialize(file_path, width, height, rotation, number, texts = [], ruling_lines = [], min_char_width = nil, min_char_height = nil, spatial_index = nil) ⇒ Page
Returns a new instance of Page.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
# File 'lib/tabula/entities/page.rb', line 9
#
# Builds a page whose zone starts at (0, 0) and spans width x height.
#
# @param file_path [Object] stored as-is and exposed through #file_path
# @param width, height passed to the superclass constructor as the zone size
# @param rotation stored as-is and exposed through #rotation
# @param number one-indexed page number; must be >= 1
# @param texts text elements on the page (assigned through #texts=)
# @param ruling_lines ruling lines found on the page
# @param min_char_width, min_char_height optional precomputed minimums;
#   when nil they are derived lazily from the texts
# @param spatial_index optional prebuilt index; when nil a new
#   TextElementIndex is filled with every text element
# @raise [ArgumentError] if number is less than 1
def initialize(file_path, width, height, rotation, number, texts=[], ruling_lines=[], min_char_width=nil, min_char_height=nil, spatial_index=nil)
  # Validate first so an invalid page number fails before any state is built.
  if number < 1
    raise ArgumentError, "Tabula::Page numbers are one-indexed; numbers < 1 are invalid."
  end

  super(0, 0, width, height)
  @rotation = rotation
  @ruling_lines = ruling_lines
  @file_path = file_path
  @number_one_indexed = number
  @cells = []
  @spreadsheets = nil
  @min_char_width = min_char_width
  @min_char_height = min_char_height
  self.texts = texts

  if spatial_index.nil?
    @spatial_index = TextElementIndex.new
    self.texts.each { |te| @spatial_index << te }
  else
    @spatial_index = spatial_index
  end
end
Instance Attribute Details
#cells ⇒ Object
Returns the value of attribute cells.
7 8 9 |
# File 'lib/tabula/entities/page.rb', line 7
#
# Returns the value of attribute cells.
def cells
  @cells
end
#file_path ⇒ Object (readonly)
Returns the value of attribute file_path.
5 6 7 |
# File 'lib/tabula/entities/page.rb', line 5
#
# Returns the value of attribute file_path (read-only).
def file_path
  @file_path
end
#min_char_height ⇒ Object
38 39 40 |
# File 'lib/tabula/entities/page.rb', line 38
#
# Smallest height among the page's text elements, memoized in
# @min_char_height. A value supplied to the constructor takes precedence.
# Yields nil when the page has no texts (and is then recomputed on the next
# call, because nil is not memoized by ||=).
def min_char_height
  @min_char_height ||= texts.map(&:height).min
end
#min_char_width ⇒ Object
34 35 36 |
# File 'lib/tabula/entities/page.rb', line 34
#
# Smallest width among the page's text elements, memoized in
# @min_char_width. A value supplied to the constructor takes precedence.
# Yields nil when the page has no texts (and is then recomputed on the next
# call, because nil is not memoized by ||=).
def min_char_width
  @min_char_width ||= texts.map(&:width).min
end
#number_one_indexed ⇒ Object (readonly)
Returns the value of attribute number_one_indexed.
5 6 7 |
# File 'lib/tabula/entities/page.rb', line 5
#
# Returns the value of attribute number_one_indexed (read-only): the page
# number as given to the constructor, which rejects values below 1.
def number_one_indexed
  @number_one_indexed
end
#rotation ⇒ Object (readonly)
Returns the value of attribute rotation.
5 6 7 |
# File 'lib/tabula/entities/page.rb', line 5
#
# Returns the value of attribute rotation (read-only).
def rotation
  @rotation
end
Instance Method Details
#fill_in_cell_texts!(areas) ⇒ Object
203 204 205 206 207 208 209 210 211 |
# File 'lib/tabula/entities/page.rb', line 203
#
# Distributes the page's text elements into the given areas, then merges
# each area's elements into words.
#
# Every text element goes to the FIRST area whose #contains accepts it;
# elements contained by no area are dropped.
#
# @param areas [Array] objects responding to #contains, #text_elements and
#   #text_elements=
# @return [Array] the same areas collection
def fill_in_cell_texts!(areas)
  texts.each do |text_element|
    containing_area = areas.find { |a| a.contains(text_element) }
    containing_area.text_elements << text_element unless containing_area.nil?
  end

  areas.each do |area|
    area.text_elements = TextElement.merge_words(area.text_elements)
  end
end
#fill_in_cells!(options = {}) ⇒ Object
138 139 140 141 142 143 144 145 |
# File 'lib/tabula/entities/page.rb', line 138
#
# Resolves the text of every cell in every spreadsheet on this page and
# marks each spreadsheet as resolved.
#
# @param options [Hash] forwarded to #spreadsheets
# @return the collection returned by #spreadsheets
def fill_in_cells!(options={})
  spreadsheets(options).each do |spreadsheet|
    spreadsheet.cells.each do |cell|
      # The page itself provides #get_cell_text; the listing called it on
      # `page`, which is not among this class's documented attributes.
      # NOTE(review): confirm no inherited `page` accessor exists.
      cell.text_elements = get_cell_text(cell)
    end
    spreadsheet.cells_resolved = true
  end
end
#get_area(area) ⇒ Object
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/tabula/entities/page.rb', line 42
#
# Builds a PageArea for a sub-region of this page.
#
# @param area an Array laid out as [top, left, bottom, right], or an object
#   responding to #width and #height that #get_text accepts
# @return [PageArea] sized to the area, carrying the texts found inside it,
#   the ruling lines cropped to it, the minimum character width/height of
#   those texts, and this page's spatial index
def get_area(area)
  if area.is_a?(Array)
    top, left, bottom, right = area
    # ZoneEntity takes (top, left, width, height), so convert the far edges.
    area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
  end

  area_texts = get_text(area)
  PageArea.new(file_path,
               area.width,
               area.height,
               rotation,
               number,
               area_texts,
               Ruling.crop_rulings_to_area(@ruling_lines, area),
               area_texts.map(&:width).min,
               area_texts.map(&:height).min,
               @spatial_index)
end
#get_cell_text(area = nil) ⇒ Object
213 214 215 |
# File 'lib/tabula/entities/page.rb', line 213
#
# Text inside the given area, merged into words.
#
# @param area anything #get_text accepts; nil means the whole page
# @return the result of TextElement.merge_words on that text
def get_cell_text(area=nil)
  TextElement.merge_words(get_text(area))
end
#get_ruling_lines!(options = {}) ⇒ Object
Returns the ruling lines, memoizing the collapsed vertical and horizontal sets in @vertical_ruling_lines and @horizontal_ruling_lines.
171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# File 'lib/tabula/entities/page.rb', line 171
#
# Returns the page's ruling lines: collapsed vertical rulings followed by
# collapsed horizontal rulings. The two collapsed sets are memoized in
# @vertical_ruling_lines and @horizontal_ruling_lines.
#
# Side effects on every call that gets past the guard: endpoints are snapped
# via #snap_points! and zero-length lines are removed from @ruling_lines.
#
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Array] empty when the page has no ruling lines
def get_ruling_lines!(options={})
  return [] if @ruling_lines.nil? || @ruling_lines.empty?

  snap_points!
  # Drop degenerate lines (both extents zero).
  @ruling_lines.select! { |l| !(l.width == 0 && l.height == 0) }

  @vertical_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:vertical?))
  @horizontal_ruling_lines ||= Ruling.collapse_oriented_rulings(@ruling_lines.select(&:horizontal?))
  @vertical_ruling_lines + @horizontal_ruling_lines
end
#get_table(options = {}) ⇒ Object
returns a Table object
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/tabula/entities/page.rb', line 64
#
# Returns a Table object built from the page's text.
#
# Text elements are merged into chunks, grouped into lines (sorted by top),
# and each non-blank chunk is placed in the first column whose position is
# at or to the right of the chunk's left edge (or a trailing column).
#
# @param options [Hash] :vertical_rulings (default []) — when non-empty,
#   their #left values are used as column positions instead of positions
#   inferred from the text; the merged options are also passed to
#   TextElement.merge_words
# @return [Table] an empty table (0 rows, no columns) when the page has no text
def get_table(options={})
  options = {:vertical_rulings => []}.merge(options)
  if texts.empty?
    return Tabula::Table.new(0, [])
  end

  sorted_texts = texts.sort
  text_chunks = TextElement.merge_words(sorted_texts, options)

  lines = TextChunk.group_by_lines(text_chunks.sort).sort_by(&:top)

  columns = if options[:vertical_rulings].empty?
    TextChunk.column_positions(lines).sort
  else
    options[:vertical_rulings].map(&:left).sort #pixel locations, not entities
  end

  table = Table.new(lines.count, columns)
  lines.each_with_index do |line, i|
    line.text_elements.select { |te| te.text !~ ONLY_SPACES_RE }.each do |te|
      j = columns.find_index { |s| te.left <= s } || columns.count
      table.add_text_element(te, i, j)
    end
  end

  # fixes up the table a little bit, replacing nils with empty TextElements
  # and sorting the lines.
  # table.rows.each do |l|
  #   l.text_elements = l.text_elements.map do |te|
  #     te || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
  #   end
  # end
  # table.rows.sort_by!(&:top)
  table
end
#get_text(area = nil) ⇒ Object
Get the text inside an area. The area can be an Array ([top, left, bottom, right]) or a Rectangle2D.
190 191 192 193 194 195 196 197 198 199 200 201 |
# File 'lib/tabula/entities/page.rb', line 190
#
# Returns the text elements inside an area.
#
# @param area nil for the whole page, an Array laid out as
#   [top, left, bottom, right], or an object the spatial index can query
# @return all of the page's texts when area is nil; otherwise whatever the
#   spatial index reports as contained in the area
def get_text(area=nil)
  # is_a? (rather than instance_of?) keeps this consistent with #get_area.
  if area.is_a?(Array)
    top, left, bottom, right = area
    # ZoneEntity takes (top, left, width, height), so convert the far edges.
    area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
  end

  if area.nil?
    texts
  else
    @spatial_index.contains(area)
  end
end
#horizontal_ruling_lines ⇒ Object
160 161 162 163 |
# File 'lib/tabula/entities/page.rb', line 160
#
# Collapsed horizontal ruling lines for this page.
#
# Calls #get_ruling_lines! first so the memoized set is populated.
#
# @return [Array] empty when nothing has been memoized (e.g. the page has
#   no ruling lines)
def horizontal_ruling_lines
  get_ruling_lines!
  @horizontal_ruling_lines || []
end
#make_table(options = {}) ⇒ Object
for API backwards-compatibility reasons, this returns an array of arrays.
101 102 103 |
# File 'lib/tabula/entities/page.rb', line 101
#
# For API backwards-compatibility reasons, this returns the rows of the
# table built by #get_table (an array of arrays) rather than the Table.
#
# @param options [Hash] forwarded to #get_table
def make_table(options={})
  get_table(options).rows
end
#number(indexing_base = :one_indexed) ⇒ Object
147 148 149 150 151 152 153 |
# File 'lib/tabula/entities/page.rb', line 147
#
# Page number in the requested indexing base.
#
# @param indexing_base [Symbol] :zero_indexed for a 0-based number; any
#   other value (default :one_indexed) gives the 1-based number
# @return [Integer]
def number(indexing_base=:one_indexed)
  indexing_base == :zero_indexed ? @number_one_indexed - 1 : @number_one_indexed
end
#ruling_lines ⇒ Object
TODO no need for this, let’s choose one name
156 157 158 |
# File 'lib/tabula/entities/page.rb', line 156
#
# Alias-style wrapper around #get_ruling_lines!.
# TODO: no need for this, let's choose one name.
def ruling_lines
  get_ruling_lines!
end
#snap_points! ⇒ Object
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 |
# File 'lib/tabula/entities/page.rb', line 226
#
# Snaps ruling-line endpoints that are nearly aligned onto a common
# coordinate, mutating the lines in @ruling_lines in place.
#
# Along each axis, endpoints are sorted and grouped so that every point in a
# group lies within the minimum character size (width for x, height for y)
# of the group's first point; each group is then moved to the average of its
# distinct coordinates.
#
# @return nil when there are no ruling lines; otherwise the line-to-endpoints
#   mapping that was iterated last
def snap_points!
  # With no lines there is nothing to snap; without this guard the grouping
  # below would be seeded with a nil point and fail on nil.
  return if @ruling_lines.nil? || @ruling_lines.empty?

  lines_to_points = {}
  points = []
  @ruling_lines.each do |line|
    point1 = line.p1 #comptooters are the wurst
    point2 = line.p2
    # for a given line, each call to #p1 and #p2 creates a new
    # Point2D::Float object, rather than returning the same one over and
    # over again.
    # so we have to get it, store it in memory as `point1` and `point2`
    # and then store those in various places (and now, modifying one will
    # modify the reference and thereby modify the other)
    lines_to_points[line] = [point1, point2]
    points += [point1, point2]
  end

  # lines are stored separately from their constituent points
  # so you can't modify the points and then modify the lines.
  # ah, but perhaps I can stick the points in a hash AND in an array
  # and then modify the lines by means of the points in the hash.
  [[:x, :x=, self.min_char_width], [:y, :y=, self.min_char_height]].each do |getter, setter, cell_size|
    sorted_points = points.sort_by(&getter)
    first_point = sorted_points.shift
    grouped_points = sorted_points.inject([[first_point]] ) do |memo, next_point|
      last = memo.last
      # Compare against the group's FIRST point so a group never spans more
      # than cell_size.
      if (next_point.send(getter) - last.first.send(getter)).abs < cell_size
        memo[-1] << next_point
      else
        memo << [next_point]
      end
      memo
    end

    grouped_points.each do |group|
      uniq_locs = group.map(&getter).uniq
      avg_loc = uniq_locs.sum / uniq_locs.size
      group.each{|p| p.send(setter, avg_loc) }
    end
  end

  # Write the (now snapped) endpoints back into their lines.
  lines_to_points.each do |l, p1_p2|
    l.java_send :setLine, [java.awt.geom.Point2D, java.awt.geom.Point2D], p1_p2[0], p1_p2[1]
  end
end
#spreadsheets(options = {}) ⇒ Object
returns the Spreadsheets; creating them if they’re not memoized
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/tabula/entities/page.rb', line 106
#
# Returns the Spreadsheets on this page, creating them if they are not
# already memoized in @spreadsheets.
#
# Cells are found from the page's horizontal and vertical ruling lines; each
# spreadsheet area is reduced to its bounding rectangle and receives the
# ruling lines intersecting it and the cells it overlaps.
#
# @param options [Hash] forwarded to #get_ruling_lines! and #find_cells!;
#   a truthy :fill_in_cells also resolves the cell texts before returning
# @return [Array] the memoized spreadsheets
def spreadsheets(options={})
  return @spreadsheets unless @spreadsheets.nil?

  get_ruling_lines!(options)
  self.find_cells!(self.horizontal_ruling_lines, self.vertical_ruling_lines, options)

  spreadsheet_areas = find_spreadsheets_from_cells #literally, java.awt.geom.Area objects. lol sorry. polygons.

  #transform each spreadsheet area into a rectangle
  # and get the cells contained within it.
  spreadsheet_rectangle_areas = spreadsheet_areas.map{|a| a.getBounds } #getBounds2D is theoretically better, but returns a Rectangle2D.Double, which doesn't have our Ruby sugar on it.

  @spreadsheets = spreadsheet_rectangle_areas.map do |rect|
    spr = Spreadsheet.new(rect.y, rect.x,
                          rect.width, rect.height,
                          self,
                          #TODO: keep track of the cells, instead of getting them again inefficiently.
                          [],
                          vertical_ruling_lines.select{|vl| rect.intersectsLine(vl) },
                          horizontal_ruling_lines.select{|hl| rect.intersectsLine(hl) }
                          )
    spr.cells = @cells.select{|c| spr.overlaps?(c) }
    spr.add_spanning_cells!
    spr
  end

  # @spreadsheets is set by now, so #fill_in_cells! gets the memoized list.
  fill_in_cells!(options) if options[:fill_in_cells]
  @spreadsheets
end
#to_json(options = {}) ⇒ Object
217 218 219 220 221 222 223 224 |
# File 'lib/tabula/entities/page.rb', line 217
#
# Serializes the page to JSON with the keys width, height, number (from
# #number with its default base), rotation and texts.
#
# @param options forwarded to Hash#to_json
# @return [String]
def to_json(options={})
  {
    :width => width,
    :height => height,
    :number => number,
    :rotation => rotation,
    :texts => texts
  }.to_json(options)
end
#vertical_ruling_lines ⇒ Object
165 166 167 168 |
# File 'lib/tabula/entities/page.rb', line 165
#
# Collapsed vertical ruling lines for this page.
#
# Calls #get_ruling_lines! first so the memoized set is populated.
#
# @return [Array] empty when nothing has been memoized (e.g. the page has
#   no ruling lines)
def vertical_ruling_lines
  get_ruling_lines!
  @vertical_ruling_lines || []
end