Class: Tabula::Spreadsheet
- Inherits:
-
ZoneEntity
- Object
- java.awt.geom.Rectangle2D::Float
- ZoneEntity
- Tabula::Spreadsheet
- Includes:
- Tabular
- Defined in:
- lib/tabula/entities/spreadsheet.rb
Overview
Both should implement `cells`, `rows`, `cols`, and `extraction_method`.
Instance Attribute Summary collapse
-
#cells ⇒ Object
Returns the value of attribute cells.
-
#cells_resolved ⇒ Object
Returns the value of attribute cells_resolved.
-
#extraction_method ⇒ Object
readonly
Returns the value of attribute extraction_method.
-
#horizontal_ruling_lines ⇒ Object
Returns the value of attribute horizontal_ruling_lines.
-
#page ⇒ Object
readonly
Returns the value of attribute page.
-
#vertical_ruling_lines ⇒ Object
Returns the value of attribute vertical_ruling_lines.
Attributes inherited from ZoneEntity
Class Method Summary collapse
Instance Method Summary collapse
- #+(other) ⇒ Object
-
#add_spanning_cells! ⇒ Object
Chapter 2 of Spreadsheet extraction: Spanning Cells.
-
#cols(evaluate_cells = true) ⇒ Object
Call `cols` with `evaluate_cells` set to `false` to defer filling in the text of each cell, which can be computationally intensive.
-
#initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) ⇒ Spreadsheet
constructor
(A former trailing `lines` parameter is commented out in the source: `#, lines)`.)
-
#rows(evaluate_cells = true) ⇒ Object
Call `rows` with `evaluate_cells` set to `false` to defer filling in the text of each cell, which can be computationally intensive.
- #ruling_lines ⇒ Object
- #ruling_lines=(lines) ⇒ Object
- #to_a ⇒ Object
- #to_csv ⇒ Object
- #to_json(*a) ⇒ Object
- #to_tsv ⇒ Object
Methods included from AbstractInterface
Methods inherited from ZoneEntity
#<=>, #inspect, #merge!, #points, #tlbr, #tlwh
Constructor Details
#initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) ⇒ Spreadsheet
(A former trailing `lines` parameter is commented out in the source: `#, lines)`.)
12 13 14 15 16 17 18 19 |
# File 'lib/tabula/entities/spreadsheet.rb', line 12
#
# Builds a spreadsheet. The four geometry arguments are forwarded unchanged to
# the superclass constructor; everything else is stored on the instance and
# the extraction method is fixed to "spreadsheet".
#
# @param top forwarded to +super+
# @param left forwarded to +super+
# @param width forwarded to +super+
# @param height forwarded to +super+
# @param page stored as @page
# @param cells stored as @cells
# @param vertical_ruling_lines stored as @vertical_ruling_lines
# @param horizontal_ruling_lines stored as @horizontal_ruling_lines
def initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) #, lines)
  super(top, left, width, height)

  @extraction_method = "spreadsheet"
  @page = page
  @cells = cells
  @horizontal_ruling_lines = horizontal_ruling_lines
  @vertical_ruling_lines = vertical_ruling_lines
end
Instance Attribute Details
#cells ⇒ Object
Returns the value of attribute cells.
9 10 11 |
# File 'lib/tabula/entities/spreadsheet.rb', line 9
#
# Reader for the +cells+ attribute.
#
# @return [Object] the value currently held in @cells
def cells
  @cells
end
#cells_resolved ⇒ Object
Returns the value of attribute cells_resolved.
9 10 11 |
# File 'lib/tabula/entities/spreadsheet.rb', line 9
#
# Reader for the +cells_resolved+ attribute.
#
# @return [Object] the value currently held in @cells_resolved
def cells_resolved
  @cells_resolved
end
#extraction_method ⇒ Object (readonly)
Returns the value of attribute extraction_method.
10 11 12 |
# File 'lib/tabula/entities/spreadsheet.rb', line 10
#
# Read-only accessor for the +extraction_method+ attribute.
#
# @return [Object] the value currently held in @extraction_method
def extraction_method
  @extraction_method
end
#horizontal_ruling_lines ⇒ Object
Returns the value of attribute horizontal_ruling_lines.
9 10 11 |
# File 'lib/tabula/entities/spreadsheet.rb', line 9
#
# Reader for the +horizontal_ruling_lines+ attribute.
#
# @return [Object] the value currently held in @horizontal_ruling_lines
def horizontal_ruling_lines
  @horizontal_ruling_lines
end
#page ⇒ Object (readonly)
Returns the value of attribute page.
10 11 12 |
# File 'lib/tabula/entities/spreadsheet.rb', line 10
#
# Read-only accessor for the +page+ attribute.
#
# @return [Object] the value currently held in @page
def page
  @page
end
#vertical_ruling_lines ⇒ Object
Returns the value of attribute vertical_ruling_lines.
9 10 11 |
# File 'lib/tabula/entities/spreadsheet.rb', line 9
#
# Reader for the +vertical_ruling_lines+ attribute.
#
# @return [Object] the value currently held in @vertical_ruling_lines
def vertical_ruling_lines
  @vertical_ruling_lines
end
Class Method Details
.empty(page) ⇒ Object
21 22 23 |
# File 'lib/tabula/entities/spreadsheet.rb', line 21
#
# Builds a zero-sized spreadsheet on +page+ that has no cells and no ruling
# lines.
#
# The ruling-line collections are empty arrays rather than nil: #ruling_lines
# concatenates them with `+` and #add_spanning_cells! calls `map` on them, and
# both would raise NoMethodError on nil.
#
# @param page the page passed through to the new spreadsheet
# @return [Spreadsheet]
def self.empty(page)
  Spreadsheet.new(0, 0, 0, 0, page, [], [], [])
end
Instance Method Details
#+(other) ⇒ Object
157 158 159 160 |
# File 'lib/tabula/entities/spreadsheet.rb', line 157
#
# Combines this spreadsheet's cells with those of +other+ into a new
# Spreadsheet on the same page. The result is created with nil geometry and
# nil ruling lines; only the page and the concatenated cells are carried over.
#
# @param other an object responding to +page+ and +cells+
# @return [Spreadsheet]
# @raise [ArgumentError] if +other+ is not on the same page as this spreadsheet
def +(other)
  same_page = other.page == @page
  raise ArgumentError, "Data can only be added if it's from the same PDF page" unless same_page

  combined_cells = @cells + other.cells
  Spreadsheet.new(nil, nil, nil, nil, @page, combined_cells, nil, nil)
end
#add_spanning_cells! ⇒ Object
Chapter 2 of Spreadsheet extraction: Spanning Cells.
if c is a “spanning cell”, that is
if there are N>0 vertical lines strictly between this cell's left and right
insert N placeholder cells after it with zero size (but same top)
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/tabula/entities/spreadsheet.rb', line 78
#
# Chapter 2 of Spreadsheet extraction: spanning cells.
#
# A cell is a "spanning cell" when at least one ruling line lies strictly
# between its left and right edges (or its top and bottom edges). Such a cell
# is flagged with `spanning = true`, and for every ruling it spans a
# placeholder cell is appended to `cells`:
#
# * one zero-width placeholder (same top and height) per spanned vertical ruling,
# * one zero-height placeholder (same left and width) per spanned horizontal ruling,
# * one zero-width, zero-height "double placeholder" per (vertical, horizontal)
#   pair when the cell spans in both directions.
#
#     -------------------
#     | C | C | C | C |
#     |-----------------|
#     | C | C | C | C |
#     |-----------------|
#     | C | SC    P | C |   SC is the "spanning cell" that holds all the text within its bounds
#     |----    +    ----|   P  is a "placeholder" cell with either zero width or zero height
#     | C | P    DP | C |   DP is a "double placeholder" cell with zero width and zero height
#     |----    +    ----|   C  is an ordinary cell.
#     | C | P    DP | C |
#     |-----------------|
#
# @return the +cells+ collection, with the placeholders appended
def add_spanning_cells!
  # Rounding: Cell.new_from_points (used when the cells are found) has a float
  # precision error where, for instance, a cell whose x2 coord is supposed to
  # be 160.137451171875 comes out as 160.13745498657227 because of a
  # subtraction.
  vertical_uniq_locs = vertical_ruling_lines.map { |l| l.left.round(5) }.uniq # already sorted
  horizontal_uniq_locs = horizontal_ruling_lines.map { |l| l.top.round(5) }.uniq # already sorted

  # Walk a snapshot of the cells. Appending to `cells` while iterating it
  # directly makes the loop also visit the freshly added placeholders; a
  # zero-width placeholder keeps the cell's height (and a zero-height one keeps
  # its width), so each of them would emit the double placeholders again.
  cells.dup.each do |c|
    left = c.left.round(5)
    right = c.right.round(5)
    top = c.top.round(5)
    bottom = c.bottom.round(5)

    vertical_rulings_spanned_over = vertical_uniq_locs.select { |l| l > left && l < right }
    horizontal_rulings_spanned_over = horizontal_uniq_locs.select { |t| t > top && t < bottom }

    next if vertical_rulings_spanned_over.empty? && horizontal_rulings_spanned_over.empty?

    c.spanning = true

    vertical_rulings_spanned_over.each do |spanned_over_line_loc|
      placeholder = Cell.new(c.top, spanned_over_line_loc, 0, c.height)
      placeholder.placeholder = true
      cells << placeholder
    end

    horizontal_rulings_spanned_over.each do |spanned_over_line_loc|
      placeholder = Cell.new(spanned_over_line_loc, c.left, c.width, 0)
      placeholder.placeholder = true
      cells << placeholder
    end

    # A cell spanning both rows and columns also needs "double placeholders"
    # at every crossing of the rulings it spans.
    vertical_rulings_spanned_over.product(horizontal_rulings_spanned_over).each do |vert_spanned_over, horiz_spanned_over|
      placeholder = Cell.new(horiz_spanned_over, vert_spanned_over, 0, 0)
      placeholder.placeholder = true
      cells << placeholder
    end
  end

  cells
end
#cols(evaluate_cells = true) ⇒ Object
Call `cols` with `evaluate_cells` set to `false` to defer filling in the text of each cell, which can be computationally intensive.
64 65 66 67 68 69 70 |
# File 'lib/tabula/entities/spreadsheet.rb', line 64
#
# Groups the cells into columns: cells sharing the same left coordinate
# (rounded to 5 decimals) form one column; columns are ordered by that
# coordinate and the cells inside each column by +top+.
#
# @param evaluate_cells when truthy, +fill_in_cells!+ is called first; pass
#   false to defer filling in the text of each cell, which can be
#   computationally intensive
# @return [Array<Array>] one array of cells per column
def cols(evaluate_cells=true)
  fill_in_cells! if evaluate_cells

  by_left_edge = cells.group_by { |cell| cell.left.round(5) }
  ordered_columns = by_left_edge.sort_by { |left_edge, _column| left_edge }
  ordered_columns.map { |_left_edge, column| column.sort_by(&:top) }
end
#rows(evaluate_cells = true) ⇒ Object
Call `rows` with `evaluate_cells` set to `false` to defer filling in the text of each cell, which can be computationally intensive.
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/tabula/entities/spreadsheet.rb', line 36
#
# Groups the cells into rows: cells sharing the same top coordinate (rounded
# to 5 decimals) form one row; rows are ordered by that coordinate and the
# cells inside each row by +left+.
#
# When the first row has fewer distinct left positions than the second one,
# zero-sized placeholder cells are added to the returned first row for every
# position it is missing ("empty corners", like in 01001523B_China.pdf). The
# placeholders are added to the returned row only, not to +cells+.
#
# TODO: support placeholders for "empty" cells in rows other than row 1, and in #cols
#
# @param evaluate_cells when truthy, +fill_in_cells!+ is called first; pass
#   false to defer filling in the text of each cell, which can be
#   computationally intensive
# @return [Array<Array>] one array of cells per row
def rows(evaluate_cells=true)
  fill_in_cells! if evaluate_cells

  array_of_rows = cells.group_by { |cell| cell.top.round(5) }
                       .sort_by(&:first)
                       .map { |_top, row_cells| row_cells.sort_by(&:left) }

  # Only the first two rows are compared, so two rows are enough.
  if array_of_rows.size >= 2
    first_row = array_of_rows[0]
    first_lefts = first_row.map(&:left)
    second_lefts = array_of_rows[1].map(&:left)

    if first_lefts.uniq.size < second_lefts.uniq.size
      first_row_top = first_row[0].top
      # uniq: one placeholder per missing position, even if the second row
      # holds several cells at that position.
      (second_lefts - first_lefts).uniq.each do |missing_spot|
        missing_spot_placeholder = Cell.new(first_row_top, missing_spot, 0, 0)
        missing_spot_placeholder.placeholder = true
        first_row << missing_spot_placeholder
      end
    end
    first_row.sort_by!(&:left)
  end

  array_of_rows
end
#ruling_lines ⇒ Object
25 26 27 |
# File 'lib/tabula/entities/spreadsheet.rb', line 25
#
# All ruling lines of the spreadsheet: the vertical ones followed by the
# horizontal ones, concatenated with `+` into a new collection.
#
# @return the concatenation of @vertical_ruling_lines and @horizontal_ruling_lines
def ruling_lines
  verticals = @vertical_ruling_lines
  horizontals = @horizontal_ruling_lines
  verticals + horizontals
end
#ruling_lines=(lines) ⇒ Object
29 30 31 32 |
# File 'lib/tabula/entities/spreadsheet.rb', line 29
#
# Replaces the spreadsheet's ruling lines with those entries of +lines+ that
# intersect the spreadsheet itself, split by orientation.
#
# The intersection test is `intersectsLine`, called on the spreadsheet (the
# original code called it on an undefined local, `spr`, which raised
# NameError). A line that is neither vertical nor horizontal is dropped.
#
# @param lines a collection whose elements respond to +vertical?+ and
#   +horizontal?+ and are accepted by +intersectsLine+
def ruling_lines=(lines)
  @vertical_ruling_lines = lines.select { |vl| vl.vertical? && intersectsLine(vl) }
  @horizontal_ruling_lines = lines.select { |hl| hl.horizontal? && intersectsLine(hl) }
end
#to_a ⇒ Object
130 131 132 133 |
# File 'lib/tabula/entities/spreadsheet.rb', line 130
#
# The spreadsheet as a two-dimensional array of cell texts: one inner array
# per row returned by #rows, holding the +text+ of each cell in that row.
# +fill_in_cells!+ is called before the rows are read.
#
# @return [Array<Array>] the +text+ of every cell, row by row
def to_a
  fill_in_cells!
  rows.map do |row_cells|
    row_cells.map { |cell| cell.text }
  end
end
#to_csv ⇒ Object
135 136 137 138 139 140 |
# File 'lib/tabula/entities/spreadsheet.rb', line 135
#
# Renders the spreadsheet as CSV text by handing #rows and a UTF-8 encoded
# in-memory buffer to Tabula::Writers.CSV, then returning what was written.
#
# @return [String] the content of the buffer after the writer has run
def to_csv
  buffer = StringIO.new.tap { |io| io.set_encoding("utf-8") }
  Tabula::Writers.CSV(rows, buffer)
  buffer.string
end
#to_json(*a) ⇒ Object
149 150 151 152 153 154 155 |
# File 'lib/tabula/entities/spreadsheet.rb', line 149
#
# Serializes the spreadsheet to JSON as a hash with three keys:
# 'json_class' (the receiver's class name), 'extraction_method' and 'data'
# (the result of #rows).
#
# @param a arguments forwarded unchanged to Hash#to_json
# @return [String] the JSON document
def to_json(*a)
  payload = {
    'json_class' => self.class.name,
    'extraction_method' => @extraction_method,
    'data' => rows
  }
  payload.to_json(*a)
end