Class: Tabula::Spreadsheet

Inherits:
ZoneEntity
  • Object
show all
Includes:
HasCells
Defined in:
lib/tabula/entities/spreadsheet.rb

Overview

Both should implement `cells`, `rows`, `cols`, and `extraction_method`.

Constant Summary

Constants included from HasCells

HasCells::ANOTHER_MAGIC_NUMBER

Instance Attribute Summary collapse

Attributes inherited from ZoneEntity

#texts

Instance Method Summary collapse

Methods included from HasCells

#add_spanning_cells!, #find_cells!, #find_spreadsheets_from_cells, #is_tabular?

Methods inherited from ZoneEntity

#<=>, #inspect, #merge!, #points, #tlbr

Constructor Details

#initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines) ⇒ Spreadsheet

(The source signature carries a commented-out trailing `lines` parameter: `#, lines)`.)



12
13
14
15
16
17
18
19
# File 'lib/tabula/entities/spreadsheet.rb', line 12

# Builds a spreadsheet zone.
#
# top, left, width and height are forwarded unchanged to the superclass
# constructor; the remaining arguments are stored on the instance.
#
# @param top    forwarded to super
# @param left   forwarded to super
# @param width  forwarded to super
# @param height forwarded to super
# @param page   the page this spreadsheet belongs to (stored in @page)
# @param cells  the cells making up the spreadsheet (stored in @cells)
# @param vertical_ruling_lines   stored in @vertical_ruling_lines
# @param horizontal_ruling_lines stored in @horizontal_ruling_lines
def initialize(top, left, width, height, page, cells, vertical_ruling_lines, horizontal_ruling_lines)
  super(top, left, width, height)

  @page, @cells = page, cells
  @horizontal_ruling_lines = horizontal_ruling_lines
  @vertical_ruling_lines = vertical_ruling_lines

  # Fixed marker identifying how this entity was extracted.
  @extraction_method = "spreadsheet"
end

Instance Attribute Details

#cellsObject

Returns the value of attribute cells.



9
10
11
# File 'lib/tabula/entities/spreadsheet.rb', line 9

# Reader for @cells, the cell collection handed to the constructor.
def cells
  @cells
end

#cells_resolvedObject

Returns the value of attribute cells_resolved.



9
10
11
# File 'lib/tabula/entities/spreadsheet.rb', line 9

# Reader for @cells_resolved, the flag that fill_in_cells! sets to true
# once it has started populating the cells' text (nil/unset before that).
def cells_resolved
  @cells_resolved
end

#extraction_methodObject (readonly)

Returns the value of attribute extraction_method.



10
11
12
# File 'lib/tabula/entities/spreadsheet.rb', line 10

# Read-only reader for @extraction_method, which the constructor sets to
# the string "spreadsheet".
def extraction_method
  @extraction_method
end

#horizontal_ruling_linesObject

Returns the value of attribute horizontal_ruling_lines.



9
10
11
# File 'lib/tabula/entities/spreadsheet.rb', line 9

# Reader for @horizontal_ruling_lines, set by the constructor and by
# ruling_lines=.
def horizontal_ruling_lines
  @horizontal_ruling_lines
end

#pageObject (readonly)

Returns the value of attribute page.



10
11
12
# File 'lib/tabula/entities/spreadsheet.rb', line 10

# Read-only reader for @page, the page object passed to the constructor.
def page
  @page
end

#vertical_ruling_linesObject

Returns the value of attribute vertical_ruling_lines.



9
10
11
# File 'lib/tabula/entities/spreadsheet.rb', line 9

# Reader for @vertical_ruling_lines, set by the constructor and by
# ruling_lines=.
def vertical_ruling_lines
  @vertical_ruling_lines
end

Instance Method Details

#+(other) ⇒ Object

Raises:

  • (ArgumentError)


105
106
107
108
# File 'lib/tabula/entities/spreadsheet.rb', line 105

# Combines this spreadsheet with +other+ into a new Spreadsheet holding the
# cells of both (this spreadsheet's cells first).
#
# The new spreadsheet is built with nil bounds (top, left, width, height)
# and nil ruling lines; only the page and the concatenated cells carry over.
#
# @param other an object responding to #page and #cells
# @return [Spreadsheet] the combined spreadsheet
# @raise [ArgumentError] if +other+ is not on the same page as this one
def +(other)
  unless other.page == @page
    raise ArgumentError, "cannot combine spreadsheets from different pages"
  end

  Spreadsheet.new(nil, nil, nil, nil, @page, @cells + other.cells, nil, nil)
end

#cols(evaluate_cells = true) ⇒ Object

Call `cols` with `evaluate_cells` set to `false` to defer filling in the text of each cell, which can be computationally intensive.



70
71
72
73
74
75
76
77
78
# File 'lib/tabula/entities/spreadsheet.rb', line 70

# Returns the cells arranged as columns.
#
# Cells are grouped by their +left+ value; columns are ordered by ascending
# +left+ and the cells inside each column by ascending +top+.
#
# Pass +false+ for +evaluate_cells+ to skip fill_in_cells! and so defer
# filling in the text of each cell.
#
# @param evaluate_cells [Boolean] whether to call fill_in_cells! first
# @return [Array<Array>] one array of cells per distinct +left+ value
def cols(evaluate_cells=true)
  fill_in_cells! if evaluate_cells

  # Single grouping pass instead of re-scanning every cell once per column.
  cells.group_by(&:left).sort_by(&:first).map do |_left, column_cells|
    column_cells.sort_by(&:top)
  end
end

#fill_in_cells!Object



30
31
32
33
34
35
36
37
# File 'lib/tabula/entities/spreadsheet.rb', line 30

# Populates every cell's text_elements with whatever @page.get_cell_text
# returns for that cell. Runs at most once per instance: @cells_resolved is
# flipped to true before the cells are visited, and later calls return nil
# without touching the cells.
def fill_in_cells!
  return if @cells_resolved

  @cells_resolved = true
  cells.each { |current| current.text_elements = @page.get_cell_text(current) }
end

#rows(evaluate_cells = true) ⇒ Object

Call `rows` with `evaluate_cells` set to `false` to defer filling in the text of each cell, which can be computationally intensive.



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/tabula/entities/spreadsheet.rb', line 41

# Returns the cells arranged as rows.
#
# Cells are grouped by their +top+ value; rows are ordered by ascending
# +top+ and the cells inside each row by ascending +left+.
#
# When there are more than two rows and the first row has fewer distinct
# +left+ positions than the second, zero-sized placeholder cells (with
# +placeholder+ set to true) are inserted into the first row at each +left+
# position it lacks, to stand in for empty corners
# (like in 01001523B_China.pdf).
#
# Pass +false+ for +evaluate_cells+ to skip fill_in_cells! and so defer
# filling in the text of each cell.
#
# TODO: support placeholders for "empty" cells in rows other than row 1,
# and in #cols
#
# @param evaluate_cells [Boolean] whether to call fill_in_cells! first
# @return [Array<Array>] one array of cells per distinct +top+ value
def rows(evaluate_cells=true)
  fill_in_cells! if evaluate_cells

  # Single grouping pass instead of re-scanning every cell once per row.
  array_of_rows = cells.group_by(&:top).sort_by(&:first).map do |_top, row_cells|
    row_cells.sort_by(&:left)
  end

  if array_of_rows.size > 2
    first_row, second_row = array_of_rows
    first_lefts = first_row.map(&:left).uniq
    second_lefts = second_row.map(&:left).uniq

    if first_lefts.size < second_lefts.size
      # Distinct positions only, so each missing spot gets one placeholder.
      (second_lefts - first_lefts).each do |missing_left|
        placeholder_cell = Cell.new(first_row.first.top, missing_left, 0, 0)
        placeholder_cell.placeholder = true
        first_row << placeholder_cell
      end
      # Placeholders were appended at the end; restore left-to-right order.
      first_row.sort_by!(&:left)
    end
  end

  array_of_rows
end

#ruling_linesObject



21
22
23
# File 'lib/tabula/entities/spreadsheet.rb', line 21

# Returns all ruling lines of this spreadsheet: the vertical ones followed
# by the horizontal ones, as a new array.
#
# Either collection may be nil (the #+ operator builds spreadsheets with nil
# ruling lines); a nil collection is treated as empty instead of raising.
#
# @return [Array] vertical ruling lines followed by horizontal ruling lines
def ruling_lines
  Array(@vertical_ruling_lines) + Array(@horizontal_ruling_lines)
end

#ruling_lines=(lines) ⇒ Object



25
26
27
28
# File 'lib/tabula/entities/spreadsheet.rb', line 25

# Replaces the stored ruling lines with the members of +lines+ that
# intersect this spreadsheet: vertical ones go to @vertical_ruling_lines,
# horizontal ones to @horizontal_ruling_lines. Lines that are neither
# vertical nor horizontal, or that do not intersect, are dropped.
#
# The intersection test is performed against this spreadsheet itself (the
# previous code referenced an undefined +spr+).
# NOTE(review): assumes intersectsLine is available on this object via its
# superclass — confirm against ZoneEntity.
#
# @param lines [Enumerable] candidates responding to #vertical? and #horizontal?
def ruling_lines=(lines)
  @vertical_ruling_lines = lines.select { |line| line.vertical? && intersectsLine(line) }
  @horizontal_ruling_lines = lines.select { |line| line.horizontal? && intersectsLine(line) }
end

#to_aObject



80
81
82
83
# File 'lib/tabula/entities/spreadsheet.rb', line 80

# Returns the spreadsheet as a nested array of cell texts: one inner array
# per row (as produced by #rows), each holding the +text+ of its cells.
# Cell text is filled in first via fill_in_cells!.
def to_a
  fill_in_cells!
  rows.map do |row_cells|
    row_cells.map { |cell| cell.text }
  end
end

#to_csvObject



85
86
87
88
89
# File 'lib/tabula/entities/spreadsheet.rb', line 85

# Renders the rows as CSV text by handing them, together with an in-memory
# buffer, to Tabula::Writers.CSV and returning what was written.
#
# @return [String] the buffer contents after the writer has run
def to_csv
  StringIO.new.tap { |buffer| Tabula::Writers.CSV(rows, buffer) }.string
end

#to_json(*a) ⇒ Object



97
98
99
100
101
102
103
# File 'lib/tabula/entities/spreadsheet.rb', line 97

# Serializes the spreadsheet as a JSON object with three keys, in order:
# 'json_class' (this object's class name), 'extraction_method' and 'data'
# (the result of #rows). Any arguments are passed through to Hash#to_json.
#
# @param a arguments forwarded to the hash's to_json
# @return [String] the JSON text
def to_json(*a)
  payload = {}
  payload['json_class'] = self.class.name
  payload['extraction_method'] = @extraction_method
  payload['data'] = rows
  payload.to_json(*a)
end

#to_tsvObject



91
92
93
94
95
# File 'lib/tabula/entities/spreadsheet.rb', line 91

# Renders the rows as TSV text by handing them, together with an in-memory
# buffer, to Tabula::Writers.TSV and returning what was written.
#
# @return [String] the buffer contents after the writer has run
def to_tsv
  StringIO.new.tap { |buffer| Tabula::Writers.TSV(rows, buffer) }.string
end