Module: Tabula::HasCells
- Included in:
- Page
- Defined in:
- lib/tabula/entities/has_cells.rb
Overview
Classes that include this module must define `cells`, `vertical_ruling_lines` and `horizontal_ruling_lines` accessors, plus a `ruling_lines` reader.
Constant Summary collapse
- ARBITRARY_MAGIC_HEURISTIC_NUMBER =
0.65
Instance Method Summary collapse
-
#find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options = {}) ⇒ Object
finds cells from the ruling lines on the page.
-
#find_spreadsheets_from_cells ⇒ Object
TODO: returns an array of Spreadsheet objects constructed (or spreadsheet_areas => cells). Maybe placeholders should be added after `cells` is split into spreadsheets.
- #heuristic_ratio ⇒ Object
- #is_tabular? ⇒ Boolean
Instance Method Details
#find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options = {}) ⇒ Object
Finds cells from the ruling lines on the page. Implements the algorithm from Nurminen's thesis; cf. github.com/jazzido/tabula-extractor/issues/16. Classes that include this module must define `cells`, `vertical_ruling_lines` and `horizontal_ruling_lines` accessors.
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/tabula/entities/has_cells.rb', line 32

# Finds cells from the ruling lines on the page.
# Implements the Nurminen thesis algorithm,
# cf. github.com/jazzido/tabula-extractor/issues/16
#
# Every intersection of a horizontal and a vertical ruling is tried, in sorted
# order, as the top-left corner of a cell. Each intersection yields at most one
# cell: the first rectangle whose four sides are all confirmed by rulings.
#
# The result is also stored through the `cells=` writer of the including class.
#
# @param horizontal_ruling_lines passed as-is to Ruling.find_intersections
# @param vertical_ruling_lines passed as-is to Ruling.find_intersections
# @param options [Hash] forwarded untouched to Cell.new_from_points
# @return [Array] the cells found, in the sort order of their top-left corners
def find_cells!(horizontal_ruling_lines, vertical_ruling_lines, options = {})
  # Maps each crossing point to the [horizontal, vertical] pair of rulings
  # that cross there.
  intersection_points = Ruling.find_intersections(horizontal_ruling_lines, vertical_ruling_lines)

  # All crossing points, ordered by the points' own default sort
  # (up to down, then left to right, according to the original author).
  crossing_points = intersection_points.keys.sort

  cells_found = []
  crossing_points.each_with_index do |top_left, i|
    # Only points at or after the current one in sort order can lie below or
    # to the right of it, so earlier ones are never candidates.
    bottom_right = bottom_right_of_first_cell(top_left, crossing_points[i..-1], intersection_points)
    cells_found << Cell.new_from_points(top_left, bottom_right, options) if bottom_right
  end

  self.cells = cells_found
  cells_found
end

# Returns the bottom-right corner of the first rectangle, with `top_left` as
# its top-left corner, whose four sides are all backed by rulings; nil if
# there is none.
#
# Returning as soon as one rectangle is confirmed is what keeps each crossing
# point from producing more than one cell (the original author's intent:
# avoid creating non-minimal cells).
def bottom_right_of_first_cell(top_left, candidates, intersection_points)
  horizontal, vertical = intersection_points[top_left]

  # Crossing points directly below top_left (same x, larger y).
  points_below = candidates.select { |pt| pt.x == top_left.x && pt.y > top_left.y }
  # Crossing points directly to the right of top_left (same y, larger x).
  points_right = candidates.select { |pt| pt.y == top_left.y && pt.x > top_left.x }

  points_below.each do |below|
    # The left side must exist: an edge between top_left and `below`.
    next unless vertical.colinear?(below)

    points_right.each do |right|
      # The top side must exist: an edge between top_left and `right`.
      next unless horizontal.colinear?(right)

      # Hypothetical bottom-right point of the rectangle.
      bottom_right = Point2D::Float.new(right.x, below.y)
      next unless intersection_points.include?(bottom_right)

      bottom_horizontal, bottom_vertical = intersection_points[bottom_right]
      # Bottom and right sides confirmed: the rectangle has all 4 sides.
      return bottom_right if bottom_horizontal.colinear?(below) && bottom_vertical.colinear?(right)
    end
  end

  nil
end
private :bottom_right_of_first_cell
#find_spreadsheets_from_cells ⇒ Object
TODO: returns an array of Spreadsheet objects constructed (or spreadsheet_areas => cells). Maybe placeholders should be added after `cells` is split into spreadsheets.
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
# File 'lib/tabula/entities/has_cells.rb', line 97

# Merges the page's cells into outline polygons and returns one Area per
# polygon.
#
# Approach via
# http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
#
# Side effect: sorts `cells` in place.
#
# TODO: returns array of Spreadsheet objects constructed
# (or spreadsheet_areas => cells); maybe placeholders should be added after
# cells is split into spreadsheets.
#
# @return [Array] one Area, built from a Polygon, per outline traced
def find_spreadsheets_from_cells
  cells.sort!

  # Toggle every cell corner in and out of the set: a corner seen an even
  # number of times ends up removed, one seen an odd number of times stays.
  corner_set = Set.new
  cells.each do |cell|
    #TODO: keep track of cells for each point here for more efficiently keeping track of cells inside a polygon
    cell.points.each do |corner|
      # Shared vertex: remove it. Otherwise remember it.
      corner_set.delete?(corner) || corner_set.add(corner)
    end
  end
  corners = corner_set.to_a

  # x-first ordering, then the points' default ordering.
  corners_by_x = corners.sort { |a, b| a.x_first_cmp(b) }
  corners_by_y = corners.sort

  # Consecutive pairs in each ordering are joined by an edge, recorded in
  # both directions.
  # NOTE(review): presumably every row / column holds an even number of
  # corners, so that a pair never straddles two rows / columns - not checked here.
  horizontal_edges = {}
  vertical_edges = {}
  corners_by_y.each_slice(2) do |first, second|
    horizontal_edges[first] = second
    horizontal_edges[second] = first
  end
  corners_by_x.each_slice(2) do |first, second|
    vertical_edges[first] = second
    vertical_edges[second] = first
  end

  # Get all the polygons.
  outlines = []
  until horizontal_edges.empty?
    # We can start with any point.
    #TODO: should the polygon be represented just by an ordered array of points?
    start_vertex, _partner = horizontal_edges.shift
    trail = [[start_vertex, :horiz]]

    loop do
      vertex, reached_by = trail.last
      # Alternate between vertical and horizontal edges, consuming each edge
      # as it is followed.
      step =
        if reached_by == :horiz
          [vertical_edges.delete(vertex), :vert]
        else
          [horizontal_edges.delete(vertex), :horiz]
        end
      # Back at the starting entry: the polygon is closed.
      break if step == trail.first
      trail << step
    end

    # Remove implementation-markers (:horiz and :vert) from the polygon.
    outline = trail.map { |vertex, _marker| vertex }
    # Drop whatever reverse-direction entries are left for these vertices.
    outline.each do |vertex|
      horizontal_edges.delete(vertex)
      vertical_edges.delete(vertex)
    end
    outlines << outline
  end

  # for efficiency's sake, we maybe ought to use java Polygon objects internally
  # for flexibility, we don't.
  outlines.map do |outline|
    xs = outline.map { |pt| pt.x }
    ys = outline.map { |pt| pt.y }
    Area.new(Polygon.new(xs.to_java(Java::int), ys.to_java(Java::int), xs.size)) #lol jruby
  end
end
#heuristic_ratio ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/tabula/entities/has_cells.rb', line 16

# Compares the dimensions of the first spreadsheet with those of the table
# returned by `get_table`.
#
# The result is the mean of two quotients: spreadsheet column count over
# table column count, and spreadsheet row count over table row count.
# (The original local names call the former "defined by lines" and the
# latter "defined without lines".)
#
# @return [Float] the averaged ratio, or Float::NAN when `spreadsheets` has
#   no first element
def heuristic_ratio
  # spreadsheet extraction
  ruled = spreadsheets.first
  return Float::NAN if ruled.nil?

  ruled_row_count = ruled.rows.size # rows filled in automatically
  ruled_col_count = ruled.cols.size

  unruled = self.get_table
  unruled_col_count = unruled.cols.size
  unruled_row_count = unruled.rows.size

  # to_f forces float division, so a zero denominator does not raise.
  col_ratio = ruled_col_count.to_f / unruled_col_count
  row_ratio = ruled_row_count.to_f / unruled_row_count
  (col_ratio + row_ratio) / 2
end
#is_tabular? ⇒ Boolean
11 12 13 14 |
# File 'lib/tabula/entities/has_cells.rb', line 11

# Reports whether `heuristic_ratio` lies strictly between
# ARBITRARY_MAGIC_HEURISTIC_NUMBER and its reciprocal.
#
# A NaN ratio fails both comparisons, so the answer is false in that case.
#
# @return [Boolean]
def is_tabular?
  score = heuristic_ratio
  lower_bound = ARBITRARY_MAGIC_HEURISTIC_NUMBER
  upper_bound = 1 / ARBITRARY_MAGIC_HEURISTIC_NUMBER
  score > lower_bound && score < upper_bound
end