Class: Tabula::Extraction::ObjectExtractor
- Inherits:
-
Object
- Object
- Tabula::Extraction::ObjectExtractor
- Defined in:
- lib/tabula/extraction.rb
Direct Known Subclasses
Constant Summary collapse
- PRINTABLE_RE =
/[[:print:]]/
- DEFAULT_OPTIONS =
{ :line_color_filter => nil, :extract_ruling_lines => true }
Instance Attribute Summary collapse
-
#characters ⇒ Object
Returns the value of attribute characters.
-
#clipping_paths ⇒ Object
Returns the value of attribute clipping_paths.
-
#debug_clipping_paths ⇒ Object
Returns the value of attribute debug_clipping_paths.
-
#debug_text ⇒ Object
Returns the value of attribute debug_text.
-
#options ⇒ Object
Returns the value of attribute options.
Instance Method Summary collapse
- #clear! ⇒ Object
- #close! ⇒ Object
- #currentClippingPath ⇒ Object
- #drawImage(image, at) ⇒ Object
- #drawPage(page) ⇒ Object
- #ensure_open! ⇒ Object
- #ensurePageSize! ⇒ Object
- #extract(pages = nil) ⇒ Object
-
#extract_page(page_number) ⇒ Object
extract objects from a page.
- #fillPath(windingRule) ⇒ Object
- #getStroke ⇒ Object
-
#initialize(pdf_filename, pages = [1], password = '', options = {}) ⇒ ObjectExtractor
constructor
TODO: the pages constructor argument does not make sense now that we have extract_page and extract_pages.
- #page_count ⇒ Object
- #pageTransform ⇒ Object
- #processTextPosition(text) ⇒ Object
- #rulings ⇒ Object
- #setStroke(stroke) ⇒ Object
- #strokePath(filter_by_color = nil) ⇒ Object
- #transformPath(path) ⇒ Object
Constructor Details
#initialize(pdf_filename, pages = [1], password = '', options = {}) ⇒ ObjectExtractor
TODO: the pages constructor argument does not make sense now that we have extract_page and extract_pages.
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/tabula/extraction.rb', line 36

# Opens +pdf_filename+ and initialises the per-page extraction state.
#
# TODO: the +pages+ constructor argument does not make sense now that we
# have +extract_page+ and +extract_pages+.
#
# @param pdf_filename [String] path of the PDF document to open
# @param pages [Enumerable<Integer>, Symbol] 1-based page numbers remembered
#   in +@pages+, or +:all+ for every page of the document
# @param password [String] handed to +Extraction.openPDF+
# @param options [Hash] overrides merged on top of +DEFAULT_OPTIONS+
# @raise [Errno::ENOENT] if +pdf_filename+ does not exist
def initialize(pdf_filename, pages=[1], password='', options={})
  # File.exists? was deprecated and has been removed from recent Rubies;
  # File.exist? is the spelling that works everywhere.
  raise Errno::ENOENT, pdf_filename.to_s unless File.exist?(pdf_filename)

  @pdf_filename = pdf_filename
  @pdf_file = Extraction.openPDF(pdf_filename, password)
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
  @pages = pages == :all ? (1..@all_pages.size) : pages

  super()

  self.options = DEFAULT_OPTIONS.merge(options)
  self.characters = []
  @debug_clipping_paths = false
  @clipping_path = nil
  @transformed_clipping_path = nil
  self.clipping_paths = []
  @rulings = []
  # Start at the largest Float so the first character seen always lowers them.
  @min_char_width = @min_char_height = Float::MAX
end
Instance Attribute Details
#characters ⇒ Object
Returns the value of attribute characters.
25 26 27 |
# File 'lib/tabula/extraction.rb', line 25

# Reader for the +characters+ attribute.
#
# @return [Object] the current value of +@characters+
def characters
  @characters
end
#clipping_paths ⇒ Object
Returns the value of attribute clipping_paths.
25 26 27 |
# File 'lib/tabula/extraction.rb', line 25

# Reader for the +clipping_paths+ attribute.
#
# @return [Object] the current value of +@clipping_paths+
def clipping_paths
  @clipping_paths
end
#debug_clipping_paths ⇒ Object
Returns the value of attribute debug_clipping_paths.
25 26 27 |
# File 'lib/tabula/extraction.rb', line 25

# Reader for the +debug_clipping_paths+ attribute.
#
# @return [Object] the current value of +@debug_clipping_paths+
def debug_clipping_paths
  @debug_clipping_paths
end
#debug_text ⇒ Object
Returns the value of attribute debug_text.
25 26 27 |
# File 'lib/tabula/extraction.rb', line 25

# Reader for the +debug_text+ attribute.
#
# @return [Object] the current value of +@debug_text+
def debug_text
  @debug_text
end
#options ⇒ Object
Returns the value of attribute options.
25 26 27 |
# File 'lib/tabula/extraction.rb', line 25

# Reader for the +options+ attribute.
#
# @return [Object] the current value of +@options+
def options
  @options
end
Instance Method Details
#clear! ⇒ Object
109 110 111 112 113 114 115 |
# File 'lib/tabula/extraction.rb', line 109

# Resets the state accumulated while processing a page: empties the
# character, clipping-path and ruling collections in place, puts the
# minimum character dimensions back to Float::MAX and drops the memoised
# page transform.
#
# @return [Object] the emptied +@rulings+ collection
def clear!
  [characters, clipping_paths].each(&:clear)
  @min_char_width = @min_char_height = Float::MAX
  @page_transform = nil
  @rulings.clear
end
#close! ⇒ Object
55 56 57 58 59 |
# File 'lib/tabula/extraction.rb', line 55

# Closes the underlying PDF file and records that it has been closed.
#
# @raise [RuntimeError] via ensure_open! when the document was already closed
# @return [true]
def close!
  ensure_open!
  @pdf_file.close
  @pdf_file_closed = true
end
#currentClippingPath ⇒ Object
253 254 255 256 257 258 259 260 261 262 263 264 265 |
# File 'lib/tabula/extraction.rb', line 253

# Returns the bounds of the current clipping path after applying
# transformPath to it.
#
# The transformed path and its bounds are cached and only recomputed when
# the graphics state reports a clipping path that differs (by ==) from the
# one seen on the previous call.
#
# @return [Object] result of +getBounds+ on the transformed clipping path
def currentClippingPath
  clip = getGraphicsState.getCurrentClippingPath

  unless clip == @clipping_path
    @clipping_path = clip
    @transformed_clipping_path = transformPath(clip)
    @transformed_clipping_path_bounds = @transformed_clipping_path.getBounds
  end

  @transformed_clipping_path_bounds
end
#drawImage(image, at) ⇒ Object
229 230 |
# File 'lib/tabula/extraction.rb', line 229

# Deliberately does nothing with its arguments.
# NOTE(review): presumably overrides a drawing hook so images are ignored
# during extraction — confirm against the superclass.
#
# @param image [Object] unused
# @param at [Object] unused
# @return [nil]
def drawImage(image, at)
  nil
end
#drawPage(page) ⇒ Object
124 125 126 127 128 129 130 131 132 |
# File 'lib/tabula/extraction.rb', line 124

# Makes +page+ the current page and, when it has contents, runs its
# content stream through +processStream+ (after making sure the page size
# is set).
#
# @param page [Object] page object responding to +getContents+ and +findResources+
# @return [Object, nil] whatever +processStream+ returns, or nil for a page without contents
def drawPage(page)
  self.page = page
  return if self.page.getContents.nil?

  ensurePageSize!
  processStream(self.page, self.page.findResources, self.page.getContents.getStream)
end
#ensure_open! ⇒ Object
61 62 63 |
# File 'lib/tabula/extraction.rb', line 61

# Guard that fails once the document has been marked as closed.
#
# @raise [RuntimeError] "Document is closed" when +@pdf_file_closed+ is truthy
# @return [nil]
def ensure_open!
  return unless @pdf_file_closed

  raise "Document is closed"
end
#ensurePageSize! ⇒ Object
117 118 119 120 121 122 |
# File 'lib/tabula/extraction.rb', line 117

# Sets +pageSize+ from the current page's media box when no page size has
# been set yet and a current page exists. Leaves an already-set size alone.
#
# @return [Object, nil] the value assigned to +pageSize+, or nil when nothing was assigned
def ensurePageSize!
  return unless self.pageSize.nil? && !self.page.nil?

  media_box = self.page.findMediaBox
  self.pageSize = media_box.nil? ? nil : media_box.createDimension
end
#extract(pages = nil) ⇒ Object
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/tabula/extraction.rb', line 92

# Builds an Enumerator that yields the result of extract_page for each
# requested page. The open-document check happens immediately; the pages
# themselves are only extracted as the enumerator is consumed.
#
# @param pages [Enumerable<Integer>, Symbol, nil] 1-based page numbers,
#   +:all+ for every page, or nil to use the pages stored in +@pages+
# @return [Enumerator]
def extract(pages=nil)
  ensure_open!

  page_numbers = case pages
                 when :all then (1..@all_pages.size)
                 when nil then @pages
                 else pages
                 end

  Enumerator.new do |yielder|
    page_numbers.each { |number| yielder << extract_page(number) }
  end
end
#extract_page(page_number) ⇒ Object
Extracts objects from a page. Returns an instance of Tabula::Page.
(page_number is 1-based, i.e., the first page is number 1.)
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/tabula/extraction.rb', line 68

# Extracts the objects of a single page.
#
# @param page_number [Integer] 1-based page number (the first page is 1)
# @return [Tabula::Page, nil] the extracted page, or nil when the page has no contents
# @raise [IndexError] if +page_number+ is outside 1..number of pages
# @raise [RuntimeError] via ensure_open! when the document is closed
def extract_page(page_number)
  ensure_open!

  # Page numbers are 1-based, so 0 is just as invalid as a negative number
  # or a number past the last page (it would otherwise index element -1).
  if page_number < 1 || page_number > @all_pages.size
    raise IndexError, "Page #{page_number} doesn't exist. Skipping. Valid pages are 1..#{@all_pages.size}"
  end

  page = @all_pages.get(page_number - 1)
  return nil if page.getContents.nil?

  clear!
  drawPage(page)

  crop_box = page.findCropBox
  Tabula::Page.new(@pdf_filename,
                   crop_box.width,
                   crop_box.height,
                   page.getRotation.to_i,
                   page_number, # one-indexed, just like +page_number+ is.
                   characters,
                   rulings,
                   @min_char_width,
                   @min_char_height)
end
#fillPath(windingRule) ⇒ Object
225 226 227 |
# File 'lib/tabula/extraction.rb', line 225

# Handles a filled path by delegating to strokePath, passing the RGB
# components of the current non-stroking colour as the colour argument.
#
# @param windingRule [Object] unused by this implementation
# @return [Object] whatever strokePath returns
def fillPath(windingRule)
  fill_color = getGraphicsState.getNonStrokingColor.getJavaColor
  strokePath(fill_color.getRGBColorComponents(nil))
end
#getStroke ⇒ Object
138 139 140 |
# File 'lib/tabula/extraction.rb', line 138

# Returns the stroke most recently stored by setStroke.
#
# @return [Object, nil] the value of +@basicStroke+
def getStroke
  @basicStroke
end
#page_count ⇒ Object
310 311 312 |
# File 'lib/tabula/extraction.rb', line 310

# Number of pages in the opened document.
#
# @return [Integer] size of the +@all_pages+ collection
def page_count
  @all_pages.size
end
#pageTransform ⇒ Object
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 |
# File 'lib/tabula/extraction.rb', line 236

# Returns the AffineTransform for the current page, building and memoising
# it on first use (clear! drops the memoised value).
#
# For rotations of +/-90 or +/-270 the transform is a (-1, 1) scale followed
# by a rotation about the crop box's lower-left corner; for any other
# rotation it is a (1, -1) scale followed by a translation of minus the
# crop box height.
#
# @return [Object] the memoised transform
def pageTransform
  return @page_transform unless @page_transform.nil?

  crop_box = page.findCropBox
  rotation = page.getRotation

  if [90, -270, -90, 270].include?(rotation)
    @page_transform = AffineTransform.getScaleInstance(-1, 1)
    @page_transform.rotate(rotation * (Math::PI/180.0), crop_box.getLowerLeftX, crop_box.getLowerLeftY)
  else
    @page_transform = AffineTransform.getScaleInstance(1, -1)
    @page_transform.translate(0, -crop_box.getHeight)
  end

  @page_transform
end
#processTextPosition(text) ⇒ Object
267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 |
# File 'lib/tabula/extraction.rb', line 267

# Receives one text position, wraps it in a Tabula::TextElement, updates
# the running minimum character width/height and stores the element in
# +characters+ when it is printable and intersects the current clipping
# path bounds.
#
# @param text [Object] text position responding to getCharacter, getX, getY,
#   getWidthDirAdj, getHeightDir, getFont, getFontSize, getWidthOfSpace and getDir
# @return [Object, nil] the +characters+ collection when the element was stored, nil otherwise
def processTextPosition(text)
  c = text.getCharacter
  h = text.getHeightDir.round(2)

  # replace non-breaking space for space. The NBSP is written as an escape
  # because a literal U+00A0 looks exactly like ' ' and is easily flattened
  # into one, which turns this replacement into a no-op.
  c = ' ' if c == "\u00A0"

  # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
  width_of_space = text.getWidthOfSpace
  width_of_space = currentSpaceWidth if width_of_space.nan? || width_of_space == 0

  te = Tabula::TextElement.new(text.getY.round(2) - h,
                               text.getX.round(2),
                               text.getWidthDirAdj,
                               # ugly hack follows: we need spaces to have a height, so we can
                               # test for vertical overlap. height == width seems a safe bet.
                               # NOTE(review): the value passed here is getHeightDir, not a
                               # width, so the remark above may be stale.
                               text.getHeightDir,
                               text.getFont,
                               text.getFontSize,
                               c,
                               width_of_space,
                               text.getDir)

  ccp_bounds = currentClippingPath

  # When debugging clipping paths, remember each distinct clip region once.
  if debug_clipping_paths && !clipping_paths.include?(ccp_bounds)
    clipping_paths << ::Tabula::ZoneEntity.new(ccp_bounds.getMinY,
                                               ccp_bounds.getMinX,
                                               ccp_bounds.getWidth,
                                               ccp_bounds.getHeight)
  end

  @min_char_width = te.width if te.width < @min_char_width
  @min_char_height = te.height if te.height < @min_char_height

  characters << te if c =~ PRINTABLE_RE && ccp_bounds.intersects(te)
end
#rulings ⇒ Object
314 315 316 |
# File 'lib/tabula/extraction.rb', line 314

# Returns the collected rulings without the unusable ones: those whose
# left equals right and top equals bottom, and those with any negative
# coordinate. +@rulings+ itself is left untouched.
#
# @return [Array] the rulings that survive both checks
def rulings
  @rulings.select do |ruling|
    zero_length = ruling.left == ruling.right && ruling.top == ruling.bottom
    !zero_length && [ruling.top, ruling.left, ruling.bottom, ruling.right].none? { |coord| coord < 0 }
  end
end
#setStroke(stroke) ⇒ Object
134 135 136 |
# File 'lib/tabula/extraction.rb', line 134

# Stores +stroke+ so that getStroke can return it later.
#
# @param stroke [Object] the stroke to remember
# @return [Object] the stored stroke
def setStroke(stroke)
  @basicStroke = stroke
end
#strokePath(filter_by_color = nil) ⇒ Object
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
# File 'lib/tabula/extraction.rb', line 143

# Converts the straight segments of the current line path into
# Tabula::Ruling objects (clipped to the current clipping path) and then
# resets the line path.
#
# Nothing is extracted when the +:extract_ruling_lines+ option is off, when
# the path is empty, does not start with a MOVETO or contains anything other
# than MOVETO/LINETO/CLOSE, or when the +:line_color_filter+ option is set
# and rejects the colour.
#
# @param filter_by_color [Object, nil] colour components to use instead of
#   the current stroking colour; also stored (via +to_a+) on each ruling
# @return [Object, nil] result of resetting the line path, or nil on an early exit
def strokePath(filter_by_color=nil)
  unless options[:extract_ruling_lines]
    getLinePath.reset
    return
  end

  path = pathToList(getLinePath)

  # skip empty paths, paths whose first operation is not a MOVETO
  # and paths containing operations other than LINETO, MOVETO or CLOSE
  allowed_segments = [java.awt.geom.PathIterator::SEG_LINETO,
                      java.awt.geom.PathIterator::SEG_MOVETO,
                      java.awt.geom.PathIterator::SEG_CLOSE]
  if path.empty? ||
     path[0][0] != java.awt.geom.PathIterator::SEG_MOVETO ||
     path[1..-1].any? { |p| !allowed_segments.include?(p.first) }
    getLinePath.reset
    return
  end

  ccp_bounds = currentClippingPath

  strokeColorComps = filter_by_color || getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)

  color_filter = options[:line_color_filter]

  if !color_filter.nil? && !color_filter.call(strokeColorComps)
    getLinePath.reset
    return
  end

  # Records the segment between +from+ and +to+ as a ruling, ordering the
  # end points with <=> and clipping the segment to the clipping path bounds.
  add_ruling = lambda do |from, to|
    line = if (from <=> to) == -1
             java.awt.geom.Line2D::Float.new(from, to)
           else
             java.awt.geom.Line2D::Float.new(to, from)
           end

    if line.intersects(ccp_bounds)
      # convert line to rectangle for clipping it to the current clippath
      # sucks, but awt doesn't have methods for this
      tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
      @rulings << ::Tabula::Ruling.new(tmp.getY,
                                       tmp.getX,
                                       tmp.getWidth,
                                       tmp.getHeight,
                                       filter_by_color.to_a)
    end
  end

  # skip the first path operation and save it as the starting position
  first = path.shift
  start_pos = last_move = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])
  end_pos = nil

  path.each do |p|
    case p[0]
    when java.awt.geom.PathIterator::SEG_LINETO
      end_pos = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
      add_ruling.call(start_pos, end_pos) unless start_pos.nil?
    when java.awt.geom.PathIterator::SEG_MOVETO
      last_move = java.awt.geom.Point2D::Float.new(p[1][0], p[1][1])
      # A MOVETO starts a new subpath: the next segment must begin at the
      # moved-to point, not at the end of the previous segment.
      end_pos = last_move
    when java.awt.geom.PathIterator::SEG_CLOSE
      # according to PathIterator docs:
      # "the preceding subpath should be closed by appending a line segment
      # back to the point corresponding to the most recent SEG_MOVETO."
      # Nothing to close when no point has been reached since the start.
      add_ruling.call(end_pos, last_move) unless end_pos.nil?
    end
    start_pos = end_pos
  end

  getLinePath.reset
end
#transformPath(path) ⇒ Object
232 233 234 |
# File 'lib/tabula/extraction.rb', line 232

# Applies the current page transform to +path+.
#
# @param path [Object] shape to transform
# @return [Object] result of +createTransformedShape+ on the page transform
def transformPath(path)
  transform = pageTransform
  transform.createTransformedShape(path)
end