Class: Tabula::Extraction::LineExtractor
- Inherits:
-
Object
- Object
- Tabula::Extraction::LineExtractor
- Defined in:
- lib/tabula/pdf_line_extractor.rb
Defined Under Namespace
Classes: AppendRectangleToPathOperator, CloseAndStrokePathOperator, CloseFillNonZeroAndStrokePathOperator, EndPathOperator, FillNonZeroRuleOperator, LineToOperator, MoveToOperator, StrokePathOperator
Constant Summary collapse
- DETECT_LINES_DEFAULTS =
{ :snapping_grid_cell_size => 2 }
- OPERATOR_PROCESSORS =
{ 'm' => MoveToOperator.new, 're' => AppendRectangleToPathOperator.new, 'l' => LineToOperator.new, 'S' => StrokePathOperator.new, 's' => StrokePathOperator.new, 'n' => EndPathOperator.new, 'b' => CloseFillNonZeroAndStrokePathOperator.new, 'b*' => CloseFillNonZeroAndStrokePathOperator.new, 'f' => CloseFillNonZeroAndStrokePathOperator.new, 'f*' => CloseFillNonZeroAndStrokePathOperator.new, 'BT' => org.apache.pdfbox.util.operator.BeginText.new, 'cm' => org.apache.pdfbox.util.operator.Concatenate.new, 'CS' => org.apache.pdfbox.util.operator.SetStrokingColorSpace.new, 'cs' => org.apache.pdfbox.util.operator.SetNonStrokingColorSpace.new, 'ET' => org.apache.pdfbox.util.operator.EndText.new, 'G' => org.apache.pdfbox.util.operator.SetStrokingGrayColor.new, 'g' => org.apache.pdfbox.util.operator.SetNonStrokingGrayColor.new, 'gs' => org.apache.pdfbox.util.operator.SetGraphicsStateParameters.new, 'K' => org.apache.pdfbox.util.operator.SetStrokingCMYKColor.new, 'k' => org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor.new, 'q' => org.apache.pdfbox.util.operator.GSave.new, 'Q' => org.apache.pdfbox.util.operator.GRestore.new, 'RG' => org.apache.pdfbox.util.operator.SetStrokingRGBColor.new, 'rg' => org.apache.pdfbox.util.operator.SetNonStrokingRGBColor.new, 'SC' => org.apache.pdfbox.util.operator.SetStrokingColor.new, 'sc' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new, 'SCN' => org.apache.pdfbox.util.operator.SetStrokingColor.new, 'scn' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new, 'T*' => org.apache.pdfbox.util.operator.NextLine.new, 'Tc' => org.apache.pdfbox.util.operator.SetCharSpacing.new, 'Td' => org.apache.pdfbox.util.operator.MoveText.new, 'TD' => org.apache.pdfbox.util.operator.MoveTextSetLeading.new, 'Tf' => org.apache.pdfbox.util.operator.SetTextFont.new, 'Tj' => org.apache.pdfbox.util.operator.ShowText.new, 'TJ' => org.apache.pdfbox.util.operator.ShowTextGlyph.new, 'TL' => org.apache.pdfbox.util.operator.SetTextLeading.new, 'Tm' => 
org.apache.pdfbox.util.operator.SetMatrix.new, 'Tr' => org.apache.pdfbox.util.operator.SetTextRenderingMode.new, 'Ts' => org.apache.pdfbox.util.operator.SetTextRise.new, 'Tw' => org.apache.pdfbox.util.operator.SetWordSpacing.new, 'Tz' => org.apache.pdfbox.util.operator.SetHorizontalTextScaling.new, "\'" => org.apache.pdfbox.util.operator.MoveAndShow.new, '\"' => org.apache.pdfbox.util.operator.SetMoveAndShow.new, }
Instance Attribute Summary collapse
-
#currentPath ⇒ Object
Returns the value of attribute currentPath.
-
#currentX ⇒ Object
Returns the value of attribute currentX.
-
#currentY ⇒ Object
Returns the value of attribute currentY.
-
#options ⇒ Object
Returns the value of attribute options.
-
#rulings ⇒ Object
Returns the value of attribute rulings.
Class Method Summary collapse
-
.collapse_horizontal_rulings(lines) ⇒ Object
lines should all be of one orientation (i.e. horizontal, vertical).
-
.collapse_vertical_rulings(lines) ⇒ Object
lines should all be of one orientation (i.e. horizontal, vertical).
-
.lines_in_pdf_page(pdf_path, page_number, options = {}) ⇒ Object
N.B.
Instance Method Summary collapse
- #addRuling(ruling, color = nil) ⇒ Object
- #clear! ⇒ Object
-
#fixY(y) ⇒ Object
fix the Y coordinate based on page rotation.
-
#initialize(options = {}) ⇒ LineExtractor
constructor
A new instance of LineExtractor.
-
#pageSize ⇒ Object
get current page size.
- #ScaledPoint(*args) ⇒ Object
- #TransformedPoint(x, y) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ LineExtractor
Returns a new instance of LineExtractor.
236 237 238 239 240 241 |
# File 'lib/tabula/pdf_line_extractor.rb', line 236 def initialize(options={}) super() @options = options.merge!(DETECT_LINES_DEFAULTS) self.clear! OPERATOR_PROCESSORS.each { |k,v| registerOperatorProcessor(k, v) } end |
Instance Attribute Details
#currentPath ⇒ Object
Returns the value of attribute currentPath.
17 18 19 |
# File 'lib/tabula/pdf_line_extractor.rb', line 17 def currentPath @currentPath end |
#currentX ⇒ Object
Returns the value of attribute currentX.
16 17 18 |
# File 'lib/tabula/pdf_line_extractor.rb', line 16 def currentX @currentX end |
#currentY ⇒ Object
Returns the value of attribute currentY.
16 17 18 |
# File 'lib/tabula/pdf_line_extractor.rb', line 16 def currentY @currentY end |
#options ⇒ Object
Returns the value of attribute options.
19 20 21 |
# File 'lib/tabula/pdf_line_extractor.rb', line 19 def options @options end |
#rulings ⇒ Object
Returns the value of attribute rulings.
18 19 20 |
# File 'lib/tabula/pdf_line_extractor.rb', line 18 def rulings @rulings end |
Class Method Details
.collapse_horizontal_rulings(lines) ⇒ Object
lines should all be of one orientation (i.e. horizontal, vertical)
39 40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/tabula/pdf_line_extractor.rb', line 39 def self.collapse_horizontal_rulings(lines) #lines should all be of one orientation (i.e. horizontal, vertical) lines.sort!{|a, b| a.top != b.top ? a.top <=> b.top : a.left <=> b.left } lines.inject([]) do |memo, next_line| if memo.last && next_line.top == memo.last.top && memo.last.nearlyIntersects?(next_line) memo.last.left = [next_line.left, memo.last.left].min memo.last.right = [next_line.right, memo.last.right].max memo else memo << next_line end end end |
.collapse_vertical_rulings(lines) ⇒ Object
lines should all be of one orientation (i.e. horizontal, vertical)
26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/tabula/pdf_line_extractor.rb', line 26 def self.collapse_vertical_rulings(lines) #lines should all be of one orientation (i.e. horizontal, vertical) lines.sort!{|a, b| a.left != b.left ? a.left <=> b.left : a.top <=> b.top } lines.inject([]) do |memo, next_line| if memo.last && next_line.left == memo.last.left && memo.last.nearlyIntersects?(next_line) memo.last.top = [next_line.top, memo.last.top].min memo.last.bottom = [next_line.bottom, memo.last.bottom].max memo else memo << next_line end end end |
.lines_in_pdf_page(pdf_path, page_number, options = {}) ⇒ Object
N.B. for merge `spreadsheets` into `text-extractor-refactor` – the only substantive change here is calling Tabula::Ruling::clean_rulings on LSD output in this method; the rest is readability changes. page_number here is zero-indexed.
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# File 'lib/tabula/pdf_line_extractor.rb', line 56 def self.lines_in_pdf_page(pdf_path, page_number, options={}) options = options.merge!(DETECT_LINES_DEFAULTS) if options[:render_pdf] # only LSD rulings need to be "cleaned" with clean_rulings; might as well do this here # since there's no good reason want unclean lines Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(pdf_path, page_number, options)) else pdf_file = ::Tabula::Extraction.openPDF(pdf_path) page = pdf_file.getDocumentCatalog.getAllPages[page_number] le = self.new(options) le.processStream(page, page.findResources, page.getContents.getStream) pdf_file.close rulings = le.rulings.map do |l, color| ::Tabula::Ruling.new(l.getP1.getY, l.getP1.getX, l.getP2.getX - l.getP1.getX, l.getP2.getY - l.getP1.getY, color) end rulings.reject! { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } } collapse_vertical_rulings(rulings.select(&:vertical?)) + collapse_horizontal_rulings(rulings.select(&:horizontal?)) end end |
Instance Method Details
#addRuling(ruling, color = nil) ⇒ Object
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 |
# File 'lib/tabula/pdf_line_extractor.rb', line 251 def addRuling(ruling, color=nil) color = color.nil? ? [0,0,0] : color if !page.getRotation.nil? && [90, -270, -90, 270].include?(page.getRotation) mb = page.findMediaBox ruling.rotate!(mb.getLowerLeftX, mb.getLowerLeftY, page.getRotation) trans = if page.getRotation == 90 || page.getRotation == -270 AffineTransform.getTranslateInstance(mb.getHeight, 0) else AffineTransform.getTranslateInstance(0, mb.getWidth) end ruling.transform!(trans) end # snapping to grid and joining lines that are close together ruling.snap!(options[:snapping_grid_cell_size]) self.rulings << [ruling, color] end |
#clear! ⇒ Object
243 244 245 246 247 248 249 |
# File 'lib/tabula/pdf_line_extractor.rb', line 243 def clear! self.rulings = [] self.currentX = -1 self.currentY = -1 self.currentPath = [] @pageSize = nil end |
#fixY(y) ⇒ Object
flip the Y coordinate against the page height (returns pageSize.getHeight - y); the method itself does not consult the page rotation
281 282 283 |
# File 'lib/tabula/pdf_line_extractor.rb', line 281 def fixY(y) pageSize.getHeight - y end |
#pageSize ⇒ Object
get current page size
275 276 277 |
# File 'lib/tabula/pdf_line_extractor.rb', line 275 def pageSize @pageSize ||= self.page.findMediaBox.createDimension end |
#ScaledPoint(*args) ⇒ Object
285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 |
# File 'lib/tabula/pdf_line_extractor.rb', line 285 def ScaledPoint(*args) x, y = args[0], args[1] # if scale factor not provided, get it from current transformation matrix if args.size == 2 ctm = getGraphicsState.getCurrentTransformationMatrix at = ctm.createAffineTransform scaleX = at.getScaleX; scaleY = at.getScaleY else scaleX = args[2]; scaleY = args[3] end finalX = 0.0; finalY = 0.0; if scaleX > 0 finalX = x * scaleX; end if scaleY > 0 finalY = y * scaleY; end return java.awt.geom.Point2D::Float.new(finalX, finalY); end |
#TransformedPoint(x, y) ⇒ Object
311 312 313 314 315 316 317 |
# File 'lib/tabula/pdf_line_extractor.rb', line 311 def TransformedPoint(x, y) position = [x,y].to_java(:float) at = self.getGraphicsState.getCurrentTransformationMatrix.createAffineTransform at.transform(position, 0, position, 0, 1) position[1] = fixY(position[1]) java.awt.geom.Point2D::Float.new(position[0], position[1]) end |