Class: Tabula::Extraction::LineExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/tabula/pdf_line_extractor.rb

Defined Under Namespace

Classes: AppendRectangleToPathOperator, CloseAndStrokePathOperator, CloseFillNonZeroAndStrokePathOperator, EndPathOperator, FillNonZeroRuleOperator, LineToOperator, MoveToOperator, StrokePathOperator

Constant Summary collapse

# Default option values for line detection. They are combined with the
# caller-supplied options hash in #initialize and .lines_in_pdf_page.
#
# :snapping_grid_cell_size is the value #addRuling passes to Ruling#snap!.
#
# Frozen so the shared defaults cannot be modified by accident; build a
# per-call hash with DETECT_LINES_DEFAULTS.merge(...) instead.
DETECT_LINES_DEFAULTS =
{
  :snapping_grid_cell_size => 2
}.freeze
# Maps PDF content-stream operator names to the processor objects that
# #initialize hands to registerOperatorProcessor.
OPERATOR_PROCESSORS =
{
  # Path construction / painting operators, handled by the operator classes
  # defined under this namespace.
  'm' => MoveToOperator.new,
  're' => AppendRectangleToPathOperator.new,
  'l' => LineToOperator.new,
  'S' => StrokePathOperator.new,
  # NOTE(review): 's' reuses StrokePathOperator although a
  # CloseAndStrokePathOperator class exists in this namespace -- confirm
  # that this is intentional.
  's' => StrokePathOperator.new,
  'n' => EndPathOperator.new,
  'b' => CloseFillNonZeroAndStrokePathOperator.new,
  'b*' => CloseFillNonZeroAndStrokePathOperator.new,
  # NOTE(review): 'f' / 'f*' use the close-fill-and-stroke processor although
  # a FillNonZeroRuleOperator class exists -- confirm that this is intentional.
  'f' => CloseFillNonZeroAndStrokePathOperator.new,
  'f*' => CloseFillNonZeroAndStrokePathOperator.new,

  # Graphics-state and text operators, delegated to PDFBox's processors.
  'BT' => org.apache.pdfbox.util.operator.BeginText.new,
  'cm' => org.apache.pdfbox.util.operator.Concatenate.new,
  'CS' => org.apache.pdfbox.util.operator.SetStrokingColorSpace.new,
  'cs' => org.apache.pdfbox.util.operator.SetNonStrokingColorSpace.new,
  'ET' => org.apache.pdfbox.util.operator.EndText.new,
  'G' => org.apache.pdfbox.util.operator.SetStrokingGrayColor.new,
  'g' => org.apache.pdfbox.util.operator.SetNonStrokingGrayColor.new,
  'gs' => org.apache.pdfbox.util.operator.SetGraphicsStateParameters.new,
  'K' => org.apache.pdfbox.util.operator.SetStrokingCMYKColor.new,
  'k' => org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor.new,
  'q' => org.apache.pdfbox.util.operator.GSave.new,
  'Q' => org.apache.pdfbox.util.operator.GRestore.new,
  'RG' => org.apache.pdfbox.util.operator.SetStrokingRGBColor.new,
  'rg' => org.apache.pdfbox.util.operator.SetNonStrokingRGBColor.new,
  'SC' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
  'sc' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
  'SCN' => org.apache.pdfbox.util.operator.SetStrokingColor.new,
  'scn' => org.apache.pdfbox.util.operator.SetNonStrokingColor.new,
  'T*' => org.apache.pdfbox.util.operator.NextLine.new,
  'Tc' => org.apache.pdfbox.util.operator.SetCharSpacing.new,
  'Td' => org.apache.pdfbox.util.operator.MoveText.new,
  'TD' => org.apache.pdfbox.util.operator.MoveTextSetLeading.new,
  'Tf' => org.apache.pdfbox.util.operator.SetTextFont.new,
  'Tj' => org.apache.pdfbox.util.operator.ShowText.new,
  'TJ' => org.apache.pdfbox.util.operator.ShowTextGlyph.new,
  'TL' => org.apache.pdfbox.util.operator.SetTextLeading.new,
  'Tm' => org.apache.pdfbox.util.operator.SetMatrix.new,
  'Tr' => org.apache.pdfbox.util.operator.SetTextRenderingMode.new,
  'Ts' => org.apache.pdfbox.util.operator.SetTextRise.new,
  'Tw' => org.apache.pdfbox.util.operator.SetWordSpacing.new,
  'Tz' => org.apache.pdfbox.util.operator.SetHorizontalTextScaling.new,
  "'" => org.apache.pdfbox.util.operator.MoveAndShow.new,
  # The key has to be the bare one-character double-quote operator. The
  # former literal '\"' is a TWO-character string in Ruby (single-quoted
  # strings only unescape \' and \\), so it was registered under the name
  # backslash + quote instead of the PDF operator itself.
  '"' => org.apache.pdfbox.util.operator.SetMoveAndShow.new,
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ LineExtractor

Returns a new instance of LineExtractor.



236
237
238
239
240
241
# File 'lib/tabula/pdf_line_extractor.rb', line 236

# Builds a LineExtractor, resets its state with #clear! and registers every
# entry of OPERATOR_PROCESSORS through registerOperatorProcessor.
#
# @param options [Hash] overrides for DETECT_LINES_DEFAULTS; keys given here
#   take precedence over the defaults. The hash passed in is not modified.
def initialize(options={})
  super()
  # Defaults are the receiver so that caller-supplied values win. The old
  # `options.merge!(DETECT_LINES_DEFAULTS)` did the opposite: it overwrote the
  # caller's settings with the defaults and mutated the caller's hash.
  @options = DETECT_LINES_DEFAULTS.merge(options)
  self.clear!
  OPERATOR_PROCESSORS.each { |k,v| registerOperatorProcessor(k, v) }
end

Instance Attribute Details

#currentPath ⇒ Object

Returns the value of attribute currentPath.



17
18
19
# File 'lib/tabula/pdf_line_extractor.rb', line 17

# Reader for the @currentPath instance variable (#clear! resets it to an
# empty array).
def currentPath
  @currentPath
end

#currentX ⇒ Object

Returns the value of attribute currentX.



16
17
18
# File 'lib/tabula/pdf_line_extractor.rb', line 16

# Reader for the @currentX instance variable (#clear! resets it to -1).
def currentX
  @currentX
end

#currentY ⇒ Object

Returns the value of attribute currentY.



16
17
18
# File 'lib/tabula/pdf_line_extractor.rb', line 16

# Reader for the @currentY instance variable (#clear! resets it to -1).
def currentY
  @currentY
end

#options ⇒ Object

Returns the value of attribute options.



19
20
21
# File 'lib/tabula/pdf_line_extractor.rb', line 19

# Reader for the @options hash assigned in #initialize; #addRuling reads
# :snapping_grid_cell_size from it.
def options
  @options
end

#rulings ⇒ Object

Returns the value of attribute rulings.



18
19
20
# File 'lib/tabula/pdf_line_extractor.rb', line 18

# Reader for the @rulings collection. #clear! resets it to an empty array and
# #addRuling appends [ruling, color] pairs to it.
def rulings
  @rulings
end

Class Method Details

.collapse_horizontal_rulings(lines) ⇒ Object

lines should all be of one orientation (for this method: horizontal)



39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/tabula/pdf_line_extractor.rb', line 39

# Merges horizontal rulings that share the same +top+ and nearly intersect.
#
# +lines+ is sorted IN PLACE by (top, left). Walking the sorted list, a line
# that has the same +top+ as the last kept line and for which
# +nearlyIntersects?+ holds is folded into that kept line by widening its
# +left+/+right+ (the kept object itself is mutated); every other line is
# kept as-is.
#
# @param lines rulings of a single orientation (horizontal)
# @return [Array] the kept, possibly widened, rulings
def self.collapse_horizontal_rulings(lines)
  lines.sort! do |a, b|
    if a.top == b.top
      a.left <=> b.left
    else
      a.top <=> b.top
    end
  end

  lines.each_with_object([]) do |candidate, collapsed|
    tail = collapsed.last
    if tail && candidate.top == tail.top && tail.nearlyIntersects?(candidate)
      # Same row and touching: grow the kept ruling to span both.
      tail.left = [candidate.left, tail.left].min
      tail.right = [candidate.right, tail.right].max
    else
      collapsed << candidate
    end
  end
end

.collapse_vertical_rulings(lines) ⇒ Object

lines should all be of one orientation (for this method: vertical)



26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/tabula/pdf_line_extractor.rb', line 26

# Merges vertical rulings that share the same +left+ and nearly intersect.
#
# +lines+ is sorted IN PLACE by (left, top). Walking the sorted list, a line
# that has the same +left+ as the last kept line and for which
# +nearlyIntersects?+ holds is folded into that kept line by stretching its
# +top+/+bottom+ (the kept object itself is mutated); every other line is
# kept as-is.
#
# @param lines rulings of a single orientation (vertical)
# @return [Array] the kept, possibly stretched, rulings
def self.collapse_vertical_rulings(lines)
  lines.sort! do |a, b|
    if a.left == b.left
      a.top <=> b.top
    else
      a.left <=> b.left
    end
  end

  lines.each_with_object([]) do |candidate, collapsed|
    tail = collapsed.last
    if tail && candidate.left == tail.left && tail.nearlyIntersects?(candidate)
      # Same column and touching: grow the kept ruling to span both.
      tail.top = [candidate.top, tail.top].min
      tail.bottom = [candidate.bottom, tail.bottom].max
    else
      collapsed << candidate
    end
  end
end

.lines_in_pdf_page(pdf_path, page_number, options = {}) ⇒ Object

N.B. for the merge of `spreadsheets` into `text-extractor-refactor`: the only substantive change here is calling Tabula::Ruling::clean_rulings on the LSD output in this method; the rest is readability changes. page_number here is zero-indexed.



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/tabula/pdf_line_extractor.rb', line 56

# Detects ruling lines on one page of a PDF.
#
# With options[:render_pdf] set, detection is delegated to
# Tabula::LSD.detect_lines_in_pdf_page and the result is passed through
# Tabula::Ruling.clean_rulings. Otherwise the page's content stream is run
# through a LineExtractor and the collected rulings are filtered and
# collapsed.
#
# @param pdf_path path handed to Tabula::Extraction.openPDF / Tabula::LSD
# @param page_number zero-indexed page number
# @param options [Hash] overrides for DETECT_LINES_DEFAULTS; values given
#   here take precedence over the defaults and the hash passed in is not
#   modified
# @return in the non-rendering branch, the collapsed vertical rulings
#   followed by the collapsed horizontal rulings
def self.lines_in_pdf_page(pdf_path, page_number, options={})
  # Defaults are the receiver so caller-supplied values win; the previous
  # `options.merge!(DETECT_LINES_DEFAULTS)` overwrote them and mutated the
  # caller's hash.
  options = DETECT_LINES_DEFAULTS.merge(options)
  if options[:render_pdf]
    # only LSD rulings need to be "cleaned" with clean_rulings; might as well do this here
    # since there's no good reason to want unclean lines
    Tabula::Ruling::clean_rulings(Tabula::LSD::detect_lines_in_pdf_page(pdf_path, page_number, options))
  else
    pdf_file = ::Tabula::Extraction.openPDF(pdf_path)
    begin
      page = pdf_file.getDocumentCatalog.getAllPages[page_number]
      le = self.new(options)
      le.processStream(page, page.findResources, page.getContents.getStream)
    ensure
      # Close the document even when page lookup or stream processing raises.
      pdf_file.close
    end
    # Each extracted entry is a [line, color] pair (see #addRuling).
    # NOTE(review): the Ruling.new arguments are presumably
    # (top, left, width, height, color) -- confirm against Tabula::Ruling.
    rulings = le.rulings.map do |l, color|
      ::Tabula::Ruling.new(l.getP1.getY,
                           l.getP1.getX,
                           l.getP2.getX - l.getP1.getX,
                           l.getP2.getY - l.getP1.getY,
                           color)
    end
    # Discard zero-size rulings and any ruling with a negative coordinate.
    rulings.reject! { |l| (l.left == l.right && l.top == l.bottom) || [l.top, l.left, l.bottom, l.right].any? { |p| p < 0 } }
    collapse_vertical_rulings(rulings.select(&:vertical?)) + collapse_horizontal_rulings(rulings.select(&:horizontal?))
  end
end

Instance Method Details

#addRuling(ruling, color = nil) ⇒ Object



251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# File 'lib/tabula/pdf_line_extractor.rb', line 251

# Records a ruling together with its colour.
#
# When the page rotation is 90, -270, -90 or 270 the ruling is first rotated
# around the media box's lower-left corner and then translated (by the media
# box height along x for 90/-270, otherwise by its width along y). Every
# ruling is then snapped with options[:snapping_grid_cell_size] and appended
# to #rulings as a [ruling, color] pair.
#
# @param ruling the ruling to store; it is modified in place
# @param color the ruling's colour; [0,0,0] is used when nil
# @return the rulings collection after the append
def addRuling(ruling, color=nil)
  color = [0,0,0] if color.nil?

  rotation = page.getRotation
  # A nil rotation is simply not a member of the list, so no separate nil
  # check is needed.
  if [90, -270, -90, 270].include?(rotation)
    media_box = page.findMediaBox
    ruling.rotate!(media_box.getLowerLeftX, media_box.getLowerLeftY, rotation)

    shift = case rotation
            when 90, -270
              AffineTransform.getTranslateInstance(media_box.getHeight, 0)
            else
              AffineTransform.getTranslateInstance(0, media_box.getWidth)
            end
    ruling.transform!(shift)
  end

  # snapping to grid and joining lines that are close together
  ruling.snap!(options[:snapping_grid_cell_size])

  rulings << [ruling, color]
end

#clear! ⇒ Object



243
244
245
246
247
248
249
# File 'lib/tabula/pdf_line_extractor.rb', line 243

# Resets the extractor's per-page state: the current position goes back to
# (-1, -1), the current path and the collected rulings become fresh empty
# arrays, and the memoised page size is dropped so #pageSize recomputes it.
#
# @return [nil]
def clear!
  self.currentX = self.currentY = -1
  self.currentPath = []
  self.rulings = []
  @pageSize = nil
end

#fixY(y) ⇒ Object

Flips the Y coordinate against the page height (returns pageSize.getHeight - y).



281
282
283
# File 'lib/tabula/pdf_line_extractor.rb', line 281

# Flips a Y coordinate against the page height.
#
# @param y the Y coordinate to flip
# @return the page height (pageSize.getHeight) minus +y+
def fixY(y)
  page_height = pageSize.getHeight
  page_height - y
end

#pageSize ⇒ Object

get current page size



275
276
277
# File 'lib/tabula/pdf_line_extractor.rb', line 275

# Returns the current page's size, taken from the page's media box
# (findMediaBox.createDimension). The value is memoised in @pageSize until
# #clear! resets it.
def pageSize
  return @pageSize if @pageSize

  @pageSize = self.page.findMediaBox.createDimension
end

#ScaledPoint(*args) ⇒ Object



285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# File 'lib/tabula/pdf_line_extractor.rb', line 285

# Builds a java.awt.geom.Point2D::Float from (x, y) multiplied by a scale.
#
# Call as ScaledPoint(x, y) to take the scale factors from the current
# transformation matrix of the graphics state, or as
# ScaledPoint(x, y, scaleX, scaleY) to supply them. A coordinate whose scale
# factor is not greater than zero comes out as 0.0.
def ScaledPoint(*args)
  x, y = args

  # if scale factor not provided, get it from current transformation matrix
  scale_x, scale_y =
    if args.size == 2
      transform = getGraphicsState.getCurrentTransformationMatrix.createAffineTransform
      [transform.getScaleX, transform.getScaleY]
    else
      [args[2], args[3]]
    end

  final_x = scale_x > 0 ? x * scale_x : 0.0
  final_y = scale_y > 0 ? y * scale_y : 0.0

  java.awt.geom.Point2D::Float.new(final_x, final_y)
end

#TransformedPoint(x, y) ⇒ Object



311
312
313
314
315
316
317
# File 'lib/tabula/pdf_line_extractor.rb', line 311

# Applies the graphics state's current transformation matrix to (x, y),
# passes the resulting Y through #fixY, and returns the result as a
# java.awt.geom.Point2D::Float.
def TransformedPoint(x, y)
  coords = [x, y].to_java(:float)
  matrix = self.getGraphicsState.getCurrentTransformationMatrix
  # Transform the single point in place: source and destination are the
  # same array.
  matrix.createAffineTransform.transform(coords, 0, coords, 0, 1)
  coords[1] = fixY(coords[1])
  java.awt.geom.Point2D::Float.new(coords[0], coords[1])
end