Class: Tabula::Extraction::ObjectExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/tabula/extraction.rb

Direct Known Subclasses

SpreadsheetExtractor

Constant Summary collapse

# Matches any single printable character; #processTextPosition uses it to
# drop glyphs that are not printable.
PRINTABLE_RE = /[[:print:]]/

# Defaults that #initialize merges the caller's options over.
# Frozen so the shared defaults cannot be mutated by accident; Hash#merge
# still returns a fresh, unfrozen hash.
DEFAULT_OPTIONS = {
  :line_color_filter => nil,
  :extract_ruling_lines => true
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(pdf_filename, pages = [1], password = '', options = {}) ⇒ ObjectExtractor

Returns a new instance of ObjectExtractor.

Raises:

  • (Errno::ENOENT)


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/tabula/extraction.rb', line 34

# Opens the PDF and prepares the extractor state.
#
# @param pdf_filename [String] path of the PDF to read
# @param pages [Enumerable<Integer>, Symbol] one-based page numbers (see
#   #extract, which fetches index +i - 1+), or +:all+ for every page
# @param password [String] forwarded to Extraction.openPDF
# @param options [Hash] merged over DEFAULT_OPTIONS
# @raise [Errno::ENOENT] when +pdf_filename+ does not exist
def initialize(pdf_filename, pages=[1], password='', options={})
  # File.exists? is deprecated and was removed in Ruby 3.2; File.exist? is
  # the supported spelling. The path is attached so the error says which
  # file is missing.
  raise Errno::ENOENT, pdf_filename.to_s unless File.exist?(pdf_filename)
  @pdf_filename = pdf_filename
  @pdf_file = Extraction.openPDF(pdf_filename, password)
  @all_pages = @pdf_file.getDocumentCatalog.getAllPages
  @pages = pages == :all ?  (1..@all_pages.size) : pages

  super()

  self.options = DEFAULT_OPTIONS.merge(options)
  self.characters = []
  @debug_clipping_paths = false
  @clipping_path = nil
  @transformed_clipping_path = nil
  self.clipping_paths = []
  @rulings = []
  # Start high so the first processed character always lowers the minimum.
  @min_char_width = @min_char_height = 1000000
end

Instance Attribute Details

#characters ⇒ Object

Returns the value of attribute characters.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @characters. In the code shown on this page, #initialize sets it
# to an empty array, #processTextPosition appends text elements to it, and
# #clear! empties it.
def characters
  @characters
end

#clipping_paths ⇒ Object

Returns the value of attribute clipping_paths.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @clipping_paths. #initialize sets it to an empty array,
# #processTextPosition appends a zone for each new clip bounds when
# debug_clipping_paths is truthy, and #clear! empties it.
def clipping_paths
  @clipping_paths
end

#debug_clipping_paths ⇒ Object

Returns the value of attribute debug_clipping_paths.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @debug_clipping_paths, which #initialize sets to false.
# #processTextPosition only records clip bounds in clipping_paths when this
# is truthy.
def debug_clipping_paths
  @debug_clipping_paths
end

#debug_text ⇒ Object

Returns the value of attribute debug_text.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @debug_text. None of the methods shown on this page assign it,
# so it is nil unless something else sets it.
# NOTE(review): presumably paired with a writer (attr_accessor) — confirm.
def debug_text
  @debug_text
end

#options ⇒ Object

Returns the value of attribute options.



25
26
27
# File 'lib/tabula/extraction.rb', line 25

# Reader for @options: the hash #initialize builds by merging the caller's
# options over DEFAULT_OPTIONS. #strokePath reads :extract_ruling_lines and
# :line_color_filter from it.
def options
  @options
end

Instance Method Details

#clear! ⇒ Object



80
81
82
83
84
85
# File 'lib/tabula/extraction.rb', line 80

# Resets the per-page state: empties the collected characters, clipping
# paths and rulings in place and drops the cached page transform so that
# #pageTransform recomputes it. Returns the (now empty) rulings array.
def clear!
  [characters, clipping_paths].each(&:clear)
  @page_transform = nil
  @rulings.clear
end

#currentClippingPath ⇒ Object



186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/tabula/extraction.rb', line 186

# Returns the bounds of the graphics state's current clipping path after it
# has been passed through #transformPath.
#
# The transformed path and its bounds are cached and only recomputed when the
# graphics state reports a clipping path that differs (==) from the one seen
# on the previous call.
def currentClippingPath
  current = self.getGraphicsState.getCurrentClippingPath

  unless current == @clipping_path
    @clipping_path = current
    @transformed_clipping_path = self.transformPath(current)
    @transformed_clipping_path_bounds = @transformed_clipping_path.getBounds
  end

  @transformed_clipping_path_bounds
end

#drawImage(image, at) ⇒ Object



162
163
# File 'lib/tabula/extraction.rb', line 162

# No-op: the body is empty, so the arguments are ignored and nil is returned.
# NOTE(review): presumably overrides an image-drawing callback of the
# superclass so that images are skipped — confirm.
def drawImage(image, at)
end

#drawPage(page) ⇒ Object



94
95
96
97
98
99
100
101
102
# File 'lib/tabula/extraction.rb', line 94

# Makes +page+ the current page and, when it has a content stream, ensures
# the page size is set and feeds the stream to #processStream together with
# the page and its resources.
#
# Returns nil for a page without contents, otherwise whatever
# #processStream returns.
def drawPage(page)
  self.page = page
  return if self.page.getContents.nil?

  ensurePageSize!
  current = self.page
  self.processStream(current, current.findResources, current.getContents.getStream)
end

#ensurePageSize! ⇒ Object



87
88
89
90
91
92
# File 'lib/tabula/extraction.rb', line 87

# Sets pageSize from the current page's media box, but only when no page
# size is set yet and a current page exists. A page without a media box
# leaves pageSize nil.
def ensurePageSize!
  return unless self.pageSize.nil? && !self.page.nil?

  media_box = self.page.findMediaBox
  self.pageSize = media_box.nil? ? nil : media_box.createDimension
end

#extract ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/tabula/extraction.rb', line 53

# Returns an Enumerator that yields one Tabula::Page per requested page.
#
# For each page number in @pages the matching PDF page is fetched (page
# numbers are one-based, the page list is zero-based). Pages without a
# content stream are skipped. Otherwise the per-page state is cleared, the
# page is drawn, and a Tabula::Page is built from the crop box size,
# rotation, collected characters and rulings, and the smallest character
# width/height seen so far.
#
# The PDF document is closed when the iteration ends, including when it
# ends with an exception.
def extract
  Enumerator.new do |yielder|
    begin
      @pages.each do |page_number|
        pdf_page = @all_pages.get(page_number - 1)
        next if pdf_page.getContents.nil?

        self.clear!
        self.drawPage(pdf_page)

        crop_box = pdf_page.findCropBox
        yielder << Tabula::Page.new(@pdf_filename,
                                    crop_box.width,
                                    crop_box.height,
                                    pdf_page.getRotation.to_i,
                                    page_number, # one-indexed, as requested by the caller
                                    self.characters,
                                    self.rulings,
                                    @min_char_width,
                                    @min_char_height)
      end
    ensure
      @pdf_file.close
    end
  end
end

#fillPath(windingRule) ⇒ Object



158
159
160
# File 'lib/tabula/extraction.rb', line 158

# Treats a filled path like a stroked one: hands the RGB components of the
# current non-stroking colour to #strokePath. +windingRule+ is not used.
def fillPath(windingRule)
  fill_color = self.getGraphicsState.getNonStrokingColor.getJavaColor
  self.strokePath(fill_color.getRGBColorComponents(nil))
end

#getStroke ⇒ Object



108
109
110
# File 'lib/tabula/extraction.rb', line 108

# Returns the stroke most recently stored by #setStroke (nil if none was set).
def getStroke
  @basicStroke
end

#page_count ⇒ Object



244
245
246
# File 'lib/tabula/extraction.rb', line 244

# Number of pages in the document: the size of the page list that
# #initialize reads from the document catalog.
def page_count
  @all_pages.size
end

#pageTransform ⇒ Object



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/tabula/extraction.rb', line 169

# Returns the AffineTransform applied to paths of the current page, building
# and caching it on first use (#clear! drops the cache).
#
# Pages rotated by +/-90 or +/-270 degrees get a horizontal flip followed by
# a rotation of that angle around the crop box's lower-left corner; every
# other page gets a vertical flip followed by a translation by the crop box
# height.
def pageTransform
  return @page_transform unless @page_transform.nil?

  crop_box = page.findCropBox
  if [90, -270, -90, 270].include?(page.getRotation)
    @page_transform = AffineTransform.getScaleInstance(-1, 1)
    @page_transform.rotate(page.getRotation * (Math::PI/180.0),
                           crop_box.getLowerLeftX, crop_box.getLowerLeftY)
  else
    @page_transform = AffineTransform.getScaleInstance(1, -1)
    @page_transform.translate(0, -crop_box.getHeight)
  end
  @page_transform
end

#processTextPosition(text) ⇒ Object



200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/tabula/extraction.rb', line 200

# Turns one text position into a Tabula::TextElement and records it.
#
# Plain and non-breaking spaces are both stored as ' ' and take their width
# as their height. The smallest character width and height seen so far are
# tracked in @min_char_width / @min_char_height. When debug_clipping_paths is
# truthy, each new clip bounds is also recorded in clipping_paths.
#
# The element is appended to +characters+ only when its character is
# printable and the current clipping bounds intersect it.
#
# @param text the text position being processed; the code reads its
#   character, position, size, font and direction through its get* methods
def processTextPosition(text)
  c = text.getCharacter
  h = text.getHeightDir.round(2)

  # The non-breaking space (U+00A0) is written as an escape: as a literal it
  # is indistinguishable from a plain space.
  if c == ' ' || c == "\u00A0" # replace non-breaking space for space
    c = ' '
    # ugly hack: we need spaces to have a height, so we can test for
    # vertical overlap. height == width seems a safe bet.
    h = text.getWidth.round(2)
  end

  te = Tabula::TextElement.new(text.getY.round(2) - h,
                               text.getX.round(2),
                               text.getWidth.round(2),
                               h,
                               text.getFont,
                               text.getFontSize.round(2),
                               c,
                               # workaround a possible bug in PDFBox: https://issues.apache.org/jira/browse/PDFBOX-1755
                               text.getWidthOfSpace == 0 ? self.currentSpaceWidth : text.getWidthOfSpace,
                               text.getDir)

  ccp_bounds = self.currentClippingPath

  if self.debug_clipping_paths && !self.clipping_paths.include?(ccp_bounds)
    self.clipping_paths << ::Tabula::ZoneEntity.new(ccp_bounds.getMinY,
                                                    ccp_bounds.getMinX,
                                                    ccp_bounds.getWidth,
                                                    ccp_bounds.getHeight)
  end

  @min_char_width = te.width if te.width < @min_char_width
  @min_char_height = te.height if te.height < @min_char_height

  self.characters << te if c =~ PRINTABLE_RE && ccp_bounds.intersects(te)
end

#rulings ⇒ Object



248
249
250
251
# File 'lib/tabula/extraction.rb', line 248

# Returns a new array with the collected rulings that are usable: rulings
# that collapse to a single point (left == right and top == bottom) and
# rulings with any negative edge coordinate are left out. @rulings itself is
# not modified.
def rulings
  @rulings.reject do |line|
    edges = [line.top, line.left, line.bottom, line.right]
    zero_length = line.left == line.right && line.top == line.bottom
    zero_length || edges.any? { |coord| coord < 0 }
  end
end

#setStroke(stroke) ⇒ Object



104
105
106
# File 'lib/tabula/extraction.rb', line 104

# Stores +stroke+ so that #getStroke can return it later.
def setStroke(stroke)
  @basicStroke = stroke
end

#strokePath(filter_by_color = nil) ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/tabula/extraction.rb', line 113

# Converts the straight segments of the current line path into
# Tabula::Ruling entries appended to @rulings, then resets the line path.
#
# Nothing is collected (the path is only reset) when the
# :extract_ruling_lines option is falsy, when the path has no segments, when
# it does not start with a MOVETO, or when it contains any segment other
# than MOVETO / LINETO / CLOSE.
#
# A LINETO segment becomes a ruling only if the optional
# :line_color_filter accepts the colour and the segment intersects the
# current clipping bounds; the ruling is the segment's bounding box clipped
# to those bounds.
#
# @param filter_by_color colour components used instead of the current
#   stroking colour (#fillPath passes the non-stroking colour); nil means
#   "use the stroking colour"
def strokePath(filter_by_color=nil)
  unless self.options[:extract_ruling_lines]
    self.getLinePath.reset
    return
  end

  path = self.pathToList(self.getLinePath)

  # An empty path has no first segment: check for it before indexing, as
  # path[0][0] would otherwise raise NoMethodError on nil.
  if path.empty? \
    || path[0][0] != java.awt.geom.PathIterator::SEG_MOVETO \
    || path[1..-1].any? { |seg| seg.first != java.awt.geom.PathIterator::SEG_LINETO && seg.first != java.awt.geom.PathIterator::SEG_MOVETO && seg.first != java.awt.geom.PathIterator::SEG_CLOSE }
    self.getLinePath.reset
    return
  end

  ccp_bounds = self.currentClippingPath

  strokeColorComps = filter_by_color || self.getGraphicsState.getStrokingColor.getJavaColor.getRGBColorComponents(nil)
  color_filter = self.options[:line_color_filter]

  first = path.shift
  start_pos = java.awt.geom.Point2D::Float.new(first[1][0], first[1][1])

  path.each do |seg|
    end_pos = java.awt.geom.Point2D::Float.new(seg[1][0], seg[1][1])
    # Order the end points so that the line always starts at the "smaller" one.
    line = (start_pos <=> end_pos) == -1 \
      ? java.awt.geom.Line2D::Float.new(start_pos, end_pos) \
      : java.awt.geom.Line2D::Float.new(end_pos, start_pos)

    if seg[0] == java.awt.geom.PathIterator::SEG_LINETO \
      && (color_filter.nil? ? true : color_filter.call(strokeColorComps)) \
      && line.intersects(ccp_bounds)
      # convert line to rectangle for clipping it to the current clippath
      # sucks, but awt doesn't have methods for this
      tmp = line.getBounds2D.createIntersection(ccp_bounds).getBounds2D
      # NOTE(review): the ruling receives filter_by_color, which is empty
      # ([]) for ordinary strokes even though strokeColorComps holds the
      # stroking colour — confirm whether that is intended.
      @rulings << ::Tabula::Ruling.new(tmp.getY,
                                       tmp.getX,
                                       tmp.getWidth,
                                       tmp.getHeight,
                                       filter_by_color.to_a)
    end
    # Every segment, whatever its type, becomes the start of the next one.
    start_pos = end_pos
  end
  self.getLinePath.reset
end

#transformPath(path) ⇒ Object



165
166
167
# File 'lib/tabula/extraction.rb', line 165

# Applies the page transform (see #pageTransform) to +path+ and returns the
# shape produced by createTransformedShape.
def transformPath(path)
  self.pageTransform.createTransformedShape(path)
end