Module: Tabula

Defined in: lib/tabula.rb,
lib/tabula/version.rb,
lib/tabula/writers.rb,
lib/tabula/extraction.rb,
lib/tabula/pdf_render.rb,
lib/tabula/entities/cell.rb,
lib/tabula/entities/line.rb,
lib/tabula/entities/page.rb,
lib/tabula/table_guesser.rb,
lib/tabula/entities/table.rb,
lib/tabula/entities/ruling.rb,
lib/tabula/table_extractor.rb,
lib/tabula/entities/has_cells.rb,
lib/tabula/entities/page_area.rb,
lib/tabula/entities/text_chunk.rb,
lib/tabula/entities/spreadsheet.rb,
lib/tabula/entities/zone_entity.rb,
lib/tabula/entities/text_element.rb,
lib/tabula/line_segment_detector.rb,
lib/tabula/spreadsheet_extractor.rb,
lib/tabula/entities/text_element_index.rb

Defined Under Namespace

Modules: Extraction, HasCells, LSD, Render, TableGuesser, Writers

Classes: Cell, Line, Page, PageArea, Ruling, Spreadsheet, Table, TextChunk, TextElement, TextElementIndex, ZoneEntity

Constant Summary

# File name of the PDFBox application jar.
PDFBOX = 'pdfbox-app-2.0.0-SNAPSHOT.jar'

# Matches a line made up of one or more whitespace characters and nothing else.
ONLY_SPACES_RE = Regexp.new('^\s+$')

# Version string of the library.
VERSION = '0.7.2'

Class Method Summary

.extract_table(pdf_path, page, area, options = {}) ⇒ Object

Extract a table from the given page and area of the PDF file at pdf_path.
.group_by_lines(text_chunks) ⇒ Object
.make_table(page, area, options = {}) ⇒ Object

Returns an array of Tabula::Line.
.merge_words(text_elements, options = {}) ⇒ Object

Class Method Details

.extract_table(pdf_path, page, area, options = {}) ⇒ `Object`

Extract a table from the given page and area of the PDF file at pdf_path.

page can be a single integer (1-based) or an array of integers; area can be a Tabula::ZoneEntity or an array of [top, left, bottom, right].

Options

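:password - Password if encrypted PDF (default: empty)
:detect_ruling_lines - Try to detect vertical ruling lines (default: true)
:vertical_rulings - List of positions for vertical rulings. Overrides :detect_ruling_lines. (default: [])
:extraction_method - "spreadsheet", "original" or "guess"; any value other than "spreadsheet" or "original" lets the page's is_tabular? check choose the method (default: "guess")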

# File 'lib/tabula/table_extractor.rb', line 27

# Extracts a table from a PDF file.
#
# pdf_path - path of the PDF, passed to Extraction::ObjectExtractor.
# page     - an Integer (wrapped into a one-element array) or a value the
#            extractor accepts directly; only the first page object the
#            extractor yields is used.
# area     - a Tabula::ZoneEntity, or an Array of [top, left, bottom, right]
#            that is converted into one.
# options  - Hash merged over these defaults:
#            :password            => ''
#            :detect_ruling_lines => true
#            :vertical_rulings    => []
#            :extraction_method   => "guess"
#
# Returns the combined spreadsheets of the area when the spreadsheet method
# is selected, otherwise the table built by get_table with nil cells
# replaced and its lines sorted.
def Tabula.extract_table(pdf_path, page, area, options={})
  defaults = {
    :password => '',
    :detect_ruling_lines => true,
    :vertical_rulings => [],
    :extraction_method => "guess",
  }
  options = defaults.merge(options)

  # An array area is given as edges; ZoneEntity wants top, left, width, height.
  if area.instance_of?(Array)
    top, left, bottom, right = area
    area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
  end

  page = [page] if page.is_a?(Integer)

  extractor = Extraction::ObjectExtractor.new(pdf_path, page, options[:password])
  pdf_page = extractor.extract.next

  # "spreadsheet" and "original" force the method; anything else asks the page.
  use_spreadsheet_extraction_method =
    case options[:extraction_method]
    when "spreadsheet" then true
    when "original" then false
    else pdf_page.is_tabular?
    end

  if use_spreadsheet_extraction_method
    return pdf_page.get_area(area).spreadsheets.inject(&:+)
  end

  # Explicit :vertical_rulings win; detection only runs when none were given.
  vertical_rulings = options[:vertical_rulings]
  if options[:detect_ruling_lines] && vertical_rulings.empty?
    detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines, area)

    # only use lines if at least 80% of them cover at least 90%
    # of the height of area of interest

    # TODO this heuristic SUCKS
    # what if only a couple columns is delimited with vertical rulings?
    # ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
    # idea: detect columns without considering rulings, detect vertical rulings
    # calculate ratio and try to come up with a threshold
    use_detected_lines = detected_vertical_rulings.size > 2 &&
      (detected_vertical_rulings.count { |ruling|
         ruling.height / area.height > 0.9
       } / detected_vertical_rulings.size.to_f) >= 0.8

    vertical_rulings = detected_vertical_rulings if use_detected_lines
  end

  table = pdf_page.get_area(area).get_table(:vertical_rulings => vertical_rulings)

  # fixes up the table a little bit, replacing nils with empty TextElements
  # and sorting the lines.
  table.lines.each do |line|
    line.text_elements = line.text_elements.map do |element|
      element || TextElement.new(nil, nil, nil, nil, nil, nil, '', nil)
    end
  end
  # Order lines by the largest `top` among their elements (missing tops count as 0).
  table.lines.sort_by! do |line|
    line.text_elements.map { |element| element.top || 0 }.max
  end
  table
end

.group_by_lines(text_chunks) ⇒ `Object`

# File 'lib/tabula/table_extractor.rb', line 8

# Deprecated shim: prints a deprecation warning, then forwards text_chunks to
# Tabula::TextChunk.group_by_lines and returns whatever that call returns.
def Tabula.group_by_lines(text_chunks)
  warn('Tabula.group_by_lines is DEPRECATED. Use Tabula::TextChunk.group_by_lines instead.')
  Tabula::TextChunk.group_by_lines(text_chunks)
end

.make_table(page, area, options = {}) ⇒ `Object`

Returns an array of Tabula::Line

# File 'lib/tabula/table_extractor.rb', line 14

# Deprecated shim: prints a deprecation warning, crops page to area with
# get_area, and returns the result of make_table(options) on that cropped
# area (documented as an array of Tabula::Line).
def Tabula.make_table(page, area, options={})
  warn('Tabula.make_table is DEPRECATED. Use Tabula::Page#make_table instead.')
  cropped_area = page.get_area(area)
  cropped_area.make_table(options)
end

.merge_words(text_elements, options = {}) ⇒ `Object`

# File 'lib/tabula/table_extractor.rb', line 3

# Deprecated shim: prints a deprecation warning, then forwards text_elements
# and options to Tabula::TextElement.merge_words and returns its result.
def Tabula.merge_words(text_elements, options={})
  warn('Tabula.merge_words is DEPRECATED. Use Tabula::TextElement.merge_words instead')
  Tabula::TextElement.merge_words(text_elements, options)
end