Module: Tabula
- Defined in:
- lib/tabula.rb,
lib/tabula/version.rb,
lib/tabula/writers.rb,
lib/tabula/extraction.rb,
lib/tabula/entities/cell.rb,
lib/tabula/entities/line.rb,
lib/tabula/entities/page.rb,
lib/tabula/table_guesser.rb,
lib/tabula/entities/table.rb,
lib/tabula/entities/ruling.rb,
lib/tabula/table_extractor.rb,
lib/tabula/entities/tabular.rb,
lib/tabula/entities/has_cells.rb,
lib/tabula/entities/page_area.rb,
lib/tabula/entities/text_chunk.rb,
lib/tabula/entities/spreadsheet.rb,
lib/tabula/entities/zone_entity.rb,
lib/tabula/entities/text_element.rb,
lib/tabula/spreadsheet_extractor.rb,
lib/tabula/entities/text_element_index.rb
Defined Under Namespace
Modules: AbstractInterface, Extraction, HasCells, TableGuesser, Tabular, Writers
Classes: Cell, Line, Page, PageArea, Ruling, Spreadsheet, Table, TextChunk, TextElement, TextElementIndex, ZoneEntity
Constant Summary collapse
- PDFBOX =
'pdfbox-app-2.0.0-SNAPSHOT.jar'
- ONLY_SPACES_RE =
Regexp.new('^\s+$')
- SAME_CHAR_RE =
Regexp.new('^(.)\1+$')
- VERSION =
'0.8.0'
Class Method Summary collapse
-
.extract_table(pdf_path, page, area, options = {}) ⇒ Object
Extract a table from file `pdf_path`, on the given `page` and within `area`.
- .group_by_lines(text_chunks) ⇒ Object
-
.make_table(page, area, options = {}) ⇒ Object
Returns an array of Tabula::Line.
- .merge_words(text_elements, options = {}) ⇒ Object
Class Method Details
.extract_table(pdf_path, page, area, options = {}) ⇒ Object
Extract a table from file `pdf_path`, on the given `page` and within `area`.
`page` can be a single integer (1-based) or an array of integers.
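Options
- :password - Password if encrypted PDF (default: empty)
- :detect_ruling_lines - Try to detect vertical ruling lines (default: true)
- :vertical_rulings - List of positions for vertical rulings. Overrides :detect_ruling_lines. (default: [])
- :extraction_method - "spreadsheet" or "original" to force that extraction method; any other value lets the page decide via `is_tabular?` (default: "guess")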
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/tabula/table_extractor.rb', line 27

# Extract a table from the file +pdf_path+, on the given +page+ and within +area+.
#
# pdf_path - passed unchanged to Extraction::ObjectExtractor.new.
# page     - a single Integer (wrapped into a one-element Array) or any other
#            value, which is handed to the extractor as-is. Only the first
#            page yielded by the extractor is processed.
# area     - either an Array of [top, left, bottom, right], which is converted
#            to a Tabula::ZoneEntity (top, left, width, height), or an object
#            that is used as the area directly.
# options  - Hash merged over these defaults:
#            :password            - '' (handed to the extractor)
#            :detect_ruling_lines - true
#            :vertical_rulings    - []; when non-empty, detection is skipped
#                                   and these rulings are used
#            :extraction_method   - "guess"; "spreadsheet" forces the
#                                   spreadsheet path, "original" forces the
#                                   ruling/text path, anything else defers to
#                                   pdf_page.is_tabular?
#
# Returns, on the spreadsheet path, the spreadsheets found in the area combined
# with +, or Spreadsheet.empty(pdf_page) when none are found; otherwise the
# result of get_table on the selected page area.
def Tabula.extract_table(pdf_path, page, area, options = {})
  options = {
    :password => '',
    :detect_ruling_lines => true,
    :vertical_rulings => [],
    :extraction_method => "guess",
  }.merge(options)

  if area.instance_of?(Array)
    top, left, bottom, right = area
    area = Tabula::ZoneEntity.new(top, left, right - left, bottom - top)
  end

  if page.is_a?(Integer)
    page = [page]
  end

  extractor = Extraction::ObjectExtractor.new(pdf_path, page, options[:password])
  pdf_page = extractor.extract.next
  extractor.close!

  if ["spreadsheet", "original"].include? options[:extraction_method]
    use_spreadsheet_extraction_method = options[:extraction_method] == "spreadsheet"
  else
    use_spreadsheet_extraction_method = pdf_page.is_tabular?
  end

  if use_spreadsheet_extraction_method
    return (spreadsheets = pdf_page.get_area(area).spreadsheets).empty? ? Spreadsheet.empty(pdf_page) : spreadsheets.inject(&:+)
  end

  use_detected_lines = false
  if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
    detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines, area)

    # only use lines if at least 80% of them cover at least 90%
    # of the height of area of interest

    # TODO this heuristic SUCKS
    # what if only a couple columns is delimited with vertical rulings?
    # ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
    # idea: detect columns without considering rulings, detect vertical rulings
    # calculate ratio and try to come up with a threshold
    use_detected_lines = detected_vertical_rulings.size > 2 \
      && (detected_vertical_rulings.count { |vl| vl.height / area.height > 0.9 } / detected_vertical_rulings.size.to_f) >= 0.8
  end

  pdf_page
    .get_area(area)
    .get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])
end
.group_by_lines(text_chunks) ⇒ Object
8 9 10 11 |
# File 'lib/tabula/table_extractor.rb', line 8

# Deprecated: emits a deprecation warning and forwards to
# TextChunk.group_by_lines, returning whatever that call returns.
#
# text_chunks - passed through unchanged to TextChunk.group_by_lines.
def Tabula.group_by_lines(text_chunks)
  warn 'Tabula.group_by_lines is DEPRECATED. Use Tabula::TextChunk.group_by_lines instead.'
  TextChunk.group_by_lines(text_chunks)
end
.make_table(page, area, options = {}) ⇒ Object
Returns an array of Tabula::Line
14 15 16 17 |
# File 'lib/tabula/table_extractor.rb', line 14

# Deprecated: emits a deprecation warning, then builds the table for +area+
# of +page+ via page.get_area(area).make_table(options).
#
# page    - object responding to get_area.
# area    - passed unchanged to page.get_area.
# options - Hash forwarded unchanged to make_table (default: {}).
#
# Returns the result of make_table (documented on this page as an array of
# Tabula::Line).
def Tabula.make_table(page, area, options = {})
  warn 'Tabula.make_table is DEPRECATED. Use Tabula::Page#make_table instead.'
  page.get_area(area).make_table(options)
end
.merge_words(text_elements, options = {}) ⇒ Object
3 4 5 6 |
# File 'lib/tabula/table_extractor.rb', line 3

# Deprecated: emits a deprecation warning and forwards to
# TextElement.merge_words, returning whatever that call returns.
#
# text_elements - passed through unchanged to TextElement.merge_words.
# options       - Hash forwarded unchanged as the second argument (default: {}).
def Tabula.merge_words(text_elements, options = {})
  warn 'Tabula.merge_words is DEPRECATED. Use Tabula::TextElement.merge_words instead'
  TextElement.merge_words(text_elements, options)
end