Module: Mindee::PDF::PdfProcessor
- Defined in:
- lib/mindee/pdf/pdf_processor.rb
Overview
PDF document processing
Constant Summary collapse
- DEFAULT_OPTIONS =
Default options for pdf documents processing.
{ page_indexes: [0], operation: :KEEP_ONLY, on_min_pages: 0, }.freeze
Class Method Summary collapse
-
.get_page(pdf_doc, page_id) ⇒ StringIO
Retrieves a PDF document's page.
- .indexes_from_keep(page_indexes, all_pages) ⇒ Object
- .indexes_from_remove(page_indexes, all_pages) ⇒ Object
- .open_pdf(io_stream) ⇒ Origami::PDF
- .parse(io_stream, options) ⇒ StringIO
Class Method Details
.get_page(pdf_doc, page_id) ⇒ StringIO
Retrieves a PDF document's page.
84 85 86 87 88 89 90 91 92 93 |
# File 'lib/mindee/pdf/pdf_processor.rb', line 84
# Retrieves a PDF document's page.
#
# The document is serialised into an in-memory stream and handed to +parse+
# with a single-entry +page_indexes+ list; the remaining options are whatever
# +parse+ fills in from its defaults.
#
# @param pdf_doc [#save] document to extract from; it only needs to respond to
#   +save(io)+ (presumably an Origami::PDF -- confirm against callers).
# @param page_id [Integer] page number; 1 is subtracted to obtain the index
#   passed to +parse+.
# @return [StringIO, nil] whatever +parse+ returns for the requested page.
def self.get_page(pdf_doc, page_id)
  stream = StringIO.new
  pdf_doc.save(stream)
  options = {
    page_indexes: [page_id - 1],
  }
  parse(stream, options)
end
.indexes_from_keep(page_indexes, all_pages) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/mindee/pdf/pdf_processor.rb', line 46
# Computes which pages must be deleted so that only the requested pages remain.
#
# Negative entries in +page_indexes+ count from the end of +all_pages+
# (-1 is the last page, -2 the one before it, and so on). Entries that fall
# outside +all_pages+ are ignored.
#
# @param page_indexes [Array<Integer>] indexes of the pages to keep.
# @param all_pages [Array<Integer>] every page index of the document.
# @return [Set<Integer>] the members of +all_pages+ that are not kept.
def self.indexes_from_keep(page_indexes, all_pages)
  # Array#[] already resolves negative indexes from the end and yields nil for
  # anything out of range, so no manual normalisation is needed.
  pages_to_keep = page_indexes.map { |idx| all_pages[idx] }.compact.to_set
  all_pages.to_set - pages_to_keep
end
.indexes_from_remove(page_indexes, all_pages) ⇒ Object
60 61 62 63 64 65 66 67 68 69 |
# File 'lib/mindee/pdf/pdf_processor.rb', line 60
# Computes which pages must be deleted when the caller lists pages to remove.
#
# Negative entries in +page_indexes+ count from the end of +all_pages+
# (-1 is the last page). Entries that fall outside +all_pages+ are ignored,
# and duplicates collapse because the result is a set.
#
# @param page_indexes [Array<Integer>] indexes of the pages to remove.
# @param all_pages [Array<Integer>] every page index of the document.
# @return [Set<Integer>] the members of +all_pages+ selected for removal.
def self.indexes_from_remove(page_indexes, all_pages)
  # Return the filtered set itself; the value of an +each+ call would be the
  # unfiltered input list.
  page_indexes.map { |idx| all_pages[idx] }.compact.to_set
end
.open_pdf(io_stream) ⇒ Origami::PDF
73 74 75 76 77 |
# File 'lib/mindee/pdf/pdf_processor.rb', line 73
# Parses an IO stream into a PDF object using Origami's linear parser with
# quiet verbosity.
#
# @param io_stream [#seek] stream holding the PDF data; it is repositioned to
#   offset 0 before being parsed.
# @return [Origami::PDF] the result of the parser's +parse+ call.
def self.open_pdf(io_stream)
  quiet = { verbosity: Origami::Parser::VERBOSE_QUIET }
  linear_parser = Origami::PDF::LinearParser.new(quiet)
  io_stream.seek(0)
  linear_parser.parse(io_stream)
end
.parse(io_stream, options) ⇒ StringIO
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/mindee/pdf/pdf_processor.rb', line 22
# Opens a PDF from a stream, deletes pages according to +options+ and returns
# the resulting document as a stream.
#
# @param io_stream [#seek] stream holding the source PDF; passed to +open_pdf+.
# @param options [Hash] overrides merged on top of DEFAULT_OPTIONS:
#   * +:page_indexes+ [Array<Integer>] pages the operation applies to.
#   * +:operation+ [Symbol] +:KEEP_ONLY+ or +:REMOVE+.
#   * +:on_min_pages+ [Integer] the document is left unprocessed when it has
#     fewer pages than this.
# @return [StringIO, nil] the processed document, or +nil+ when the document
#   has fewer pages than +:on_min_pages+.
# @raise [RuntimeError] when +:operation+ is neither +:KEEP_ONLY+ nor +:REMOVE+.
def self.parse(io_stream, options)
  options = DEFAULT_OPTIONS.merge(options)

  current_pdf = open_pdf(io_stream)
  pages_count = current_pdf.pages.size
  return if options[:on_min_pages] > pages_count

  all_pages = (0..pages_count - 1).to_a

  case options[:operation]
  when :KEEP_ONLY
    pages_to_remove = indexes_from_keep(options[:page_indexes], all_pages)
  when :REMOVE
    pages_to_remove = indexes_from_remove(options[:page_indexes], all_pages)
  else
    raise "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{options[:operation]}'"
  end

  # Skip the deletion when it would remove every page of the document.
  current_pdf.delete_pages_at(pages_to_remove) if pages_to_remove.to_a != all_pages.to_a
  current_pdf.to_io_stream
end