Module: Mindee::PDF::PdfProcessor

Defined in:
lib/mindee/pdf/pdf_processor.rb

Overview

PDF document processing

Constant Summary collapse

DEFAULT_OPTIONS =

Default options for PDF document processing.

{
  # Indexes of the pages the operation applies to (looked up in the 0..pages_count-1 list built by parse).
  page_indexes: [0],
  # Either :KEEP_ONLY or :REMOVE; parse raises for any other value.
  operation: :KEEP_ONLY,
  # parse returns without processing when this is greater than the document's page count.
  on_min_pages: 0,
}.freeze

Class Method Summary collapse

Class Method Details

.get_page(pdf_doc, page_id) ⇒ StringIO

Retrieves a PDF document's page.

Parameters:

  • pdf_doc (Origami::PDF)

    Origami PDF handle.

  • page_id (Integer)

    Page ID.

Returns:

  • (StringIO)


84
85
86
87
88
89
90
91
92
93
# File 'lib/mindee/pdf/pdf_processor.rb', line 84

# Extracts a single page from an already opened PDF document.
#
# The document is serialised into an in-memory buffer, which is then handed
# to parse with only the requested page selected.
#
# @param pdf_doc [Origami::PDF] Origami PDF handle.
# @param page_id [Integer] Page ID; it is converted to an index by subtracting one.
# @return [StringIO] whatever parse returns for the selected page.
def self.get_page(pdf_doc, page_id)
  buffer = StringIO.new.tap { |io| pdf_doc.save(io) }
  wanted_index = page_id - 1

  parse(buffer, { page_indexes: [wanted_index] })
end

.indexes_from_keep(page_indexes, all_pages) ⇒ Object

Parameters:

  • page_indexes (Array)
  • all_pages (Array)


46
47
48
49
50
51
52
53
54
55
56
# File 'lib/mindee/pdf/pdf_processor.rb', line 46

# Computes which pages must be deleted so that only the requested pages remain.
#
# Negative indexes count from the end of the document (-1 is the last page,
# -2 the one before it, and so on). Indexes that fall outside the document
# are ignored.
#
# @param page_indexes [Array<Integer>] Indexes of the pages to keep.
# @param all_pages [Array<Integer>] Every page index of the document.
# @return [Set<Integer>] Page indexes to delete (all pages minus the kept ones).
def self.indexes_from_keep(page_indexes, all_pages)
  # Array#[] already resolves negative indexes from the end and yields nil
  # when out of range, so no manual offset arithmetic is needed.
  pages_to_keep = page_indexes.map { |idx| all_pages[idx] }.compact.to_set
  all_pages.to_set - pages_to_keep
end

.indexes_from_remove(page_indexes, all_pages) ⇒ Object

Parameters:

  • page_indexes (Array)
  • all_pages (Array)


60
61
62
63
64
65
66
67
68
69
# File 'lib/mindee/pdf/pdf_processor.rb', line 60

# Resolves the requested page indexes into the set of pages to delete.
#
# Negative indexes count from the end of the document (-1 is the last page).
# Indexes that fall outside the document are ignored, and duplicates collapse
# because the result is a Set.
#
# @param page_indexes [Array<Integer>] Indexes of the pages to remove.
# @param all_pages [Array<Integer>] Every page index of the document.
# @return [Set<Integer>] Resolved page indexes to delete.
def self.indexes_from_remove(page_indexes, all_pages)
  # Return the resolved set itself; ending the method on an `each` block would
  # hand back the raw, unresolved page_indexes argument instead.
  page_indexes.map { |idx| all_pages[idx] }.compact.to_set
end

.open_pdf(io_stream) ⇒ Origami::PDF

Parameters:

  • io_stream (StringIO)

Returns:

  • (Origami::PDF)


73
74
75
76
77
# File 'lib/mindee/pdf/pdf_processor.rb', line 73

# Parses a stream into an Origami PDF, reading from the start of the stream.
#
# @param io_stream [StringIO] Stream holding the PDF; its position is reset to 0 before parsing.
# @return [Origami::PDF] the result of the linear parser.
def self.open_pdf(io_stream)
  # Quiet verbosity level for the parser.
  parser_options = { verbosity: Origami::Parser::VERBOSE_QUIET }
  linear_parser = Origami::PDF::LinearParser.new(parser_options)

  io_stream.seek(0)
  linear_parser.parse(io_stream)
end

.parse(io_stream, options) ⇒ StringIO

Parameters:

  • io_stream (StringIO)
  • options (Hash)

Returns:

  • (StringIO)


22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/mindee/pdf/pdf_processor.rb', line 22

# Opens a PDF from a stream and keeps or removes the requested pages.
#
# @param io_stream [StringIO] Stream holding the PDF document.
# @param options [Hash] Overrides for DEFAULT_OPTIONS:
#   :page_indexes (Array<Integer>), :operation (:KEEP_ONLY or :REMOVE)
#   and :on_min_pages (Integer).
# @return [StringIO, nil] The processed document, or nil when the document has
#   fewer pages than :on_min_pages.
# @raise [RuntimeError] When :operation is neither :KEEP_ONLY nor :REMOVE.
def self.parse(io_stream, options)
  options = DEFAULT_OPTIONS.merge(options)

  current_pdf = open_pdf(io_stream)
  pages_count = current_pdf.pages.size
  # NOTE(review): this yields nil rather than a stream for short documents;
  # confirm callers expect that before changing it.
  return if options[:on_min_pages] > pages_count

  all_pages = (0..pages_count - 1).to_a

  case options[:operation]
  when :KEEP_ONLY
    pages_to_remove = indexes_from_keep(options[:page_indexes], all_pages)
  when :REMOVE
    pages_to_remove = indexes_from_remove(options[:page_indexes], all_pages)
  else
    # Interpolate the offending option itself so the intended message is raised.
    raise "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{options[:operation]}'"
  end

  # Deletion is skipped when the removal list equals the full page list.
  current_pdf.delete_pages_at(pages_to_remove) if pages_to_remove.to_a != all_pages.to_a
  current_pdf.to_io_stream
end