Class: Mindee::PDF::PDFExtractor::PDFExtractor

Inherits:

Object

Object
Mindee::PDF::PDFExtractor::PDFExtractor

show all

Defined in: lib/mindee/pdf/pdf_extractor.rb

Overview

Pdf extraction class.

Instance Method Summary collapse

#cut_pages(page_indexes) ⇒ StringIO
Creates a new Pdf from the given pages and saves it into a buffer.
#extract_invoices(page_indexes, strict: false) ⇒ Array<Mindee::PDF::PDFExtractor::ExtractedPDF>
Extracts invoices as complete PDFs from the document.
#extract_sub_documents(page_indexes) ⇒ Array<Mindee::PDF::PDFExtractor::ExtractedPDF>
Extract the sub-documents from the main pdf, based on the given list of page indexes.
#initialize(local_input) ⇒ PDFExtractor constructor
A new instance of PDFExtractor.
#page_count ⇒ Integer
Retrieves the page count for the Pdf object.

Constructor Details

#initialize(local_input) ⇒ `PDFExtractor`

Returns a new instance of PDFExtractor.

Parameters:

local_input (Mindee::Input::Source::LocalInputSource)

# File 'lib/mindee/pdf/pdf_extractor.rb', line 11

def initialize(local_input)
  @filename = local_input.filename
  if local_input.pdf?
    @source_pdf = local_input.io_stream
  else
    pdf_image = Image::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
    io_buffer = StringIO.new
    pdf_image.save(io_buffer)

    @source_pdf = io_buffer
  end
end

Instance Method Details

#cut_pages(page_indexes) ⇒ `StringIO`

Creates a new Pdf from the given pages and saves it into a buffer.

Parameters:

page_indexes (Array<Integer>) —
List of page indexes to use for merging in the original Pdf.

Returns:

(StringIO) —
The buffer containing the new Pdf.

# File 'lib/mindee/pdf/pdf_extractor.rb', line 33

# Builds page options selecting the requested pages and hands them, together
# with the source PDF, to Mindee::PDF::PDFProcessor.parse.
#
# @param page_indexes [Array<Integer>] Page indexes to keep from the source PDF.
# @return [StringIO] Whatever PDFProcessor.parse returns (documented as the
#   buffer containing the new PDF).
def cut_pages(page_indexes)
  extraction_params = { page_indexes: page_indexes }
  page_options = PageOptions.new(params: extraction_params)

  Mindee::PDF::PDFProcessor.parse(@source_pdf, page_options)
end

#extract_invoices(page_indexes, strict: false) ⇒ `Array<Mindee::PDF::PDFExtractor::ExtractedPDF>`

Extracts invoices as complete PDFs from the document.

Parameters:

page_indexes (Array<Array<Integer>, InvoiceSplitterV1InvoicePageGroup>)
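strict (bool) (defaults to: false) —
Only applies to invoice page groups. When false, each group is extracted as-is; when true, the groups are re-assembled according to their confidence (0.5 threshold) before extraction.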

Returns:

(Array<Mindee::PDF::PDFExtractor::ExtractedPDF>)

Raises:

(Errors::MindeePDFError)

# File 'lib/mindee/pdf/pdf_extractor.rb', line 76

# Extracts invoices as complete PDFs from the document.
#
# Two input shapes are accepted:
# * plain lists of page indexes, which are extracted as given;
# * invoice page groups (objects responding to +page_indexes+ and
#   +confidence+). Without +strict+ every group is extracted as-is. With
#   +strict+, groups are re-assembled using a 0.5 confidence threshold: a
#   low-confidence group in last position is appended to the pending group,
#   any other group is emitted on its own.
#
# In strict mode the caller's groups are never mutated, no group is emitted
# twice, no empty group is emitted, and a pending group is always flushed.
#
# @param page_indexes [Array<Array<Integer>>, Array<InvoiceSplitterV1InvoicePageGroup>]
# @param strict [bool] Whether to re-assemble page groups based on confidence.
# @return [Array<Mindee::PDF::PDFExtractor::ExtractedPDF>]
# @raise [Errors::MindeePDFError] if no indexes are provided.
def extract_invoices(page_indexes, strict: false)
  raise Errors::MindeePDFError, 'No indexes provided.' if page_indexes.nil? || page_indexes.empty?

  if page_indexes[0].is_a?(Array) && page_indexes[0].all? { |i| i.is_a?(Integer) }
    page_indexes_as_array = page_indexes # @type var page_indexes : Array[Array[Integer]]
    return extract_sub_documents(page_indexes_as_array)
  end
  p_ids = page_indexes # @type var page_indexes: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroups
  return extract_sub_documents(p_ids.map(&:page_indexes)) unless strict

  correct_page_indexes = []
  current_list = [] # pending group; always an array owned by this method
  last_position = p_ids.length - 1
  p_ids.each_with_index do |p_i, i|
    page_index = p_i # @type var page_index: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroup
    confident = page_index.confidence.to_f >= 0.5
    page_list = page_index.page_indexes

    if confident && i.zero?
      # Copy so a later merge never writes into the caller's group.
      current_list = page_list.dup
    elsif confident && i < last_position
      correct_page_indexes << current_list unless current_list.empty?
      current_list = page_list.dup
    elsif !confident && i == last_position
      # Trailing low-confidence pages are appended to the pending group.
      correct_page_indexes << current_list.concat(page_list)
      current_list = []
    else
      correct_page_indexes << current_list unless current_list.empty?
      correct_page_indexes << page_list
      # The pending group has been emitted; reset it so it is not emitted again.
      current_list = []
    end
  end
  # A single confident group never reaches a flushing branch above.
  correct_page_indexes << current_list unless current_list.empty?
  extract_sub_documents(correct_page_indexes)
end

#extract_sub_documents(page_indexes) ⇒ `Array<Mindee::PDF::PDFExtractor::ExtractedPDF>`

Extract the sub-documents from the main pdf, based on the given list of page indexes.

Parameters:

page_indexes (Array<Array<Integer>>) —
List of lists of page indexes; each inner list becomes one extracted sub-document.

Returns:

(Array<Mindee::PDF::PDFExtractor::ExtractedPDF>) —
The extracted sub-documents, one per list of page indexes.

# File 'lib/mindee/pdf/pdf_extractor.rb', line 44

# Extracts one sub-document per list of page indexes.
#
# Page indexes are zero-based, so valid values are 0...page_count. Each
# extracted file is named "<basename>_<first>-<last><extension>", where first
# and last are the one-based, zero-padded numbers of the first and last entries
# of the list.
#
# @param page_indexes [Array<Array<Integer>>] Lists of page indexes, one list per sub-document.
# @return [Array<Mindee::PDF::PDFExtractor::ExtractedPDF>] One extracted PDF per list.
# @raise [Errors::MindeePDFError] if a list is nil or empty, or if an index is out of range.
def extract_sub_documents(page_indexes)
  extension = File.extname(@filename)
  basename = File.basename(@filename, extension)
  total_pages = nil # resolved on first use so page_count runs at most once
  page_indexes.map do |page_index_list|
    if page_index_list.nil? || page_index_list.empty?
      raise Errors::MindeePDFError, "Empty indexes aren't allowed for extraction #{page_index_list}"
    end

    total_pages ||= page_count
    # Zero-based indexes: an index equal to the page count is already past the end.
    invalid_index = page_index_list.find { |page_index| page_index.negative? || page_index >= total_pages }
    raise Errors::MindeePDFError, "Index #{invalid_index} is out of range." if invalid_index

    first_page = format('%03d', page_index_list.first + 1)
    last_page = format('%03d', page_index_list.last + 1)
    field_filename = "#{basename}_#{first_page}-#{last_page}#{extension}"
    Mindee::PDF::PDFExtractor::ExtractedPDF.new(cut_pages(page_index_list), field_filename)
  end
end

#page_count ⇒ `Integer`

Retrieves the page count for the Pdf object.

Returns:

(Integer)



26
27
28

# File 'lib/mindee/pdf/pdf_extractor.rb', line 26

# Opens the source PDF through Mindee::PDF::PDFProcessor.open_pdf and counts
# its pages. The document is re-opened on every call.
#
# @return [Integer] Number of pages in the source PDF.
def page_count
  opened_pdf = Mindee::PDF::PDFProcessor.open_pdf(@source_pdf)
  opened_pdf.pages.size
end

Class: Mindee::PDF::PDFExtractor::PDFExtractor

Overview

Instance Method Summary collapse

Constructor Details

#initialize(local_input) ⇒ PDFExtractor

Instance Method Details

#cut_pages(page_indexes) ⇒ StringIO

#extract_invoices(page_indexes, strict: false) ⇒ Array<Mindee::PDF::PDFExtractor::ExtractedPDF>

#extract_sub_documents(page_indexes) ⇒ Array<Mindee::PDF::PDFExtractor::ExtractedPDF>

#page_count ⇒ Integer

#initialize(local_input) ⇒ `PDFExtractor`

#cut_pages(page_indexes) ⇒ `StringIO`

#extract_invoices(page_indexes, strict: false) ⇒ `Array<Mindee::PDF::PDFExtractor::ExtractedPDF>`

#extract_sub_documents(page_indexes) ⇒ `Array<Mindee::PDF::PDFExtractor::ExtractedPDF>`

#page_count ⇒ `Integer`