Module: ActAsPageExtractor

Extended by:: ActiveSupport::Concern

Defined in:: lib/act_as_page_extractor/version.rb,
lib/act_as_page_extractor.rb,
lib/act_as_page_extractor/modules/tools.rb,
lib/act_as_page_extractor/modules/saving.rb,
lib/act_as_page_extractor/modules/interface.rb,
lib/act_as_page_extractor/modules/unzipping.rb,
lib/act_as_page_extractor/modules/extracting.rb,
lib/act_as_page_extractor/modules/validating.rb,
lib/generators/act_as_page_extractor/migration_generator.rb

Overview

:nocov:

Defined Under Namespace

Constant Summary collapse

# Extraction lifecycle states; every key is the symbol form of its own value.
EXTRACTING_STATES = %w[new extracting extracted error.extraction]
                    .map { |state| [state.to_sym, state] }
                    .to_h
                    .freeze

# Working directory for temporary extraction files, relative to the process cwd.
TMP_EXTRACTION_FILE_STORAGE = "#{Dir.pwd}/tmp/page_extraction".freeze

# Root directory for stored files, relative to the process cwd.
FILE_STORAGE = "#{Dir.pwd}/public".freeze

# Directory under FILE_STORAGE that receives converted PDF files.
PDF_STORAGE = "#{FILE_STORAGE}/uploads/extracted/pdf".freeze

# Gem version string.
VERSION = '0.6.4'

# File extensions treated as archives.
VALIDATE_COMPRESS_TYPES = %w[zip rar 7z gzip].freeze

# File extensions accepted as documents.
VALIDATE_DOC_TYPES = %w[txt pdf doc docx rtf odt htm html].freeze

Class Method Summary collapse

Instance Method Summary collapse

#cleanup_pages ⇒ Object
#convert_to_pdf ⇒ Object
#convert_to_text ⇒ Object
#debug_info ⇒ Object

:nocov:.
#extract_pages ⇒ Object
#initialized ⇒ Object
#is_extracted ⇒ Object

:nocov:.
#origin_file_name ⇒ Object
#page_extract! ⇒ Object
#pdf_path ⇒ Object
#remove_files ⇒ Object
#remove_last_byte(file_name) ⇒ Object

Fix for OpenOffice/JODConverter: removes the trailing form-feed byte from a converted text page.
#save_pdf ⇒ Object
#save_to_db ⇒ Object
#timeout_wrapper ⇒ Object

:nocov:.
#unzip_document ⇒ Object
#update_state ⇒ Object
#valid_document ⇒ Object
#validate_compress_types ⇒ Object
#validate_doc_types ⇒ Object
#validate_size ⇒ Object

Class Method Details

.start_extraction ⇒ `Object`



16
17
18

# File 'lib/act_as_page_extractor/modules/interface.rb', line 16

# Calls page_extract! on every document whose page_extraction_state is
# still EXTRACTING_STATES[:new].
def self.start_extraction
  pending_documents = document_class.where(page_extraction_state: EXTRACTING_STATES[:new])
  pending_documents.each { |document| document.page_extract! }
end

.statistics ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/interface.rb', line 20

# Builds a hash of document counts: the overall total, how many documents have
# a page_extraction_doctype matching one of VALIDATE_DOC_TYPES (case-insensitive
# substring match via ILIKE), the remainder, and a per-state breakdown keyed
# like EXTRACTING_STATES.
def self.statistics
  total = document_class.count
  like_patterns = VALIDATE_DOC_TYPES.map { |doc_type| "'%#{doc_type}%'" }.join(',')
  supported = document_class.where("page_extraction_doctype ILIKE ANY (array[#{like_patterns}])").count
  # One count query per state, issued in EXTRACTING_STATES order.
  per_state = EXTRACTING_STATES.map do |state, value|
    [state, document_class.where(page_extraction_state: value).count]
  end

  {
    total: total,
    supported_documents: supported,
    unsupported_documents: total - supported,
    states: per_state.to_h
  }
end

Instance Method Details

#cleanup_pages ⇒ `Object`



38
39
40

# File 'lib/act_as_page_extractor/modules/tools.rb', line 38

# Calls destroy_all on this object's extracted_pages collection.
def cleanup_pages
  pages = self.extracted_pages
  pages.destroy_all
end

#convert_to_pdf ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/extracting.rb', line 14

# Determines the PDF to extract text from and stores it in @pdf_path.
#
# A document whose extension is already .pdf (any letter case) is used as-is.
# Anything else is passed to Docsplit.extract_pdf (inside timeout_wrapper, with
# output: @tmp_dir); on success the converted file is looked up at
# @document_path with its extension replaced by .pdf.
#
# The extension comes from File.extname, so dots in directory names and files
# without an extension no longer corrupt the computed PDF path.
#
# Returns the value assigned to @pdf_path: a path, or nil when conversion
# failed or the converted file does not exist.
def convert_to_pdf
  extension = File.extname(@document_path)
  @pdf_path =
    if extension.downcase == '.pdf'
      @document_path
    elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
      # Strip only the trailing extension, then append .pdf.
      stem = @document_path[0, @document_path.length - extension.length]
      converted_path = "#{stem}.pdf"
      converted_path if File.exist?(converted_path)
    end
end

#convert_to_text ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/extracting.rb', line 25

# Reads the page count of @pdf_path into @pdf_pages and, when a count is
# available, asks Docsplit to write the text of all pages into @tmp_dir.
#
# @pdf_pages is reset to nil when text extraction reports failure. Any
# StandardError raised along the way is swallowed; the method always returns nil.
def convert_to_text
  @pdf_pages = PdfUtils.info(@pdf_path).pages
  return unless @pdf_pages

  extracted = timeout_wrapper do
    Docsplit.extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
  end
  # A falsy result means the extraction failed or timed out.
  @pdf_pages = nil unless extracted
  nil
# :nocov:
rescue
  nil
# :nocov:
end

#debug_info ⇒ `Object`

:nocov:

# File 'lib/act_as_page_extractor/modules/tools.rb', line 43

# Debugging hook; currently a no-op because every statement is commented out.
# Uncommenting the lines below would print the intermediate paths and page
# count (`ap` is presumably awesome_print — confirm before enabling).
def debug_info
  # ap "@tmp_dir"
  # ap @tmp_dir
  # ap "@copy_document_path"
  # ap @copy_document_path
  # ap "@document_path"
    # ap @document_path
  # ap "@pdf_path"
  # ap @pdf_path
  # ap "@pdf_pages"
    # ap @pdf_pages
end

#extract_pages ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/extracting.rb', line 9

# Runs the two conversion steps in order: convert_to_pdf, then convert_to_text.
def extract_pages
  convert_to_pdf
  convert_to_text
end

#initialized ⇒ `Object`

# File 'lib/act_as_page_extractor.rb', line 54

# Preparation step that runs first in page_extract!; it only calls create_pdf_dir
# (defined elsewhere — presumably creates the PDF storage directory).
def initialized
  # TODO: add all needed callbacks
    # TODO: on destroy, remove the pdf

  # TODO: add to the README:
  #   rails g act_as_page_extractor:migration Document category_id user_id
  # and add to the [Document] model:
  #   has_many :extracted_pages, dependent: :destroy
  create_pdf_dir
end

#is_extracted ⇒ `Object`

:nocov:



16
17
18

# File 'lib/act_as_page_extractor/modules/tools.rb', line 16

# Tells whether extraction is complete: @pdf_pages must be a positive count
# and the number of stored extracted_pages must equal it.
def is_extracted
  expected_pages = @pdf_pages
  return false unless expected_pages.to_i > 0

  self.extracted_pages.count == expected_pages
end

#origin_file_name ⇒ `Object`



2
3
4

# File 'lib/act_as_page_extractor/modules/interface.rb', line 2

# Returns the last "/"-separated segment of extracted_filename's URL.
def origin_file_name
  file_url = self.send(:extracted_filename).url.to_s
  file_url.split('/').last
end

#page_extract! ⇒ `Object`

# File 'lib/act_as_page_extractor.rb', line 65

# Runs the whole extraction pipeline for this record.
#
# Setup (initialized, cleanup_pages, create_tmp_dir) runs first. The document
# is then copied and, if applicable, unpacked; pages are extracted and saved
# only when valid_document is truthy.
def page_extract!
  initialized
  cleanup_pages
  create_tmp_dir
  begin
    copy_document
    unzip_document
    if valid_document
      extract_pages
      save_to_db
    end
  ensure
    # Runs whether or not the steps above raised: record the final state,
    # store the PDF copy, then call finish (defined elsewhere — presumably
    # removes the temp dir; confirm).
    update_state
    save_pdf
    debug_info
    finish
  end
end

#pdf_path ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/interface.rb', line 6

# Returns the path of the stored PDF copy, or nil when the record is not in the
# extracted state or its doctype is already pdf.
#
# NOTE(review): the stem is the text before the FIRST dot of origin_file_name;
# confirm this matches the stored file name for names with several dots.
def pdf_path
  return unless page_extraction_state == EXTRACTING_STATES[:extracted]
  return if page_extraction_doctype&.downcase == 'pdf'

  stem = origin_file_name.split('.').first
  "#{pdf_storage}/#{stem}.pdf"
end

#remove_files ⇒ `Object`



12
13
14

# File 'lib/act_as_page_extractor/modules/interface.rb', line 12

# Deletes the file at pdf_path if it exists; does nothing otherwise
# (including when pdf_path is nil).
def remove_files
  return unless File.exist?(pdf_path.to_s)

  FileUtils.rm_rf(pdf_path)
end

#remove_last_byte(file_name) ⇒ `Object`

Fix for OpenOffice/JODConverter: removes the trailing form-feed byte from a converted text page.

# File 'lib/act_as_page_extractor/modules/saving.rb', line 38

# Fix for OpenOffice/JODConverter output: drops the final byte of the file
# when that byte is a form feed ("\f").
#
# The file is opened in 'a+' mode, so a missing file is created empty, as
# before. The block form of File.open guarantees the handle is closed even if
# seeking, reading or truncating raises.
#
# @param file_name [String] path of the text page to clean up
# @return [nil]
def remove_last_byte(file_name)
  File.open(file_name, 'a+') do |file|
    size = file.size
    next if size.zero?

    file.seek(size - 1)
    file.truncate(size - 1) if file.getc == "\f"
  end
  nil
end

#save_pdf ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/saving.rb', line 2

# Copies the converted PDF (@pdf_path) into pdf_storage.
#
# The copy happens only when save_as_pdf is enabled, extraction completed
# (is_extracted), the source document is not itself a pdf, and @pdf_path is set.
def save_pdf
  return unless save_as_pdf && is_extracted
  return if @document_path.split('.').last&.downcase == 'pdf'
  return unless @pdf_path

  FileUtils.cp(@pdf_path, pdf_storage)
end

#save_to_db ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/saving.rb', line 13

# Marks the record as extracting, then creates one ExtractedPage per PDF page
# inside a single ExtractedPage transaction.
#
# For each page the matching "<stem>_<n>.txt" file in @tmp_dir is cleaned with
# remove_last_byte, read, and stripped of the characters < > & U+0001 U+25A0
# and BEL. Nothing is created when @pdf_pages is nil.
def save_to_db
  self.update(page_extraction_state: EXTRACTING_STATES[:extracting])
  ExtractedPage.transaction do
    @pdf_pages&.times&.each do |index|
      number = index + 1
      text_file = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{number}.txt"
      remove_last_byte(text_file)

      attributes = {
        page:        IO.read(text_file).delete("<" ">" "&" "\u0001" "\u25A0" "\a"),
        page_number: number,
        # Foreign key column name comes from the extracted_document_id setting.
        extracted_document_id => self.id
      }
      # Copy the configured extra columns from the document onto the page.
      additional_fields.each { |field| attributes[field] = self.send(field.to_sym) }

      ExtractedPage.create(attributes)
    end
  end
end

#timeout_wrapper ⇒ `Object`

:nocov:

# File 'lib/act_as_page_extractor/modules/tools.rb', line 5

# Runs the given block under a five-minute Timeout and returns its value.
# Returns nil instead when the block raises any StandardError, which includes
# the timeout expiring.
def timeout_wrapper
  Timeout.timeout(5 * 60) { yield }
rescue StandardError
  nil
end

#unzip_document ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/unzipping.rb', line 2

# Sets @document_path to the copied document and, when that copy is an archive
# holding exactly one file, unpacks it and points @document_path at the result.
#
# The unpacked file is renamed to "<origin stem>.<unpacked extension>" inside
# @tmp_dir. @document_path stays on the copy when decompression fails or the
# archive holds any other number of files.
def unzip_document
  @document_path = @copy_document_path
  return unless validate_compress_types

  archive = TotalCompressor.decompress(@copy_document_path)
  return unless archive[:success] && archive[:files].length == 1

  stem = @origin_document_path.split("/").last.split('.').first
  unpacked_file = archive[:files].first
  extension = unpacked_file.split('/').last.split('.').last
  @document_path = "#{@tmp_dir}/#{stem}.#{extension}"
  File.rename(unpacked_file, @document_path)
end

#update_state ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/tools.rb', line 20

# Persists the outcome of the extraction run on the record.
#
# Writes the extracted state and page count when is_extracted is true, the
# error state with zero pages otherwise, plus the document's extension and a
# human-readable file size.
#
# This runs from the ensure block of page_extract!, so it must not raise when
# an earlier step failed before @document_path was set or the file was created.
# The file size is stored as nil in that case, matching the nil-safe doctype.
def update_state
  state_attributes =
    if is_extracted
      {
        page_extraction_state: EXTRACTING_STATES[:extracted],
        page_extraction_pages: @pdf_pages
      }
    else
      {
        page_extraction_state: EXTRACTING_STATES[:'error.extraction'],
        page_extraction_pages: 0
      }
    end

  # Only measure the file when there is one to measure.
  document_present = @document_path && File.exist?(@document_path)
  filesize = Filesize.from("#{File.size(@document_path)} B").pretty if document_present

  self.update(
    state_attributes.merge(
      page_extraction_doctype: @document_path&.split('.')&.last,
      page_extraction_filesize: filesize
    )
  )
end

#valid_document ⇒ `Object`



6
7
8

# File 'lib/act_as_page_extractor/modules/validating.rb', line 6

# Truthy only when the document passes both checks: validate_size first, then
# validate_doc_types (skipped when the size check fails).
def valid_document
  validate_size && validate_doc_types
end

#validate_compress_types ⇒ `Object`



15
16
17

# File 'lib/act_as_page_extractor/modules/validating.rb', line 15

# True when the text after the last dot of @copy_document_path, lowercased,
# is listed in VALIDATE_COMPRESS_TYPES.
def validate_compress_types
  extension = @copy_document_path.split('.').last&.downcase
  VALIDATE_COMPRESS_TYPES.include?(extension)
end

#validate_doc_types ⇒ `Object`



19
20
21

# File 'lib/act_as_page_extractor/modules/validating.rb', line 19

# True when the text after the last dot of @document_path, lowercased, is
# listed in VALIDATE_DOC_TYPES.
def validate_doc_types
  extension = @document_path.split('.').last&.downcase
  VALIDATE_DOC_TYPES.include?(extension)
end

#validate_size ⇒ `Object`

# File 'lib/act_as_page_extractor/modules/validating.rb', line 10

# Checks that the copied document does not exceed the size limit.
#
# @param max_bytes [Integer] largest accepted size in bytes; defaults to 1 MiB,
#   the limit that was previously hard-coded
# @return [Boolean] true when the size of @copy_document_path is within the limit
def validate_size(max_bytes = 2**20)
  File.size(@copy_document_path) <= max_bytes
end

Module: ActAsPageExtractor

Overview

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.start_extraction ⇒ Object

.statistics ⇒ Object

Instance Method Details

#cleanup_pages ⇒ Object

#convert_to_pdf ⇒ Object

#convert_to_text ⇒ Object

#debug_info ⇒ Object

#extract_pages ⇒ Object

#initialized ⇒ Object

#is_extracted ⇒ Object

#origin_file_name ⇒ Object

#page_extract! ⇒ Object

#pdf_path ⇒ Object

#remove_files ⇒ Object

#remove_last_byte(file_name) ⇒ Object

#save_pdf ⇒ Object

#save_to_db ⇒ Object

#timeout_wrapper ⇒ Object

#unzip_document ⇒ Object

#update_state ⇒ Object

#valid_document ⇒ Object

#validate_compress_types ⇒ Object

#validate_doc_types ⇒ Object

#validate_size ⇒ Object

.start_extraction ⇒ `Object`

.statistics ⇒ `Object`

#cleanup_pages ⇒ `Object`

#convert_to_pdf ⇒ `Object`

#convert_to_text ⇒ `Object`

#debug_info ⇒ `Object`

#extract_pages ⇒ `Object`

#initialized ⇒ `Object`

#is_extracted ⇒ `Object`

#origin_file_name ⇒ `Object`

#page_extract! ⇒ `Object`

#pdf_path ⇒ `Object`

#remove_files ⇒ `Object`

#remove_last_byte(file_name) ⇒ `Object`

#save_pdf ⇒ `Object`

#save_to_db ⇒ `Object`

#timeout_wrapper ⇒ `Object`

#unzip_document ⇒ `Object`

#update_state ⇒ `Object`

#valid_document ⇒ `Object`

#validate_compress_types ⇒ `Object`

#validate_doc_types ⇒ `Object`

#validate_size ⇒ `Object`