Module: ActAsPageExtractor

Extended by:
ActiveSupport::Concern
Defined in:
lib/act_as_page_extractor/version.rb,
lib/act_as_page_extractor.rb,
lib/act_as_page_extractor/modules/tools.rb,
lib/act_as_page_extractor/modules/saving.rb,
lib/act_as_page_extractor/modules/interface.rb,
lib/act_as_page_extractor/modules/unzipping.rb,
lib/act_as_page_extractor/modules/extracting.rb,
lib/act_as_page_extractor/modules/validating.rb,
lib/generators/act_as_page_extractor/migration_generator.rb

Overview

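Concern that, once mixed into a document model, copies the document into a temporary directory, unpacks it when it is an archive, converts it to PDF, extracts the text of every page and stores the pages as ExtractedPage records.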

Defined Under Namespace

Modules: ClassMethods, Generators

Constant Summary collapse

# Values written to a document's page_extraction_state attribute.
EXTRACTING_STATES =
  %w[new extracting extracted error.extraction]
  .map { |state| [state.to_sym, state] }
  .to_h
  .freeze
# Scratch directory (relative to the process working directory) used while extracting.
TMP_EXTRACTION_FILE_STORAGE = [Dir.pwd, 'tmp', 'page_extraction'].join('/').freeze
# Root of the public file storage, relative to the process working directory.
FILE_STORAGE = [Dir.pwd, 'public'].join('/').freeze
# Directory under FILE_STORAGE that receives converted PDF files.
PDF_STORAGE = [FILE_STORAGE, 'uploads', 'extracted', 'pdf'].join('/').freeze
# Gem version.
VERSION = '0.6.4'
# Archive extensions that are unpacked before extraction.
VALIDATE_COMPRESS_TYPES = %w[zip rar 7z gzip].freeze
# Document extensions that are accepted for page extraction.
VALIDATE_DOC_TYPES = %w[txt pdf doc docx rtf odt htm html].freeze

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.start_extractionObject



16
17
18
# File 'lib/act_as_page_extractor/modules/interface.rb', line 16

# Runs page extraction for every document whose page_extraction_state is
# still EXTRACTING_STATES[:new].
#
# @return [Object] whatever `each` returns for the queried collection
def self.start_extraction
  pending_documents = document_class.where(page_extraction_state: EXTRACTING_STATES[:new])
  pending_documents.each { |document| document.page_extract! }
end

.statisticsObject



20
21
22
23
24
25
26
27
28
29
# File 'lib/act_as_page_extractor/modules/interface.rb', line 20

# Collects document counts: overall, by supported doctype, and per
# extraction state.
#
# The doctype filter uses ILIKE ANY over VALIDATE_DOC_TYPES, so the query
# is built as raw SQL from that constant list.
#
# @return [Hash] with keys :total, :supported_documents,
#   :unsupported_documents and :states (state key => count)
def self.statistics
  total = document_class.count

  like_patterns = VALIDATE_DOC_TYPES.map { |doctype| "'%#{doctype}%'" }.join(',')
  supported = document_class.where("page_extraction_doctype ILIKE ANY (array[#{like_patterns}])").count

  per_state = EXTRACTING_STATES.each_with_object({}) do |(state, value), counts|
    counts[state] = document_class.where(page_extraction_state: value).count
  end

  {
    total: total,
    supported_documents: supported,
    unsupported_documents: total - supported,
    states: per_state
  }
end

Instance Method Details

#cleanup_pagesObject



38
39
40
# File 'lib/act_as_page_extractor/modules/tools.rb', line 38

# Destroys every page currently associated with this document through
# the extracted_pages association.
def cleanup_pages
  pages = extracted_pages
  pages.destroy_all
end

#convert_to_pdfObject



14
15
16
17
18
19
20
21
22
23
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 14

# Determines the PDF to extract from and stores it in @pdf_path.
#
# A document whose extension is already "pdf" (case-insensitive) is used
# as is. Anything else is converted through Docsplit inside
# timeout_wrapper; the expected result is the document path with its last
# extension replaced by "pdf".
#
# @return [String, nil] the PDF path, or nil when the conversion failed or
#   the expected output file does not exist
def convert_to_pdf
  path_parts = @document_path.split('.')

  if path_parts.last.downcase == 'pdf'
    @pdf_path = @document_path
  elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
    converted = (path_parts[0..-2] + ['pdf']).join('.')
    @pdf_path = File.exist?(converted) ? converted : nil
  else
    @pdf_path = nil
  end
end

#convert_to_textObject



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 25

# Splits the PDF at @pdf_path into per-page text files in @tmp_dir and
# records the page count in @pdf_pages.
#
# @pdf_pages is reset first, so a failure can never leave a page count
# from an earlier run behind. It ends up nil when there is no PDF, when
# the page count cannot be read, or when text extraction fails or times
# out (timeout_wrapper returns a falsy value in that case).
#
# Errors are deliberately not propagated: callers detect failure through
# @pdf_pages being nil.
#
# @return [nil]
def convert_to_text
  @pdf_pages = nil
  return unless @pdf_path

  page_count = PdfUtils.info(@pdf_path).pages
  return unless page_count

  extracted = timeout_wrapper do
    Docsplit.extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
  end
  @pdf_pages = page_count if extracted
  nil
# :nocov:
rescue StandardError
  @pdf_pages = nil
end
# :nocov:

#debug_infoObject

:nocov:



43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/act_as_page_extractor/modules/tools.rb', line 43

# Debugging hook; currently a no-op.
#
# The commented-out lines below would print the instance variables used
# during an extraction run. `ap` is not defined in this file — presumably
# the awesome_print helper; confirm before re-enabling.
def debug_info
  # ap "@tmp_dir"
  # ap @tmp_dir
  # ap "@copy_document_path"
  # ap @copy_document_path
  # ap "@document_path"
  # ap @document_path
  # ap "@pdf_path"
  # ap @pdf_path
  # ap "@pdf_pages"
  # ap @pdf_pages
end

#extract_pagesObject



9
10
11
12
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 9

# Runs the two conversion steps in order: first resolve/produce the PDF
# (convert_to_pdf sets @pdf_path), then split it into per-page text
# (convert_to_text reads @pdf_path), so the order matters.
def extract_pages
  convert_to_pdf
  convert_to_text
end

#initializedObject



54
55
56
57
58
59
60
61
62
63
# File 'lib/act_as_page_extractor.rb', line 54

# Prepares the extractor before a run; currently this only calls
# create_pdf_dir (defined outside this view).
def initialized
  # TODO: register all needed callbacks,
  #   e.g. remove the stored PDF when the document is destroyed.

  # TODO: add to the README:
  #   rails g act_as_page_extractor:migration Document category_id user_id
  #   and add to the [Document] model:
  #   has_many :extracted_pages, dependent: :destroy
  create_pdf_dir
end

#is_extractedObject

:nocov:



16
17
18
# File 'lib/act_as_page_extractor/modules/tools.rb', line 16

# Tells whether the last run stored every page of the PDF.
#
# @return [Boolean] true only when @pdf_pages is a positive number and the
#   number of stored extracted_pages equals it
def is_extracted
  return false unless @pdf_pages.to_i.positive?

  extracted_pages.count == @pdf_pages
end

#origin_file_nameObject



2
3
4
# File 'lib/act_as_page_extractor/modules/interface.rb', line 2

# Returns the last "/"-separated segment of the URL exposed by the object
# that extracted_filename returns.
#
# @return [String, nil] the file name, or nil when the URL is empty
def origin_file_name
  file_url = send(:extracted_filename).url.to_s
  file_url.split('/').last
end

#page_extract!Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/act_as_page_extractor.rb', line 65

# Runs the whole extraction pipeline for this document.
#
# Preparation (initialized, cleanup_pages, create_tmp_dir) happens outside
# the begin block, so an error there skips the ensure steps below.
# No exception is rescued here: errors raised by any step propagate to the
# caller after the ensure section has run.
def page_extract!
  initialized
  # Drop pages stored by a previous run before extracting again.
  cleanup_pages
  create_tmp_dir
  begin
    copy_document
    unzip_document
    # Pages are only extracted and stored for documents that pass validation.
    if valid_document
      extract_pages
      save_to_db
    end
  ensure
    # Always executed, in this order, even when a step above raised.
    update_state
    save_pdf
    debug_info
    # finish is defined outside this view — presumably removes the temporary
    # directory; confirm.
    finish
  end
end

#pdf_pathObject



6
7
8
9
10
# File 'lib/act_as_page_extractor/modules/interface.rb', line 6

# Path of the stored PDF copy of this document.
#
# @return [String, nil] "<pdf_storage>/<name up to the first dot>.pdf", or
#   nil unless the document is in the extracted state and its doctype is
#   something other than "pdf" (case-insensitive)
def pdf_path
  return unless page_extraction_state == EXTRACTING_STATES[:extracted]
  return if page_extraction_doctype&.downcase == 'pdf'

  base_name = origin_file_name.split('.').first
  "#{pdf_storage}/#{base_name}.pdf"
end

#remove_filesObject



12
13
14
# File 'lib/act_as_page_extractor/modules/interface.rb', line 12

# Deletes the stored PDF copy when pdf_path points at an existing file.
# Does nothing when pdf_path is nil or the file is absent.
def remove_files
  return unless File.exist?(pdf_path.to_s)

  FileUtils.rm_rf(pdf_path)
end

#remove_last_byte(file_name) ⇒ Object

Fix for OpenOffice/JODConverter output: removes the trailing form-feed byte from a converted text page.



38
39
40
41
42
43
44
45
46
# File 'lib/act_as_page_extractor/modules/saving.rb', line 38

# Removes a trailing form feed ("\f") from the given text page file.
#
# The block form of File.open guarantees the handle is closed even when
# seeking, reading or truncating raises.
#
# NOTE(review): the 'a+' mode is kept from the original; it creates the
# file when it does not exist — confirm that this is intended.
#
# @param file_name [String] path of the text file to fix in place
# @return [nil]
def remove_last_byte(file_name)
  File.open(file_name, 'a+') do |file|
    size = file.size
    next if size.zero?

    # Look only at the final byte; anything but a form feed is left alone.
    file.seek(size - 1)
    file.truncate(size - 1) if file.getc == "\f"
  end
  nil
end

#save_pdfObject



2
3
4
5
6
7
8
9
10
11
# File 'lib/act_as_page_extractor/modules/saving.rb', line 2

# Copies the converted PDF into pdf_storage.
#
# The copy only happens when saving PDFs is enabled (save_as_pdf), the
# extraction succeeded (is_extracted), the source document is not itself
# a PDF, and a converted PDF path is known.
def save_pdf
  return unless save_as_pdf && is_extracted
  return if @document_path.split('.').last&.downcase == 'pdf'
  return unless @pdf_path

  FileUtils.cp(@pdf_path, pdf_storage)
end

#save_to_dbObject



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/act_as_page_extractor/modules/saving.rb', line 13

# Stores the text of every extracted page as an ExtractedPage record.
#
# The document is first marked as "extracting". Then, inside a single
# ExtractedPage transaction, each page file
# "<tmp_dir>/<document name up to the first dot>_<page number>.txt" is
# cleaned (trailing form feed removed; "<", ">", "&", U+0001, U+25A0 and
# the bell character stripped) and saved together with the owning
# document id and the configured additional fields.
#
# Nothing is created when @pdf_pages is nil.
def save_to_db
  update(page_extraction_state: EXTRACTING_STATES[:extracting])

  ExtractedPage.transaction do
    @pdf_pages&.times do |page_index|
      page_number = page_index + 1
      text_file = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{page_number}.txt"
      remove_last_byte(text_file)

      attributes = {
        page:        IO.read(text_file).delete("<" ">" "&" "\u0001" "\u25A0" "\a"),
        page_number: page_number,
        extracted_document_id => id
      }
      additional_fields.each { |field| attributes[field] = send(field.to_sym) }

      ExtractedPage.create(attributes)
    end
  end
end

#timeout_wrapperObject

:nocov:



5
6
7
8
9
10
11
12
13
# File 'lib/act_as_page_extractor/modules/tools.rb', line 5

# Runs the given block under a time limit and swallows its failures.
#
# The original `ensure result` clause was dead code (the value of an
# ensure clause is discarded), so it is gone; the observable contract is
# unchanged.
#
# @param seconds [Numeric] time limit; defaults to five minutes
# @yield the work to run under the limit
# @return [Object, nil] the block's value, or nil when the block raised a
#   StandardError or exceeded the limit (Timeout::Error is a StandardError)
def timeout_wrapper(seconds = 60 * 5)
  Timeout.timeout(seconds) { yield }
rescue StandardError
  nil
end

#unzip_documentObject



2
3
4
5
6
7
8
9
10
11
12
13
14
# File 'lib/act_as_page_extractor/modules/unzipping.rb', line 2

# Sets @document_path, unpacking the copied document first when it is an
# archive.
#
# @document_path defaults to the copied file. When the copy has an archive
# extension and decompression succeeds with exactly one file, that file is
# moved to "<tmp_dir>/<original name up to the first dot>.<unpacked
# extension>" and becomes the new @document_path. In every other case the
# archive itself stays as @document_path.
def unzip_document
  @document_path = @copy_document_path
  return unless validate_compress_types

  unpacked = TotalCompressor.decompress(@copy_document_path)
  return unless unpacked[:success] && unpacked[:files].length == 1

  base_name = @origin_document_path.split('/').last.split('.').first
  extracted_file = unpacked[:files].first
  extension = extracted_file.split('/').last.split('.').last

  @document_path = "#{@tmp_dir}/#{base_name}.#{extension}"
  File.rename(extracted_file, @document_path)
end

#update_stateObject



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/act_as_page_extractor/modules/tools.rb', line 20

# Persists the outcome of the extraction run on the document.
#
# Writes the "extracted" state with the page count on success, otherwise
# the "error.extraction" state with zero pages, plus the doctype (last
# extension of @document_path, nil when the path is unknown).
#
# The file size is only written when @document_path points at an existing
# file. This method runs from the ensure section of page_extract!, so it
# must not raise when an earlier step failed before the document was
# copied; raising here would mask the original error and skip the
# remaining cleanup steps.
def update_state
  state_attributes =
    if is_extracted
      {
        page_extraction_state: EXTRACTING_STATES[:extracted],
        page_extraction_pages: @pdf_pages
      }
    else
      {
        page_extraction_state: EXTRACTING_STATES[:'error.extraction'],
        page_extraction_pages: 0
      }
    end

  state_attributes[:page_extraction_doctype] = @document_path&.split('.')&.last

  if @document_path && File.exist?(@document_path)
    state_attributes[:page_extraction_filesize] =
      Filesize.from("#{File.size(@document_path)} B").pretty
  end

  update(state_attributes)
end

#valid_documentObject



6
7
8
# File 'lib/act_as_page_extractor/modules/validating.rb', line 6

# A document is valid when it passes the size check and has a supported
# document extension. The extension check is skipped when the size check
# already failed.
def valid_document
  size_ok = validate_size
  return size_ok unless size_ok

  validate_doc_types
end

#validate_compress_typesObject



15
16
17
# File 'lib/act_as_page_extractor/modules/validating.rb', line 15

# Checks whether the copied document's last extension (case-insensitive)
# is one of VALIDATE_COMPRESS_TYPES.
#
# @return [Boolean]
def validate_compress_types
  extension = @copy_document_path.split('.').last&.downcase
  VALIDATE_COMPRESS_TYPES.include?(extension)
end

#validate_doc_typesObject



19
20
21
# File 'lib/act_as_page_extractor/modules/validating.rb', line 19

# Checks whether the document's last extension (case-insensitive) is one
# of VALIDATE_DOC_TYPES.
#
# @return [Boolean]
def validate_doc_types
  extension = @document_path.split('.').last&.downcase
  VALIDATE_DOC_TYPES.include?(extension)
end

#validate_sizeObject



10
11
12
13
# File 'lib/act_as_page_extractor/modules/validating.rb', line 10

# Checks that the copied document is no larger than one mebibyte
# (1024 * 1024 bytes).
#
# @return [Boolean]
def validate_size
  size_limit = 1024 * 1024
  File.size(@copy_document_path) <= size_limit
end