Module: ActAsPageExtractor
- Extended by:
- ActiveSupport::Concern
- Defined in:
- lib/act_as_page_extractor/version.rb,
lib/act_as_page_extractor.rb,
lib/act_as_page_extractor/modules/tools.rb,
lib/act_as_page_extractor/modules/saving.rb,
lib/act_as_page_extractor/modules/interface.rb,
lib/act_as_page_extractor/modules/unzipping.rb,
lib/act_as_page_extractor/modules/extracting.rb,
lib/act_as_page_extractor/modules/validating.rb,
lib/generators/act_as_page_extractor/migration_generator.rb
Overview
Defined Under Namespace
Modules: ClassMethods, Generators
Constant Summary
collapse
- EXTRACTING_STATES =
{
new: 'new',
extracting: 'extracting',
extracted: 'extracted',
'error.extraction': 'error.extraction'
}.freeze
"#{Dir.pwd}/tmp/page_extraction".freeze
- FILE_STORAGE =
"#{Dir.pwd}/public".freeze
- PDF_STORAGE =
"#{FILE_STORAGE}/uploads/extracted/pdf".freeze
- VERSION =
"0.6.4"
- VALIDATE_COMPRESS_TYPES =
['zip', 'rar', '7z', 'gzip'].freeze
- VALIDATE_DOC_TYPES =
['txt', 'pdf', 'doc', 'docx',
'rtf', 'odt', 'htm', 'html'].freeze
Class Method Summary
collapse
Instance Method Summary
collapse
Class Method Details
16
17
18
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 16
def self.
document_class.where(page_extraction_state: EXTRACTING_STATES[:new]).each(&:page_extract!)
end
|
.statistics ⇒ Object
20
21
22
23
24
25
26
27
28
29
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 20
# Builds a summary of extraction progress over all records of document_class.
#
# @return [Hash] with the keys:
#   :total                 - count of all records,
#   :supported_documents   - records whose page_extraction_doctype matches
#                            (ILIKE) any entry of VALIDATE_DOC_TYPES,
#   :unsupported_documents - total minus supported,
#   :states                - per-state record counts keyed like EXTRACTING_STATES.
def self.statistics
  # Only the frozen VALIDATE_DOC_TYPES constant is interpolated into the SQL.
  doctype_patterns = VALIDATE_DOC_TYPES.map { |doc_type| "'%#{doc_type}%'" }.join(',')
  doctype_filter = "page_extraction_doctype ILIKE ANY (array[#{doctype_patterns}])"

  all_count = document_class.count
  supported_count = document_class.where(doctype_filter).count
  per_state = EXTRACTING_STATES.each_with_object({}) do |(state, value), counts|
    counts[state] = document_class.where(page_extraction_state: value).count
  end

  {
    total: all_count,
    supported_documents: supported_count,
    unsupported_documents: all_count - supported_count,
    states: per_state
  }
end
|
Instance Method Details
#cleanup_pages ⇒ Object
38
39
40
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 38
def cleanup_pages
self..destroy_all
end
|
#convert_to_pdf ⇒ Object
14
15
16
17
18
19
20
21
22
23
|
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 14
def convert_to_pdf
@pdf_path = if 'pdf' == @document_path.split('.').last.downcase
@document_path
else
if timeout_wrapper{ Docsplit.(@document_path, output: @tmp_dir)}
pdf_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
pdf_path if File.exist?(pdf_path)
end
end
end
|
#convert_to_text ⇒ Object
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 25
def convert_to_text
begin
@pdf_pages = PdfUtils.info(@pdf_path).pages
if @pdf_pages
if timeout_wrapper{ Docsplit::(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir) }
else
@pdf_pages = nil
raise
end
end
rescue
end
end
|
#debug_info ⇒ Object
43
44
45
46
47
48
49
50
51
52
53
54
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 43
def debug_info
end
|
9
10
11
12
|
# File 'lib/act_as_page_extractor/modules/extracting.rb', line 9
def
convert_to_pdf
convert_to_text
end
|
#initialized ⇒ Object
54
55
56
57
58
59
60
61
62
63
|
# File 'lib/act_as_page_extractor.rb', line 54
def initialized
create_pdf_dir
end
|
16
17
18
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 16
def
@pdf_pages.to_i > 0 && self..count == @pdf_pages
end
|
#origin_file_name ⇒ Object
2
3
4
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 2
# Returns the last "/"-separated segment of the URL exposed by
# extracted_filename (nil when that URL string is empty).
def origin_file_name
  file_url = send(:extracted_filename).url.to_s
  file_url.split('/').last
end
|
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
# File 'lib/act_as_page_extractor.rb', line 65
def
initialized
cleanup_pages
create_tmp_dir
begin
copy_document
unzip_document
if valid_document
save_to_db
end
ensure
update_state
save_pdf
debug_info
finish
end
end
|
#pdf_path ⇒ Object
6
7
8
9
10
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 6
def pdf_path
if == EXTRACTING_STATES[:extracted] && &.downcase != 'pdf'
"#{pdf_storage}/#{origin_file_name.split('.').first}.pdf"
end
end
|
#remove_files ⇒ Object
12
13
14
|
# File 'lib/act_as_page_extractor/modules/interface.rb', line 12
# Deletes whatever pdf_path points to, if it exists on disk.
# Does nothing (and returns nil) when pdf_path is nil or the path is absent.
def remove_files
  return unless File.exist?(pdf_path.to_s)

  FileUtils.rm_rf(pdf_path)
end
|
#remove_last_byte(file_name) ⇒ Object
Fix for OpenOffice/JODConverter: removes the trailing form-feed byte ("\f") from a converted text page.
38
39
40
41
42
43
44
45
46
|
# File 'lib/act_as_page_extractor/modules/saving.rb', line 38
# Removes one trailing form-feed ("\f") from the file, if present.
# Files that are empty or end in any other character are left unchanged.
#
# @param file_name [String] path of the text file to fix up in place
# @return [nil]
def remove_last_byte(file_name)
  # 'a+' is kept on purpose: it creates the file when it is missing instead
  # of raising. The block form guarantees the handle is closed even if
  # seek/getc/truncate raises.
  File.open(file_name, 'a+') do |file|
    next if file.size.zero?

    file.seek(-1, IO::SEEK_END)
    file.truncate(file.size - 1) if file.getc == "\f"
  end
  nil
end
|
#save_pdf ⇒ Object
2
3
4
5
6
7
8
9
10
11
|
# File 'lib/act_as_page_extractor/modules/saving.rb', line 2
def save_pdf
if save_as_pdf &&
&&
@document_path.split('.').last&.downcase != 'pdf'
if @pdf_path
FileUtils.cp(@pdf_path, pdf_storage)
end
end
end
|
#save_to_db ⇒ Object
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
# File 'lib/act_as_page_extractor/modules/saving.rb', line 13
def save_to_db
self.update(page_extraction_state: EXTRACTING_STATES[:extracting])
ExtractedPage.transaction do
@pdf_pages&.times&.each do |pdf_page|
page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{(pdf_page + 1).to_s}.txt"
remove_last_byte(page_filename)
content = IO.read(page_filename).delete("<" ">" "&" "\u0001" "\u25A0" "\a")
page_attributes = {
page: content,
page_number: pdf_page + 1
}
page_attributes[] = self.id
additional_fields.each do |additional_field|
page_attributes[additional_field] = self.send(additional_field.to_sym)
end
ExtractedPage.create(page_attributes)
end
end
end
|
#timeout_wrapper ⇒ Object
5
6
7
8
9
10
11
12
13
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 5
# Runs the given block under a five-minute Timeout limit.
#
# @yield the work to execute
# @return the block's value, or nil when a StandardError is raised while
#   running it (all such errors are swallowed here)
def timeout_wrapper
  Timeout.timeout(60 * 5) { yield }
rescue StandardError
  nil
end
|
#unzip_document ⇒ Object
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# File 'lib/act_as_page_extractor/modules/unzipping.rb', line 2
# Chooses the file that later steps work on (@document_path).
#
# Defaults to the copied upload. When validate_compress_types is truthy, the
# copy is decompressed via TotalCompressor; if that succeeds with exactly one
# file, the file is renamed into @tmp_dir as "<origin name>.<its extension>"
# and becomes @document_path. Otherwise the copy itself stays selected.
def unzip_document
  @document_path = @copy_document_path
  return unless validate_compress_types

  result = TotalCompressor.decompress(@copy_document_path)
  return unless result[:success] && result[:files].length == 1

  extracted_file = result[:files].first
  # Base name is everything before the first dot of the original file name.
  base_name = @origin_document_path.split("/").last.split('.').first
  extension = extracted_file.split('/').last.split('.').last
  @document_path = "#{@tmp_dir}/#{base_name}.#{extension}"
  File.rename(extracted_file, @document_path)
end
|
#update_state ⇒ Object
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
# File 'lib/act_as_page_extractor/modules/tools.rb', line 20
def update_state
updated_attributes = if
{
page_extraction_state: EXTRACTING_STATES[:extracted],
page_extraction_pages: @pdf_pages
}
else
{
page_extraction_state: EXTRACTING_STATES[:'error.extraction'],
page_extraction_pages: 0
}
end.merge({
page_extraction_doctype: @document_path&.split('.')&.last,
page_extraction_filesize: Filesize.from("#{File.size(@document_path)} B").pretty
})
self.update(updated_attributes)
end
|
#valid_document ⇒ Object
6
7
8
|
# File 'lib/act_as_page_extractor/modules/validating.rb', line 6
# A document is valid when it passes the size check and then the type check.
# The type check is skipped when the size check fails.
def valid_document
  size_ok = validate_size
  size_ok && validate_doc_types
end
|
#validate_compress_types ⇒ Object
15
16
17
|
# File 'lib/act_as_page_extractor/modules/validating.rb', line 15
# True when the text after the last "." of @copy_document_path
# (case-insensitively) is listed in VALIDATE_COMPRESS_TYPES.
def validate_compress_types
  extension = @copy_document_path.split('.').last&.downcase
  VALIDATE_COMPRESS_TYPES.include?(extension)
end
|
#validate_doc_types ⇒ Object
19
20
21
|
# File 'lib/act_as_page_extractor/modules/validating.rb', line 19
# True when the text after the last "." of @document_path
# (case-insensitively) is listed in VALIDATE_DOC_TYPES.
def validate_doc_types
  extension = @document_path.split('.').last&.downcase
  VALIDATE_DOC_TYPES.include?(extension)
end
|
#validate_size ⇒ Object
10
11
12
13
|
# File 'lib/act_as_page_extractor/modules/validating.rb', line 10
# True when the file at @copy_document_path is at most 1 MiB (2**20 bytes).
def validate_size
  max_bytes = 1 * 2**20
  File.size(@copy_document_path) <= max_bytes
end
|