Class: Paperclip::Document::Processors::Reader

Inherits:
Paperclip::Document::Processor show all
Defined in:
lib/paperclip/document/processors/reader.rb

Overview

This processor extracts the OCR text of the file.

Instance Attribute Summary collapse

Attributes inherited from Paperclip::Document::Processor

#instance, #tmp_dir

Instance Method Summary collapse

Methods inherited from Paperclip::Document::Processor

#basename, #file_path

Constructor Details

#initialize(file, options = {}, attachment = nil) ⇒ Reader

Returns a new instance of Reader.



10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/paperclip/document/processors/reader.rb', line 10

# Builds a Reader processor.
#
# Options read here:
# * +:text_column+ - name of the column that receives the extracted text.
#   When absent, and the model exposes the default column (see
#   #text_column?), the value of #default_text_column is used.
# * +:language+    - stored as-is in @language.
# * +:clean+       - only consulted on Ruby versions below 2.0; on 2.0 and
#   later @clean is always false.
#
# NOTE(review): @options is presumably populated by the parent constructor
# from +options+ - confirm against Paperclip::Document::Processor.
# NOTE(review): RUBY_VERSION is compared as a String, i.e. lexicographically.
#
# Raises Paperclip::Error when no text column could be determined.
def initialize(file, options = {}, attachment = nil)
  super(file, options, attachment)

  # Fall back to the conventional column only when the caller gave none
  # and the model actually has that column.
  @options[:text_column] = default_text_column if @options[:text_column].nil? && text_column?

  @language = @options[:language]
  @text_column = @options[:text_column]
  raise Paperclip::Error, "No content text column given" unless @text_column

  @clean =
    if RUBY_VERSION >= "2.0"
      false
    elsif options.has_key?(:clean)
      !!options[:clean]
    else
      true
    end
end

Instance Attribute Details

#clean ⇒ Object

Returns the value of attribute clean.



8
9
10
# File 'lib/paperclip/document/processors/reader.rb', line 8

# Reader for @clean, the flag computed in the constructor and passed to
# Docsplit as the :clean option when extracting text.
def clean
  @clean
end

#language ⇒ Object

Returns the value of attribute language.



8
9
10
# File 'lib/paperclip/document/processors/reader.rb', line 8

# Reader for @language, taken from the :language option in the constructor.
# The extraction step accepts either a plain value or a Proc here.
def language
  @language
end

#text_column ⇒ Object

Returns the value of attribute text_column.



8
9
10
# File 'lib/paperclip/document/processors/reader.rb', line 8

# Reader for @text_column, the key under which the extracted text is stored
# on the attachment's model instance.
def text_column
  @text_column
end

Instance Method Details

#default_text_column ⇒ Object

Returns the name of the default text column



49
50
51
# File 'lib/paperclip/document/processors/reader.rb', line 49

# Name of the column used when no :text_column option is supplied:
# the attachment name followed by "_content_text".
def default_text_column
  "#{@attachment.name}_content_text"
end

#make ⇒ Object

Extracts the text of the entire document.



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/paperclip/document/processors/reader.rb', line 24

# Extracts the text of the document with Docsplit and stores it on the
# attachment's model instance.
#
# Steps:
# 1. Docsplit writes "<basename>.txt" into +tmp_dir+, using the :clean flag
#    and the configured language (a Proc language is called with the model
#    instance to obtain the actual value).
# 2. The generated text file is read and assigned to the instance under
#    +text_column+.
# 3. The instance's :save callbacks are run around a block that returns false.
#
# Returns the value of +file+ (presumably the untouched source file - the
# processor does not produce a new attachment file).
#
# Any exception raised by Docsplit, by reading the text file, or by the
# callbacks propagates to the caller.
def make
  destination_path = tmp_dir.to_s
  record = @attachment.instance

  options = { output: destination_path, clean: @clean }
  options[:language] = language.is_a?(Proc) ? language.call(record) : language
  Docsplit.extract_text(file_path.to_s, options)

  destination_file = File.join(destination_path, "#{basename}.txt")
  # File.read opens and closes the file itself, so no handle is left open
  # if assigning the text or running the callbacks raises.
  record[text_column] = File.read(destination_file)
  record.run_callbacks(:save) { false }

  file
end

#text_column? ⇒ Boolean

Check if the default text column is present

Returns:

  • (Boolean)


41
42
43
44
45
46
# File 'lib/paperclip/document/processors/reader.rb', line 41

# Checks whether the model class of +instance+ has a column whose name
# matches #default_text_column.
#
# Returns true or false (a strict Boolean, never the column object itself).
def text_column?
  expected_column = default_text_column
  instance.class.columns.any? { |column| column.name.to_s == expected_column }
end