Module: Documentalist

Defined in:
lib/dependencies.rb,
lib/documentalist.rb,
lib/backends/net_pbm.rb,
lib/backends/odf_merge.rb,
lib/backends/pdf_tools.rb,
lib/backends/open_office.rb,
lib/backends/wkhtmltopdf.rb

Defined Under Namespace

Modules: Dependencies, NetPBM, ODFMerge, OpenOffice, PdfTools, WkHtmlToPdf
Classes: Error

Constant Summary collapse

# Maps each backend module name to the conversions it supports, written as
# {source format(s) => destination format(s)}. Either side may be a single
# symbol or an array of symbols; backend_for_conversion flattens both.
BACKENDS =
{
  # Find a better pattern to pick backend, this one smells pretty bad
  :WkHtmlToPdf => {[:html, :htm] => :pdf},
  :OpenOffice => {[:odt, :doc, :rtf, :docx, :txt, :wps] => [:odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps]},
  :NetPBM => {:ppm => [:jpg, :jpeg]},
  :PdfTools => {:pdf => :txt},
}
# Module-wide configuration hash; config? treats an empty hash as
# "not configured yet".
@@config =
{}
# Logger instance; stays nil until the logger accessor builds it.
@@logger =
nil

Class Method Summary collapse

Class Method Details

.backend_for_conversion(origin, destination) ⇒ Object

Finds the relevant backend to perform the conversion



44
45
46
47
48
49
50
51
# File 'lib/documentalist.rb', line 44

# Finds the backend module able to convert between two formats.
#
# Both arguments may be bare formats (:pdf, "pdf") or file names
# ("report.pdf"); everything up to and including the last dot is dropped
# before the lookup.
#
# @param origin [String, Symbol] source format or file name
# @param destination [String, Symbol] target format or file name
# @return [Module, nil] the first backend listed in BACKENDS that supports
#   the conversion, or nil when none does
def self.backend_for_conversion(origin, destination)
  origin = origin.to_s.gsub(/.*\./, "").to_sym
  destination = destination.to_s.gsub(/.*\./, "").to_sym

  match = BACKENDS.detect do |_name, conversions|
    conversions.keys.flatten.include?(origin) && conversions.values.flatten.include?(destination)
  end

  # Resolve the constant only for the selected backend, so a backend that is
  # listed in BACKENDS but not loaded cannot break unrelated conversions.
  match && const_get(match.first)
end

.check_dependencies ⇒ Object

Checks the dependencies for backends



162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/documentalist.rb', line 162

# Checks the system dependencies of every backend.
#
# Walks all constants defined under Documentalist and, for each one that
# responds to check_dependencies, announces it on standard output and
# invokes it.
def self.check_dependencies
  puts "Checking backends system dependencies"

  Documentalist.constants.each do |constant_name|
    candidate = Documentalist.const_get constant_name.to_sym
    # Constants that are not backends (plain values, other classes) are skipped
    next unless candidate.respond_to? :check_dependencies

    puts "Checking dependencies for #{candidate.to_s}"
    candidate.send :check_dependencies
  end
end

.config ⇒ Object



12
13
14
15
# File 'lib/documentalist.rb', line 12

# Returns the current configuration hash, loading the default configuration
# first when nothing has been configured yet.
def self.config
  return @@config if config?

  default_config!
  @@config
end

.config=(hash) ⇒ Object



17
18
19
20
# File 'lib/documentalist.rb', line 17

# Replaces the configuration with the given hash.
def self.config=(hash)
  # Active Support is not a dependency, so keys are symbolized with our own helper
  symbolized = symbolize(hash)
  @@config = symbolized
end

.config? ⇒ Boolean

Returns:

  • (Boolean)


22
23
24
# File 'lib/documentalist.rb', line 22

# Tells whether a configuration has been set, i.e. the stored configuration
# is anything other than an empty hash.
def self.config?
  !(@@config == {})
end

.config_from_yaml!(file, options = {}) ⇒ Object



30
31
32
33
# File 'lib/documentalist.rb', line 30

# Loads the configuration from a YAML file.
#
# @param file [String] path of the YAML file to read
# @param options [Hash] when :section is given, only that top-level section
#   of the file is kept as the configuration
def self.config_from_yaml!(file, options = {})
  # Block form so the file handle is closed as soon as parsing is done
  self.config = File.open(file) { |io| YAML::load(io) }
  self.config = config[options[:section].to_sym] if options[:section]
end

.convert(file = nil, options = {}) ⇒ Object

Takes all conversion requests and dispatches them appropriately



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/documentalist.rb', line 54

# Takes all conversion requests and dispatches them to the matching backend.
#
# The source is either +file+ or, when +file+ is nil, the raw data given in
# options[:input], which is written to a temporary file named after
# options[:input_format]. The destination is picked from the first of these
# options that is set:
#
# * :to_format - write next to the source, swapping its extension
# * :to        - write to this path; the format is read from its extension
# * :stream    - convert to this format in a temporary file
#
# +options+ is updated in place with :to, :to_format and :from_format before
# being handed to the backend.
#
# When :stream is set the converted data is yielded and returned (and the
# temporary output removed); otherwise the destination path is yielded and
# returned.
#
# Raises Documentalist::Error when there is nothing to convert, the source
# does not exist, no destination was given, or no backend handles the
# requested conversion.
def self.convert(file=nil, options={})
  # Path of the temporary input we create ourselves (if any), kept apart
  # from +file+ so it can be cleaned up whatever happens next.
  temp_input = nil

  if file.nil? && options[:input] && options[:input_format]
    temp_input = File.join(Dir.tmpdir, "#{rand(10**9)}.#{options[:input_format].to_s}")
    File.open(temp_input, 'w') { |f| f.write(options[:input]) }
    file = temp_input
  end

  raise Documentalist::Error.new("No input file or input data was given") if file.nil?
  raise Documentalist::Error.new("#{file} does not exist !") unless File.exist?(file)

  if options[:to_format]
    # Escape the extension so it is matched literally, and cope with
    # extension-less paths by simply appending the new extension.
    extension = /#{Regexp.escape(File.extname(file))}\z/
    options[:to] = file.sub(extension, "") + ".#{options[:to_format].to_s}"
  elsif options[:to]
    options[:to_format] = File.extname(options[:to]).gsub(/\./, "").to_sym
  elsif options[:stream]
    options[:to_format] = options[:stream]
    options[:to] = File.join(Dir.tmpdir, "#{rand(10**9)}.#{options[:to_format]}")
  else
    raise Documentalist::Error.new("No destination, format, or stream format was given")
  end

  options[:from_format] = File.extname(file).gsub(/\./, "").to_sym

  backend = backend_for_conversion(options[:from_format], options[:to_format])
  unless backend
    raise Documentalist::Error.new("No backend can convert #{options[:from_format]} to #{options[:to_format]}")
  end
  backend.convert(file, options)

  if options[:stream]
    data = File.read(options[:to])
    FileUtils.rm(options[:to])
    yield(data) if block_given?
    data
  else
    yield(options[:to]) if block_given?
    options[:to]
  end
ensure
  # Runs on success and on failure alike; rm_f tolerates a file that is
  # already gone.
  FileUtils.rm_f(temp_input) if temp_input
end

.default_config! ⇒ Object



26
27
28
# File 'lib/documentalist.rb', line 26

# Loads the configuration from config/default.yml, located one directory
# above this file.
def self.default_config!
  default_path = File.join(File.dirname(__FILE__), "..", "config", "default.yml")
  config_from_yaml!(default_path)
end

.extract_images(file) {|image_files| ... } ⇒ Object

Yields:

  • (image_files)


106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/documentalist.rb', line 106

# Extracts the images embedded in a document.
#
# The document is copied into a fresh temporary directory and processed
# there: PDFs go through the +pdfimages+ command line tool and the resulting
# PPM files are converted to JPEG, anything else is converted to HTML.
# NOTE(review): the non-PDF path presumably relies on the HTML backend
# writing the document's images next to the HTML output - confirm.
#
# The temporary directory is left in place since the returned paths point
# into it; removing it is up to the caller.
#
# @param file [String] path of the document
# @yield [image_files] the same array that is returned
# @return [Array<String>] paths of the image files found in the temporary
#   directory
def self.extract_images(file)
  temp_dir = File.join(Dir.tmpdir, rand(10**9).to_s)
  temp_file = File.join(temp_dir, File.basename(file))

  # Work on a copy for every document type so that generated files land in
  # temp_dir, which is the only place searched for images below.
  FileUtils.mkdir_p temp_dir
  FileUtils.cp file, temp_file

  if File.extname(file).downcase == '.pdf'
    # Argument-list form: no shell involved, so paths containing spaces or
    # shell metacharacters are passed through untouched.
    system "pdfimages", temp_file, File.join(temp_dir, "img")

    Dir.glob(File.join(temp_dir, "*.ppm")).each do |ppm_image|
      Documentalist.convert(ppm_image, :to_format => :jpeg)
    end
  else
    Documentalist.convert temp_file, :to_format => :html
  end

  image_files = Dir.glob(File.join(temp_dir, "*.{jpg,jpeg,bmp,tif,tiff,gif,png}"))

  yield(image_files) if block_given?
  image_files
end

.extract_text(file) ⇒ Object



95
96
97
98
99
100
101
102
103
104
# File 'lib/documentalist.rb', line 95

# Extracts the text of a document.
#
# The document is converted to a .txt file, whose content is re-encoded to
# UTF-8 with Kconv before the intermediate file is removed.
#
# @param file [String] path of the document
# @yield [text] the extracted text, when a block is given
# @return [String, nil] the extracted text, or nil when the conversion did
#   not produce a file
def self.extract_text(file)
  converted = convert(file, :to_format => :txt)
  return unless converted && File.exist?(converted)

  # File.read closes its handle before the file gets removed
  text = Kconv.toutf8(File.read(converted))
  FileUtils.rm(converted)
  yield(text) if block_given?
  text
end

.logger ⇒ Object

Returns the logger object used to log documentalist operations



151
152
153
154
155
156
157
158
159
# File 'lib/documentalist.rb', line 151

# Returns the logger object used to log documentalist operations.
#
# The logger is built on first use: it writes to config[:log_file]
# (defaulting to documentalist.log one directory above this file, a default
# that is stored back into the configuration) at the level named by
# config[:log_level], or WARN when no level is configured.
def self.logger
  return @@logger if @@logger

  Documentalist.config[:log_file] ||= File.join(File.dirname(File.expand_path(__FILE__)), "..", "documentalist.log")
  @@logger = Logger.new(Documentalist.config[:log_file])

  level_name = if config[:log_level]
    config[:log_level].upcase
  else
    "WARN"
  end
  @@logger.level = Logger.const_get(level_name)

  @@logger
end

.odf_merge(template, options = {}) ⇒ Object

Merge an ODF document with an arbitrary hash of data



7
8
9
# File 'lib/backends/odf_merge.rb', line 7

# Merges an ODF document with an arbitrary hash of data by handing both
# arguments over to ODFMerge.merge_template and returning its result.
def self.odf_merge(template, options = {})
  merger = ODFMerge
  merger.merge_template(template, options)
end

.symbolize(hash) ⇒ Object

Returns a new hash with recursively symbolized keys



176
177
178
179
180
181
# File 'lib/documentalist.rb', line 176

# Returns a new hash with recursively symbolized keys.
#
# The given hash is left untouched. Nested hashes are symbolized too; keys
# that cannot be turned into a symbol (integers, for instance) are kept as
# they are.
#
# @param hash [Hash] the hash to symbolize
# @return [Hash] a copy of +hash+ with symbol keys
def self.symbolize(hash)
  result = {}

  # Build a fresh hash rather than adding/removing keys on the one being
  # iterated, which Ruby forbids.
  hash.each do |key, value|
    key = key.to_sym if key.respond_to?(:to_sym)
    result[key] = value.is_a?(Hash) ? symbolize(value) : value
  end

  result
end

.timeout(time_limit = 0, options = {:attempts => 1, :sleep => nil}) ⇒ Object

Runs a block with a system-enforced timeout, optionally retrying it a number of times with an optional sleep between attempts. All times are in seconds.



134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/documentalist.rb', line 134

# Runs a block with a system-enforced timeout, optionally retrying it with
# an optional sleep between attempts. All times are in seconds.
#
# SystemTimer is used when it is loaded; otherwise the standard library's
# Timeout module does the job.
#
# @param time_limit [Numeric] time allowed for each attempt
# @param options [Hash] :attempts is the total number of tries (1 when
#   missing; the block always runs at least once), :sleep is the pause taken
#   before each retry
# @return [Object, nil] the value of the block, or nil when no block is given
# @raise [Timeout::Error] when the last attempt times out as well
def self.timeout(time_limit = 0, options = {:attempts => 1, :sleep => nil})
  return unless block_given?

  attempts = options[:attempts] || 1
  timer = defined?(SystemTimer) ? SystemTimer : Timeout

  begin
    timer.timeout(time_limit) { yield }
  rescue Timeout::Error
    attempts -= 1
    # "> 0" rather than "not zero" so that a zero or negative :attempts
    # cannot retry forever
    raise unless attempts > 0

    # Only pause when another attempt is actually coming
    sleep(options[:sleep]) if options[:sleep]
    retry
  end
end