Class: Tsumetogi::TextExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/tsumetogi/text_extractor.rb

Instance Method Summary collapse

Constructor Details

#initialize(pdf_path, config = nil) ⇒ TextExtractor

Returns a new instance of TextExtractor.



5
6
7
8
9
10
# File 'lib/tsumetogi/text_extractor.rb', line 5

# Builds an extractor for a single PDF file.
#
# @param pdf_path [String] path of the PDF to read text from
# @param config [Tsumetogi::Config, nil] settings object; a fresh
#   Tsumetogi::Config is created when none is given
# @return [TextExtractor] a new instance of TextExtractor
def initialize(pdf_path, config = nil)
  @pdf_path = pdf_path
  @config = config || Tsumetogi::Config.new

  # Prefer the configured output path; otherwise place a .txt file next to
  # the PDF, reusing its name with the last extension stripped.
  @text_path = @config.text_path || begin
    directory = File.dirname(@pdf_path)
    stem = File.basename(@pdf_path, ".*")
    "#{directory}/#{stem}.txt"
  end
end

Instance Method Details

#extract ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/tsumetogi/text_extractor.rb', line 12

# Runs the external `pdftotext` command to write the PDF's text to
# @text_path.
#
# The resolution from the config is always passed with -r. The crop
# rectangle (-x, -y, -W, -H) is passed only when at least one of the four
# crop settings is non-zero. Both the intent and the final command line are
# logged at debug level.
#
# @return [Boolean, nil] the value returned by Kernel#system
def extract
  Tsumetogi.logger.debug "extracting text from #{File.basename(@pdf_path)} to #{@text_path}"

  crop_values = [@config.crop_x, @config.crop_y, @config.crop_w, @config.crop_h]
  crop_args =
    if crop_values.all?(&:zero?)
      []
    else
      # Pair each flag with its value, in the order x, y, width, height.
      %w[-x -y -W -H].zip(crop_values).flat_map { |flag, value| [flag, value.to_s] }
    end

  argv = ["pdftotext", "-r", @config.resolution.to_s, *crop_args, @pdf_path, @text_path]

  Tsumetogi.logger.debug argv.join(" ")
  # Array form: arguments go straight to the program, no shell involved.
  system(*argv)
end