Class: ParseFile

Inherits:
Object
  • Object
show all
Defined in:
lib/parsefile.rb

Instance Method Summary collapse

Constructor Details

#initialize(file, input_dir, output_dir, tika) ⇒ ParseFile

Returns a new instance of ParseFile.



8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/parsefile.rb', line 8

# Stores the file to process together with the directories and the
# optional Tika server it should be processed with.
#
# @param file       path of the file to parse (kept in @path)
# @param input_dir  input directory (kept in @input_dir)
# @param output_dir output directory (kept in @output_dir)
# @param tika       URL of a Tika server; any falsy value is stored as nil
def initialize(file, input_dir, output_dir, tika)
  @path = file
  @input_dir = input_dir
  @output_dir = output_dir
  # Normalise "no server given" (nil or false) to nil.
  # NOTE(review): presumably a nil @tika makes the OCR step fall back to
  # OKFN's hosted service over plain HTTP -- confirm in OCRFile.
  @tika = tika || nil
end

Instance Method Details

#gen_output ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/parsefile.rb', line 41

# Builds the pretty-printed JSON document describing the parsed file.
#
# The document contains :full_path (taken from @path), every entry of
# @metadata (merged after :full_path, so a metadata key of the same name
# wins), and :text -- @text converted with to_s and encoded as UTF-8, with
# invalid or undefined byte sequences replaced by '?'.
#
# Errors are no longer swallowed into a debugger session: any exception
# (for example a TypeError when @metadata is not a Hash) propagates to the
# caller.
#
# @return [String] the JSON text produced by JSON.pretty_generate
def gen_output
  outhash = { full_path: @path }
  outhash.merge!(@metadata)
  # The options must be keyword arguments: a positional Hash is read as the
  # source-encoding argument on Ruby 3+, which makes encode raise.
  outhash[:text] = @text.to_s.encode('UTF-8',
                                     invalid: :replace,
                                     undef: :replace,
                                     replace: '?')
  JSON.pretty_generate(outhash)
end

#parse_file ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/parsefile.rb', line 21

# Runs the whole pipeline for @path: extracts metadata, extracts the text,
# and returns the generated output.
#
# On success the result of gen_output is returned. If any StandardError is
# raised along the way, the file path is appended as one line to
# "error_log.txt" inside @output_dir and the value returned by IO.write
# (the number of bytes written) is returned instead.
def parse_file
  puts "sending file: " + @path

  # Re-pack every byte of the path as a Unicode code point, yielding a
  # UTF-8 string for the metadata extractor; the OCR step gets the raw path.
  utf8_path = @path.unpack('C*').pack('U*')
  @metadata = ExtractMetadata.new(utf8_path, @input_dir, @output_dir).extract

  @text = OCRFile.new(@path, @input_dir, @output_dir, @metadata[:rel_path], @tika).ocr

  gen_output
rescue
  # TODO: use a global debug / log
  # binding.pry
  failed_entry = @path + "\n"
  IO.write(@output_dir + "/error_log.txt", failed_entry, mode: 'a')
end