Module: RTesseract::Box

Extended by:
Base
Defined in:
lib/rtesseract/box.rb

Class Method Summary

Methods included from Base

temp_file_path

Class Method Details

.parse(content) ⇒ Object



16
17
18
# File 'lib/rtesseract/box.rb', line 16

# Feeds every line of +content+ through +parse_line+ and collects the
# non-nil results in their original order.
#
# @param content [String] text to split into lines
# @return [Array] one entry per line for which +parse_line+ returned non-nil
def parse(content)
  content.lines.each_with_object([]) do |line, words|
    info = parse_line(line)
    # Lines that parse_line rejects come back as nil and are dropped.
    words << info unless info.nil?
  end
end

.parse_confidence(line) ⇒ Object



45
46
47
# File 'lib/rtesseract/box.rb', line 45

# Extracts the text that sits between a ";" and the next "'" on +line+
# and splits it on whitespace.
#
# @param line [String] a single line of markup
# @return [Array<String>] whitespace-separated tokens, or an empty array
#   when the pattern does not match
def parse_confidence(line)
  # String#[] yields nil when nothing matches; to_s turns that into "".
  segment = line[/(?<=;)(.*?)(?=')/]
  segment.to_s.split
end

.parse_line(line) ⇒ Object



20
21
22
23
24
25
26
27
28
# File 'lib/rtesseract/box.rb', line 20

# Builds a word entry from one line of markup.
#
# Only lines containing "ocr_word" or "ocrx_word" are considered. The word
# is the text captured between the first ">" and the last "<" on the line;
# lines whose word is blank are skipped.
#
# @param line [String] a single line of markup
# @return [Object, nil] the result of +word_info+, or nil when the line is
#   not a word line or its word is blank
def parse_line(line)
  if line.match?(/oc(rx|r)_word/)
    # First match, first capture group; nil (no match) becomes "".
    text = line.to_s[/>(.*)</, 1].to_s

    word_info(text, parse_position(line), parse_confidence(line)) unless text.strip.empty?
  end
end

.parse_position(line) ⇒ Object



41
42
43
# File 'lib/rtesseract/box.rb', line 41

# Extracts the text that sits between "title" and the next ";" on +line+
# and splits it on whitespace.
#
# @param line [String] a single line of markup
# @return [Array<String>] whitespace-separated tokens, or an empty array
#   when the pattern does not match
def parse_position(line)
  found = line.match(/(?<=title)(.*?)(?=;)/)
  # No match means there is nothing to split.
  found ? found[0].split : []
end

.run(source, errors, options) ⇒ Object



8
9
10
11
12
13
14
# File 'lib/rtesseract/box.rb', line 8

# Runs an RTesseract::Command with +tessedit_create_hocr+ set to 1 and
# parses the ".hocr" file found at the output path the command yields.
#
# @param source passed through to RTesseract::Command
# @param errors passed through to RTesseract::Command
# @param options [Hash] command options; not mutated, a merged copy is used
# @return [Object] whatever RTesseract::Command#run returns
#   (presumably the parsed words from the block; confirm against Command#run)
def run(source, errors, options)
  hocr_flag = { tessedit_create_hocr: 1 }
  command = RTesseract::Command.new(source, temp_file_path, errors, options.merge(hocr_flag))

  command.run { |output_path| parse(File.read("#{output_path}.hocr")) }
end

.word_info(word, positions, confidence) ⇒ Object



30
31
32
33
34
35
36
37
38
39
# File 'lib/rtesseract/box.rb', line 30

# Assembles the hash describing a single word.
#
# Elements 1 through 4 of +positions+ become the integer coordinates and
# the last element of +confidence+ becomes the integer confidence. Missing
# elements convert to 0 via +to_i+.
#
# @param word [String] the recognised text
# @param positions [Array<String>] tokens where indexes 1..4 hold the box
# @param confidence [Array<String>] tokens whose last entry is the confidence
# @return [Hash] with keys :word, :confidence, :x_start, :y_start, :x_end, :y_end
def word_info(word, positions, confidence)
  corner_keys = %i[x_start y_start x_end y_end]
  # Index 0 of positions is skipped; only indexes 1..4 are used.
  corner_values = positions.values_at(1, 2, 3, 4).map(&:to_i)

  { word: word, confidence: confidence.last.to_i }.merge(corner_keys.zip(corner_values).to_h)
end