Class: OCRSDK::PDF

# File 'lib/ocrsdk/pdf.rb', line 9

# Heuristic check over the PDF at +@image_path+: compares how many embedded
# image XObjects the document has against how much text can be extracted
# from it. Presumably used to decide whether the file needs OCR at all;
# confirm against callers.
#
# The result is +true+ when either
# * the number of image XObjects times 20 exceeds the total length of the
#   extracted text, or
# * fewer than 10 distinct characters occur in the extracted text across
#   all pages.
#
# @return [Boolean] result of the heuristic above; +false+ when PDF::Reader
#   raises MalformedPDFError or UnsupportedFeatureError while reading the file
def recognizeable?
  reader = PDF::Reader.new @image_path

  image_count = 0
  text_length = 0
  codepoints  = Set.new
  reader.pages.each do |page|
    # Ask for the page text once and reuse it for both metrics, so the
    # extraction is not requested twice per page.
    page_text = page.text

    text_length += page_text.length
    # Merge in place instead of allocating a fresh Set for every page.
    codepoints.merge(page_text.codepoints)
    image_count += page.xobjects.count { |_name, stream| stream.hash[:Subtype] == :Image }
  end

  # count number of distinct characters
  # in case of "searchable", but incorrectly recognized document
  image_count * 20 > text_length || codepoints.length < 10
rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError
  false
end

Class: OCRSDK::PDF

Constant Summary

Constants included from Verifiers::Profile

Constants included from Verifiers::Format

Constants included from Verifiers::Language

Instance Method Summary

Methods inherited from Image

Methods included from Verifiers::Profile

Methods included from Verifiers::Format

Methods included from Verifiers::Language

Methods inherited from AbstractEntity

Constructor Details

Instance Method Details

#recognizeable? ⇒ Boolean

#recognizeable? ⇒ `Boolean`