Class: PDFToHTMLR::PdfFile

Inherits:
Object
  • Object
show all
Defined in:
lib/pdftohtmlr.rb

Overview

Provides facilities for converting PDFs to HTML from Ruby code.

Direct Known Subclasses

PdfFilePath, PdfFileUrl

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input_path, target_path = nil, user_pwd = nil, owner_pwd = nil) ⇒ PdfFile

Returns a new instance of PdfFile.



33
34
35
36
37
38
# File 'lib/pdftohtmlr.rb', line 33

# Stores the constructor arguments on the instance; no validation or I/O
# happens here.
#
# @param input_path  value stored in @path (the PDF handed to pdftohtml)
# @param target_path value stored in @target (defaults to nil)
# @param user_pwd    value stored in @user_pwd (defaults to nil)
# @param owner_pwd   value stored in @owner_pwd (defaults to nil)
def initialize(input_path, target_path = nil, user_pwd = nil, owner_pwd = nil)
  @path, @target, @user_pwd, @owner_pwd =
    input_path, target_path, user_pwd, owner_pwd
end

Instance Attribute Details

#format ⇒ Object (readonly)

Returns the value of attribute format.



31
32
33
# File 'lib/pdftohtmlr.rb', line 31

# Reader for @format, the output-format switch that #convert inserts into the
# pdftohtml command line.
#
# @return [String, nil] nil until something assigns it; in the code visible
#   here only #convert_to_xml and #convert_to_xml_document do so ("-xml")
def format
  @format
end

#owner_pwd ⇒ Object (readonly)

Returns the value of attribute owner_pwd.



30
31
32
# File 'lib/pdftohtmlr.rb', line 30

# Reader for @owner_pwd, the owner password given to the constructor.
#
# @return [Object, nil] the stored password, or nil when none was supplied
def owner_pwd
  @owner_pwd
end

#path ⇒ Object (readonly)

Returns the value of attribute path.



27
28
29
# File 'lib/pdftohtmlr.rb', line 27

# Reader for @path, the input path given to the constructor and later passed
# to pdftohtml by #convert.
#
# @return [Object] the stored input path
def path
  @path
end

#target ⇒ Object (readonly)

Returns the value of attribute target.



28
29
30
# File 'lib/pdftohtmlr.rb', line 28

# Reader for @target, the target path given to the constructor.
#
# @return [Object, nil] the stored target path, or nil when none was supplied
def target
  @target
end

#user_pwd ⇒ Object (readonly)

Returns the value of attribute user_pwd.



29
30
31
# File 'lib/pdftohtmlr.rb', line 29

# Reader for @user_pwd, the user password given to the constructor.
#
# @return [Object, nil] the stored password, or nil when none was supplied
def user_pwd
  @user_pwd
end

Instance Method Details

#convert ⇒ Object

Converts the PDF document to HTML. Returns the output as a String.



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/pdftohtmlr.rb', line 41

# Runs the +pdftohtml+ command-line tool on @path and returns everything the
# tool printed (stdout and stderr merged into one string).
#
# The command is handed to the OS as an argument vector instead of a shell
# string, so the path and passwords reach pdftohtml verbatim: spaces, quotes,
# `$`, `;` and similar characters can neither break the command nor inject
# additional shell commands.
#
# At most one password is sent: @user_pwd when set, otherwise @owner_pwd.
# @format (e.g. "-xml") is split on whitespace and added only when non-empty.
#
# @return [String] the combined output of pdftohtml
# @raise [PDFToHTMLRError] when the output contains "Error:"; the message is
#   the first output line that contains it (or the fixed "May not be a PDF
#   file" text when that specific error is present)
def convert
  command = ["pdftohtml", "-stdout"]
  # nil or "" contributes nothing, which mirrors the old string interpolation.
  command.concat(@format.to_s.split)

  if @user_pwd
    command.push("-upw", @user_pwd.to_s)
  elsif @owner_pwd
    command.push("-opw", @owner_pwd.to_s)
  end
  command << @path

  # Array form bypasses the shell; err: [:child, :out] replaces `2>&1`.
  output = IO.popen(command, err: [:child, :out], &:read)

  if output.include?("Error: May not be a PDF file")
    raise PDFToHTMLRError, "Error: May not be a PDF file (continuing anyway)"
  end

  # Report the line that actually carries the error, not merely the first
  # line of output (which may be an unrelated warning).
  error_line = output.each_line.find { |line| line.include?("Error:") }
  raise PDFToHTMLRError, error_line.chomp if error_line

  output
end

#convert_to_document ⇒ Object

Converts the PDF document to HTML. Returns a Nokogiri::HTML::Document.



65
66
67
# File 'lib/pdftohtmlr.rb', line 65

# Runs #convert and parses its output with Nokogiri's HTML parser.
#
# @return the result of Nokogiri::HTML.parse on the converted markup
# @raise whatever #convert raises
def convert_to_document
  markup = convert
  Nokogiri::HTML.parse(markup)
end

#convert_to_xml ⇒ Object



69
70
71
72
# File 'lib/pdftohtmlr.rb', line 69

# Switches the output format to "-xml" and runs #convert.
#
# NOTE: @format is left set to "-xml" afterwards, so subsequent #convert
# calls on the same instance also use that switch.
#
# @return whatever #convert returns
# @raise whatever #convert raises
def convert_to_xml
  @format = "-xml"
  convert
end

#convert_to_xml_document ⇒ Object



74
75
76
77
# File 'lib/pdftohtmlr.rb', line 74

# Switches the output format to "-xml", runs #convert and parses the result
# with Nokogiri's XML parser.
#
# NOTE: @format is left set to "-xml" afterwards, so subsequent #convert
# calls on the same instance also use that switch.
#
# @return the result of Nokogiri::XML.parse on the converted output
# @raise whatever #convert raises
def convert_to_xml_document
  @format = "-xml"
  xml = convert
  Nokogiri::XML.parse(xml)
end