Class: PDF::Reader

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader.rb,
lib/pdf/reader/lzw.rb,
lib/pdf/reader/cmap.rb,
lib/pdf/reader/font.rb,
lib/pdf/reader/page.rb,
lib/pdf/reader/xref.rb,
lib/pdf/reader/error.rb,
lib/pdf/reader/token.rb,
lib/pdf/reader/buffer.rb,
lib/pdf/reader/filter.rb,
lib/pdf/reader/parser.rb,
lib/pdf/reader/stream.rb,
lib/pdf/reader/encoding.rb,
lib/pdf/reader/reference.rb,
lib/pdf/reader/filter/lzw.rb,
lib/pdf/reader/glyph_hash.rb,
lib/pdf/reader/page_state.rb,
lib/pdf/reader/filter/null.rb,
lib/pdf/reader/object_hash.rb,
lib/pdf/reader/filter/flate.rb,
lib/pdf/reader/form_xobject.rb,
lib/pdf/reader/object_cache.rb,
lib/pdf/reader/object_stream.rb,
lib/pdf/reader/text_receiver.rb,
lib/pdf/reader/filter/ascii85.rb,
lib/pdf/reader/pages_strategy.rb,
lib/pdf/reader/print_receiver.rb,
lib/pdf/reader/filter/ascii_hex.rb,
lib/pdf/reader/filter/depredict.rb,
lib/pdf/reader/resource_methods.rb,
lib/pdf/reader/abstract_strategy.rb,
lib/pdf/reader/filter/run_length.rb,
lib/pdf/reader/metadata_strategy.rb,
lib/pdf/reader/register_receiver.rb,
lib/pdf/reader/page_text_receiver.rb,
lib/pdf/reader/standard_security_handler.rb

Overview

Copyright © 2010 James Healy ([email protected])

Defined Under Namespace

Modules: Filter, ResourceMethods Classes: AbstractStrategy, Buffer, CMap, Encoding, EncryptedPDFError, Error, Font, FormXObject, GlyphHash, InvalidObjectError, LZW, MalformedPDFError, MetadataStrategy, ObjectCache, ObjectHash, ObjectStream, Page, PageState, PageTextReceiver, PagesStrategy, Parser, PrintReceiver, Reference, RegisterReceiver, StandardSecurityHandler, Stream, TextReceiver, Token, UnsupportedFeatureError, XRef

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input = nil, opts = {}) ⇒ Reader

creates a new document reader for the provided PDF.

input can be an IO-ish object (StringIO, File, etc) containing a PDF or a filename

reader = PDF::Reader.new("somefile.pdf")

File.open("somefile.pdf","rb") do |file|
  reader = PDF::Reader.new(file)
end

If the source file is encrypted you can provide a password for decrypting

reader = PDF::Reader.new("somefile.pdf", :password => "apples")


111
112
113
114
115
116
117
# File 'lib/pdf/reader.rb', line 111

# Creates a new document reader for the provided PDF.
#
# input - a filename or an IO-ish object (StringIO, File, etc) containing a
#         PDF. When nil (the deprecated Reader API) nothing is loaded and
#         no instance state is set up.
# opts  - options hash passed on to PDF::Reader::ObjectHash (for example
#         :password for an encrypted file). A :cache entry pointing at this
#         reader's ObjectCache is added to a copy of the hash.
def initialize(input = nil, opts = {})
  return unless input # support the deprecated Reader API

  @cache = PDF::Reader::ObjectCache.new
  # Use merge rather than merge! so the caller's hash is never modified;
  # this also allows a frozen options hash to be passed in.
  @objects = PDF::Reader::ObjectHash.new(input, opts.merge(:cache => @cache))
end

Instance Attribute Details

#objects ⇒ Object (readonly)

lowlevel hash-like access to all objects in the underlying PDF



94
95
96
# File 'lib/pdf/reader.rb', line 94

# Read-only accessor for @objects, the low-level hash-like view of all
# objects in the underlying PDF.
#
# Returns whatever is currently stored in @objects; this is nil when the
# reader was constructed without an input and nothing has assigned it since.
def objects
  @objects
end

Class Method Details

.file(name, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Parse the file with the given name, sending events to the given receiver.



167
168
169
170
171
# File 'lib/pdf/reader.rb', line 167

# DEPRECATED: deprecated in version 1.0.0 and will eventually be removed.
#
# Opens the named file in binary read mode and passes the open handle,
# together with receivers and opts, to #parse on a fresh instance. The
# handle is closed when the block exits.
#
# Returns the value returned by #parse.
def self.file(name, receivers, opts = {})
  File.open(name, "rb") { |io| new.parse(io, receivers, opts) }
end

.object_file(name, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Parse the file with the given name, returning an unmarshalled Ruby representation of the requested PDF object.



190
191
192
193
194
# File 'lib/pdf/reader.rb', line 190

# DEPRECATED: deprecated in version 1.0.0 and will eventually be removed.
#
# Opens the named file in binary read mode and asks a fresh instance for a
# single object via #object. id and gen are coerced with to_i before being
# passed on; gen defaults to 0.
#
# Returns the value returned by #object.
def self.object_file(name, id, gen = 0)
  File.open(name, "rb") do |io|
    new.object(io, id.to_i, gen.to_i)
  end
end

.object_string(str, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Parse the given string, returning an unmarshalled Ruby representation of the requested PDF object.



202
203
204
205
206
# File 'lib/pdf/reader.rb', line 202

# DEPRECATED: deprecated in version 1.0.0 and will eventually be removed.
#
# Wraps str in a StringIO and asks a fresh instance for a single object via
# #object. id and gen are coerced with to_i before being passed on; gen
# defaults to 0.
#
# Returns the value returned by #object.
def self.object_string(str, id, gen = 0)
  StringIO.open(str) do |io|
    new.object(io, id.to_i, gen.to_i)
  end
end

.open(input, opts = {}) {|PDF::Reader.new(input, opts)| ... } ⇒ Object

syntactic sugar for opening a PDF file. Accepts the same arguments as new().

PDF::Reader.open("somefile.pdf") do |reader|
  puts reader.pdf_version
end

or

PDF::Reader.open("somefile.pdf", :password => "apples") do |reader|
  puts reader.pdf_version
end

Yields:



157
158
159
# File 'lib/pdf/reader.rb', line 157

# Syntactic sugar for opening a PDF: builds a PDF::Reader from input and
# opts (the same arguments accepted by new) and yields it to the block.
#
# Returns the value of the block.
def self.open(input, opts = {}, &block)
  reader = PDF::Reader.new(input, opts)
  yield reader
end

.string(str, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Parse the given string, sending events to the given receiver.



178
179
180
181
182
# File 'lib/pdf/reader.rb', line 178

# DEPRECATED: deprecated in version 1.0.0 and will eventually be removed.
#
# Wraps str in a StringIO and passes it, together with receivers and opts,
# to #parse on a fresh instance.
#
# Returns the value returned by #parse.
def self.string(str, receivers, opts = {})
  StringIO.open(str) { |io| new.parse(io, receivers, opts) }
end

Instance Method Details

#info ⇒ Object



119
120
121
122
# File 'lib/pdf/reader.rb', line 119

# Dereferences the :Info entry of the trailer and returns the result after
# passing it through doc_strings_to_utf8.
def info
  doc_strings_to_utf8(@objects.deref(@objects.trailer[:Info]))
end

#metadata ⇒ Object



124
125
126
127
128
129
130
131
132
133
# File 'lib/pdf/reader.rb', line 124

# Dereferences root[:Metadata] and returns the stream's unfiltered data.
#
# Returns nil when the dereferenced stream is nil. Otherwise returns the
# unfiltered data, tagged in place as UTF-8 when it responds to
# force_encoding (the bytes themselves are not transcoded).
def metadata
  stream = @objects.deref(root[:Metadata])
  return nil if stream.nil?

  xml = stream.unfiltered_data
  xml.force_encoding("utf-8") if xml.respond_to?(:force_encoding)
  xml
end

#object(io, id, gen) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Given an IO object that contains PDF data, return the contents of a single object



270
271
272
273
274
# File 'lib/pdf/reader.rb', line 270

# DEPRECATED: deprecated in version 1.0.0 and will eventually be removed.
#
# Builds an ObjectHash from io, stores it in @objects (replacing any
# previous value), and dereferences the object identified by id and gen.
#
# Returns the result of the dereference.
def object(io, id, gen)
  @objects = ObjectHash.new(io)
  reference = Reference.new(id, gen)
  @objects.deref(reference)
end

#page(num) ⇒ Object

Returns a single PDF::Reader::Page for the specified page. Use this instead of the pages method when you need to access just a single page.

reader = PDF::Reader.new("somefile.pdf")
page   = reader.page(10)

puts page.text

See the docs for PDF::Reader::Page to read more about the methods available on each page

Raises:

  • (ArgumentError)


240
241
242
243
244
# File 'lib/pdf/reader.rb', line 240

# Returns a single PDF::Reader::Page for the requested page number.
#
# num is coerced with to_i. Valid page numbers run from 1 to page_count
# inclusive; anything outside that range raises ArgumentError.
def page(num)
  page_number = num.to_i
  if page_number < 1 || page_number > page_count
    raise ArgumentError, "valid pages are 1 .. #{page_count}"
  end

  PDF::Reader::Page.new(@objects, page_number, :cache => @cache)
end

#page_count ⇒ Object



135
136
137
138
# File 'lib/pdf/reader.rb', line 135

# Returns the dereferenced :Count entry of the dictionary that root[:Pages]
# points at.
#
# The result is memoised in @page_count. The Pages lookup happens inside
# the memoised expression so that repeat calls do not dereference the
# Pages dictionary again just to discard it.
def page_count
  @page_count ||= begin
    pages = @objects.deref(root[:Pages])
    @objects.deref(pages[:Count])
  end
end

#pages ⇒ Object

returns an array of PDF::Reader::Page objects, one for each page in the source PDF.

reader = PDF::Reader.new("somefile.pdf")

reader.pages.each do |page|
  puts page.fonts
  puts page.images
  puts page.text
end

See the docs for PDF::Reader::Page to read more about the methods available on each page



222
223
224
225
226
# File 'lib/pdf/reader.rb', line 222

# Returns an array of PDF::Reader::Page objects, one for every page number
# from 1 up to page_count, each built with this reader's @objects and
# sharing its @cache.
def pages
  page_numbers = 1..page_count
  page_numbers.map do |page_number|
    PDF::Reader::Page.new(@objects, page_number, :cache => @cache)
  end
end

#parse(io, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Given an IO object that contains PDF data, parse it.



252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/pdf/reader.rb', line 252

# DEPRECATED: deprecated in version 1.0.0 and will eventually be removed.
#
# Builds an ObjectHash from io, then instantiates each class returned by
# strategies with that hash, the receivers and the effective options, and
# calls process on it.
#
# opts is layered over the defaults :pages => true, :raw_text => false and
# :metadata => true; the caller's hash is not modified.
#
# Returns self.
def parse(io, receivers, opts = {})
  object_hash = ObjectHash.new(io)
  settings    = {:pages => true, :raw_text => false, :metadata => true}.merge(opts)

  strategies.each { |strategy| strategy.new(object_hash, receivers, settings).process }

  self
end

#pdf_version ⇒ Object



140
141
142
# File 'lib/pdf/reader.rb', line 140

# Delegates to @objects.pdf_version and returns its result unchanged.
#
# NOTE(review): presumably the version declared by the PDF file itself;
# confirm against the pdf_version implementation on the @objects class.
def pdf_version
  @objects.pdf_version
end