Class: PDF::Reader

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader.rb,
lib/pdf/reader/lzw.rb,
lib/pdf/reader/cmap.rb,
lib/pdf/reader/font.rb,
lib/pdf/reader/page.rb,
lib/pdf/reader/xref.rb,
lib/pdf/reader/error.rb,
lib/pdf/reader/token.rb,
lib/pdf/reader/buffer.rb,
lib/pdf/reader/filter.rb,
lib/pdf/reader/parser.rb,
lib/pdf/reader/stream.rb,
lib/pdf/reader/encoding.rb,
lib/pdf/reader/reference.rb,
lib/pdf/reader/object_hash.rb,
lib/pdf/reader/form_xobject.rb,
lib/pdf/reader/object_cache.rb,
lib/pdf/reader/object_stream.rb,
lib/pdf/reader/text_receiver.rb,
lib/pdf/reader/pages_strategy.rb,
lib/pdf/reader/print_receiver.rb,
lib/pdf/reader/abstract_strategy.rb,
lib/pdf/reader/metadata_strategy.rb,
lib/pdf/reader/register_receiver.rb,
lib/pdf/reader/page_text_receiver.rb

Overview

Copyright © 2010 James Healy ([email protected])

Defined Under Namespace

Classes: AbstractStrategy, Buffer, CMap, Encoding, Error, Filter, Font, FormXObject, InvalidObjectError, LZW, MalformedPDFError, MetadataStrategy, ObjectCache, ObjectHash, ObjectStream, Page, PageTextReceiver, PagesStrategy, Parser, PrintReceiver, Reference, RegisterReceiver, Stream, TextReceiver, Token, UnsupportedFeatureError, XRef

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input = nil) ⇒ Reader

creates a new document reader for the provided PDF.

input can be an IO-ish object (StringIO, File, etc) containing a PDF or a filename

reader = PDF::Reader.new("somefile.pdf")

File.open("somefile.pdf","rb") do |file|
  reader = PDF::Reader.new(file)
end


105
106
107
108
109
110
111
112
113
# File 'lib/pdf/reader.rb', line 105

# Creates a new document reader for the provided PDF.
#
# input - optional IO-ish object or filename; it is handed unchanged to
#         PDF::Reader::ObjectHash.new. When it is nil (the default) nothing
#         is loaded and no instance variables are set, which is what the
#         deprecated class-level helpers rely on when they call +new+ with
#         no arguments.
#
# NOTE(review): in this listing the right-hand side of the @metadata
# assignment was missing, leaving the method syntactically invalid. It is
# restored here as a call to get_metadata, mirroring get_page_count; confirm
# the helper name against lib/pdf/reader.rb.
def initialize(input = nil)
  return unless input # support the deprecated Reader API

  @objects     = PDF::Reader::ObjectHash.new(input)
  @page_count  = get_page_count
  @pdf_version = @objects.pdf_version
  # The trailer's :Info entry is resolved through the object hash.
  @info        = @objects.deref(@objects.trailer[:Info])
  @metadata    = get_metadata
end

Instance Attribute Details

#infoObject (readonly)

Returns the value of attribute info.



92
93
94
# File 'lib/pdf/reader.rb', line 92

# Read-only accessor: hands back whatever is currently stored in @info
# (nil when it has never been assigned).
def info; @info; end

#metadataObject (readonly)

Returns the value of attribute metadata.



92
93
94
# File 'lib/pdf/reader.rb', line 92

# Returns the value of attribute metadata, i.e. whatever is currently stored
# in @metadata (nil when it has never been assigned).
def metadata
  @metadata
end

#objectsObject (readonly)

lowlevel hash-like access to all objects in the underlying PDF



90
91
92
# File 'lib/pdf/reader.rb', line 90

# Read-only accessor for the low-level, hash-like object store kept in
# @objects (nil when it has never been assigned).
def objects; @objects; end

#page_countObject (readonly)

Returns the value of attribute page_count.



92
93
94
# File 'lib/pdf/reader.rb', line 92

# Read-only accessor: hands back whatever is currently stored in @page_count
# (nil when it has never been assigned).
def page_count; @page_count; end

#pdf_versionObject (readonly)

Returns the value of attribute pdf_version.



92
93
94
# File 'lib/pdf/reader.rb', line 92

# Read-only accessor: hands back whatever is currently stored in @pdf_version
# (nil when it has never been assigned).
def pdf_version; @pdf_version; end

Class Method Details

.file(name, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 0.11.0 and will

eventually be removed

Parse the file with the given name, sending events to the given receiver.



132
133
134
135
136
# File 'lib/pdf/reader.rb', line 132

# DEPRECATED: deprecated in version 0.11.0 and will eventually be removed.
#
# Opens the file called +name+ in binary read mode, builds a fresh reader with
# +new+ and forwards the open handle plus +receivers+ and +opts+ to its #parse.
# The handle is closed when the block exits; the result of #parse is returned.
def self.file(name, receivers, opts = {})
  File.open(name, "rb") { |io| new.parse(io, receivers, opts) }
end

.object_file(name, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 0.11.0 and will

eventually be removed

Parse the file with the given name, returning an unmarshalled Ruby representation of the requested PDF object.



155
156
157
158
159
# File 'lib/pdf/reader.rb', line 155

# DEPRECATED: deprecated in version 0.11.0 and will eventually be removed.
#
# Opens the file called +name+ in binary read mode and asks a fresh reader
# (built with +new+) for a single object via #object. +id+ and +gen+ are
# coerced with to_i before being passed on; +gen+ defaults to 0.
def self.object_file(name, id, gen = 0)
  File.open(name, "rb") do |io|
    reader = new
    reader.object(io, id.to_i, gen.to_i)
  end
end

.object_string(str, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 0.11.0 and will

eventually be removed

Parse the given string, returning an unmarshalled Ruby representation of the requested PDF object.



167
168
169
170
171
# File 'lib/pdf/reader.rb', line 167

# DEPRECATED: deprecated in version 0.11.0 and will eventually be removed.
#
# Wraps +str+ in a StringIO and asks a fresh reader (built with +new+) for a
# single object via #object. +id+ and +gen+ are coerced with to_i before being
# passed on; +gen+ defaults to 0.
def self.object_string(str, id, gen = 0)
  StringIO.open(str) do |io|
    reader = new
    reader.object(io, id.to_i, gen.to_i)
  end
end

.open(input) {|PDF::Reader.new(input)| ... } ⇒ Object

syntactic sugar for opening a PDF file. Accepts the same arguments as new().

PDF::Reader.open("somefile.pdf") do |reader|
  puts reader.pdf_version
end

Yields:



122
123
124
# File 'lib/pdf/reader.rb', line 122

# Syntactic sugar for opening a PDF: builds a PDF::Reader from +input+ (same
# argument as new) and yields it to the caller's block. The block's value is
# the return value.
def self.open(input, &block)
  reader = PDF::Reader.new(input)
  yield reader
end

.string(str, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 0.11.0 and will

eventually be removed

Parse the given string, sending events to the given receiver.



143
144
145
146
147
# File 'lib/pdf/reader.rb', line 143

# DEPRECATED: deprecated in version 0.11.0 and will eventually be removed.
#
# Wraps +str+ in a StringIO, builds a fresh reader with +new+ and forwards the
# stream plus +receivers+ and +opts+ to its #parse. The result of #parse is
# returned.
def self.string(str, receivers, opts = {})
  StringIO.open(str) { |io| new.parse(io, receivers, opts) }
end

Instance Method Details

#object(io, id, gen) ⇒ Object

DEPRECATED: this method was deprecated in version 0.11.0 and will

eventually be removed

Given an IO object that contains PDF data, return the contents of a single object



239
240
241
242
243
# File 'lib/pdf/reader.rb', line 239

# DEPRECATED: deprecated in version 0.11.0 and will eventually be removed.
#
# Given an IO object containing PDF data, returns the contents of one object.
# A new ObjectHash built from +io+ replaces @objects, then the reference
# (+id+, +gen+) is resolved through it with deref.
def object(io, id, gen)
  @objects = ObjectHash.new(io)
  wanted   = Reference.new(id, gen)
  @objects.deref(wanted)
end

#page(num) ⇒ Object

returns a single PDF::Reader::Page for the specified page. Use this instead of pages method when you need to access just a single page

reader = PDF::Reader.new("somefile.pdf")
page   = reader.page(10)

puts page.text

See the docs for PDF::Reader::Page to read more about the methods available on each page

Raises:

  • (ArgumentError)


205
206
207
208
209
# File 'lib/pdf/reader.rb', line 205

# Returns a single PDF::Reader::Page for the requested page number.
#
# num - page number; coerced with to_i, so numeric strings are accepted.
#
# Raises ArgumentError when the coerced number is below 1 or above @page_count.
def page(num)
  page_number = num.to_i
  unless page_number >= 1 && page_number <= @page_count
    raise ArgumentError, "valid pages are 1 .. #{@page_count}"
  end

  PDF::Reader::Page.new(@objects, page_number)
end

#pagesObject

returns an array of PDF::Reader::Page objects, one for each page in the source PDF.

reader = PDF::Reader.new("somefile.pdf")

reader.pages.each do |page|
  puts page.fonts
  puts page.images
  puts page.text
end

See the docs for PDF::Reader::Page to read more about the methods available on each page



187
188
189
190
191
# File 'lib/pdf/reader.rb', line 187

# Returns an array of PDF::Reader::Page objects, one for every page number
# from 1 up to @page_count, each built from @objects.
def pages
  page_numbers = 1..@page_count
  page_numbers.map do |page_number|
    PDF::Reader::Page.new(@objects, page_number)
  end
end

#parse(io, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 0.11.0 and will

eventually be removed

Given an IO object that contains PDF data, parse it.



217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# File 'lib/pdf/reader.rb', line 217

# DEPRECATED: deprecated in version 0.11.0 and will eventually be removed.
#
# Given an IO object that contains PDF data, parse it.
#
# io        - stream handed to ObjectHash.new.
# receivers - passed through untouched to every strategy.
# opts      - overrides for the default options
#             (:pages => true, :raw_text => false, :metadata => true).
#
# Raises PDF::Reader::UnsupportedFeatureError when the trailer has a truthy
# :Encrypt entry. Returns self.
def parse(io, receivers, opts = {})
  object_hash = ObjectHash.new(io)

  encrypted = object_hash.trailer[:Encrypt]
  raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if encrypted

  defaults = {:pages => true, :raw_text => false, :metadata => true}
  options  = defaults.merge(opts)

  # `strategies` is defined outside this listing; each element is instantiated
  # with (object hash, receivers, options) and then asked to process.
  strategies.each { |strategy| strategy.new(object_hash, receivers, options).process }

  self
end