Class: PDF::Reader

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader.rb,
lib/pdf/reader/lzw.rb,
lib/pdf/reader/cmap.rb,
lib/pdf/reader/font.rb,
lib/pdf/reader/page.rb,
lib/pdf/reader/xref.rb,
lib/pdf/reader/error.rb,
lib/pdf/reader/token.rb,
lib/pdf/reader/buffer.rb,
lib/pdf/reader/filter.rb,
lib/pdf/reader/parser.rb,
lib/pdf/reader/stream.rb,
lib/pdf/reader/encoding.rb,
lib/pdf/reader/text_run.rb,
lib/pdf/reader/reference.rb,
lib/pdf/reader/filter/lzw.rb,
lib/pdf/reader/glyph_hash.rb,
lib/pdf/reader/filter/null.rb,
lib/pdf/reader/object_hash.rb,
lib/pdf/reader/page_layout.rb,
lib/pdf/reader/filter/flate.rb,
lib/pdf/reader/form_xobject.rb,
lib/pdf/reader/object_cache.rb,
lib/pdf/reader/object_stream.rb,
lib/pdf/reader/page_receiver.rb,
lib/pdf/reader/text_receiver.rb,
lib/pdf/reader/filter/ascii85.rb,
lib/pdf/reader/pages_strategy.rb,
lib/pdf/reader/print_receiver.rb,
lib/pdf/reader/font_descriptor.rb,
lib/pdf/reader/filter/ascii_hex.rb,
lib/pdf/reader/filter/depredict.rb,
lib/pdf/reader/resource_methods.rb,
lib/pdf/reader/abstract_strategy.rb,
lib/pdf/reader/filter/run_length.rb,
lib/pdf/reader/metadata_strategy.rb,
lib/pdf/reader/register_receiver.rb,
lib/pdf/reader/page_text_receiver.rb,
lib/pdf/reader/synchronized_cache.rb,
lib/pdf/reader/transformation_matrix.rb,
lib/pdf/reader/standard_security_handler.rb,
lib/pdf/reader/width_calculator/built_in.rb,
lib/pdf/reader/width_calculator/composite.rb,
lib/pdf/reader/width_calculator/true_type.rb,
lib/pdf/reader/width_calculator/type_zero.rb,
lib/pdf/reader/width_calculator/type_one_or_three.rb

Overview

Copyright © 2010 James Healy ([email protected])

Defined Under Namespace

Modules: Filter, ResourceMethods, WidthCalculator Classes: AbstractStrategy, Buffer, CMap, Encoding, EncryptedPDFError, Error, Font, FontDescriptor, FormXObject, GlyphHash, InvalidObjectError, LZW, MalformedPDFError, MetadataStrategy, ObjectCache, ObjectHash, ObjectStream, Page, PageLayout, PageReceiver, PageTextReceiver, PagesStrategy, Parser, PrintReceiver, Reference, RegisterReceiver, StandardSecurityHandler, Stream, SynchronizedCache, TextReceiver, TextRun, Token, TransformationMatrix, UnsupportedFeatureError, XRef

Constant Summary collapse

STANDARD_ENCODING_TO_NAME =

Definition of the StandardEncoding taken from the Adobe website

{
  32 => :space, 33 => :exclam, 34 => :quotedbl, 35 => :numbersign, 36 => :dollar,
  37 => :percent, 38 => :ampersand, 39 => :quoteright, 40 => :parenleft, 41 => :parenright,
  42 => :asterisk, 43 => :plus, 44 => :comma, 45 => :hyphen, 46 => :period, 47 => :slash,
  48 => :zero, 49 => :one, 50 => :two, 51 => :three, 52 => :four, 53 => :five, 54 => :six,
  55 => :seven, 56 => :eight, 57 => :nine, 58 => :colon, 59 => :semicolon, 60 => :less,
  61 => :equal, 62 => :greater, 63 => :question, 64 => :at, 65 => :A, 66 => :B, 67 => :C,
  68 => :D, 69 => :E, 70 => :F, 71 => :G, 72 => :H, 73 => :I, 74 => :J, 75 => :K, 76 => :L,
  77 => :M, 78 => :N, 79 => :O, 80 => :P, 81 => :Q, 82 => :R, 83 => :S, 84 => :T, 85 => :U,
  86 => :V, 87 => :W, 88 => :X, 89 => :Y, 90 => :Z, 91 => :bracketleft, 92 => :backslash,
  93 => :bracketright, 94 => :asciicircum, 95 => :underscore, 96 => :quoteleft, 97 => :a,
  98 => :b, 99 => :c, 100 => :d, 101 => :e, 102 => :f, 103 => :g, 104 => :h, 105 => :i,
  106 => :j, 107 => :k, 108 => :l, 109 => :m, 110 => :n, 111 => :o, 112 => :p, 113 => :q,
  114 => :r, 115 => :s, 116 => :t, 117 => :u, 118 => :v, 119 => :w, 120 => :x, 121 => :y,
  122 => :z, 123 => :braceleft, 124 => :bar, 125 => :braceright, 126 => :asciitilde,
  161 => :exclamdown, 162 => :cent, 163 => :sterling, 164 => :fraction, 165 => :yen,
  166 => :florin, 167 => :section, 168 => :currency, 169 => :quotesingle, 170 => :quotedblleft,
  171 => :guillemotleft, 172 => :guilsinglleft, 173 => :guilsinglright, 174 => :fi, 175 => :fl,
  177 => :endash, 178 => :dagger, 179 => :daggerdbl, 180 => :periodcentered, 182 => :paragraph,
  183 => :bullet, 184 => :quotesinglbase, 185 => :quotedblbase, 186 => :quotedblright,
  187 => :guillemotright, 188 => :ellipsis, 189 => :perthousand, 191 => :questiondown,
  193 => :grave, 194 => :acute, 195 => :circumflex, 196 => :tilde, 197 => :macron, 198 => :breve,
  199 => :dotaccent, 200 => :dieresis, 202 => :ring, 203 => :cedilla, 205 => :hungarumlaut,
  206 => :ogonek, 207 => :caron, 208 => :emdash, 225 => :AE, 227 => :ordfeminine, 232 => :Lslash,
  233 => :Oslash, 234 => :OE, 235 => :ordmasculine, 241 => :ae, 245 => :dotlessi, 248 => :lslash,
  249 => :oslash, 250 => :oe, 251 => :germandbls,
}
MAC_ROMAN_ENCODING_TO_NAME =
{
  32 => :space, 33 => :exclam, 34 => :quotedbl, 35 => :numbersign, 36 => :dollar,
  37 => :percent, 38 => :ampersand, 39 => :quotesingle, 40 => :parenleft, 41 => :parenright,
  42 => :asterisk, 43 => :plus, 44 => :comma, 45 => :hyphen, 46 => :period,
  47 => :slash, 48 => :zero, 49 => :one, 50 => :two, 51 => :three, 52 => :four,
  53 => :five, 54 => :six, 55 => :seven, 56 => :eight, 57 => :nine, 58 => :colon,
  59 => :semicolon, 60 => :less, 61 => :equal, 62 => :greater, 63 => :question,
  64 => :at, 65 => :A, 66 => :B, 67 => :C, 68 => :D, 69 => :E, 70 => :F, 71 => :G,
  72 => :H, 73 => :I, 74 => :J, 75 => :K, 76 => :L, 77 => :M, 78 => :N, 79 => :O,
  80 => :P, 81 => :Q, 82 => :R, 83 => :S, 84 => :T, 85 => :U, 86 => :V, 87 => :W,
  88 => :X, 89 => :Y, 90 => :Z, 91 => :bracketleft, 92 => :backslash, 93 => :bracketright,
  94 => :asciicircum, 95 => :underscore, 96 => :grave, 97 => :a, 98 => :b, 99 => :c,
  100 => :d, 101 => :e, 102 => :f, 103 => :g, 104 => :h, 105 => :i, 106 => :j,
  107 => :k, 108 => :l, 109 => :m, 110 => :n, 111 => :o, 112 => :p, 113 => :q, 114 => :r,
  115 => :s, 116 => :t, 117 => :u, 118 => :v, 119 => :w, 120 => :x, 121 => :y, 122 => :z,
  123 => :braceleft, 124 => :bar, 125 => :braceright, 126 => :asciitilde,
  128 => :Adieresis, 129 => :Aring, 130 => :Ccedilla, 131 => :Eacute, 132 => :Ntilde,
  133 => :Odieresis, 134 => :Udieresis, 135 => :aacute, 136 => :agrave, 137 => :acircumflex,
  138 => :adieresis, 139 => :atilde, 140 => :aring, 141 => :ccedilla, 142 => :eacute,
  143 => :egrave, 144 => :ecircumflex, 145 => :edieresis, 146 => :iacute, 147 => :igrave,
  148 => :icircumflex, 149 => :idieresis, 150 => :ntilde, 151 => :oacute, 152 => :ograve,
  153 => :ocircumflex, 154 => :odieresis, 155 => :otilde, 156 => :uacute, 157 => :ugrave,
  158 => :ucircumflex, 159 => :udieresis, 160 => :dagger, 161 => :degree, 162 => :cent,
  163 => :sterling, 164 => :section, 165 => :bullet, 166 => :paragraph, 167 => :germandbls,
  168 => :registered, 169 => :copyright, 170 => :trademark, 171 => :acute, 172 => :dieresis,
  173 => :notequal, 174 => :AE, 175 => :Oslash, 176 => :infinity, 177 => :plusminus,
  178 => :lessequal, 179 => :greaterequal, 180 => :yen, 181 => :mu, 182 => :partialdiff,
  183 => :summation, 184 => :product, 185 => :pi, 186 => :integral, 187 => :ordfeminine,
  188 => :ordmasculine, 189 => :Omega, 190 => :ae, 191 => :oslash, 192 => :questiondown,
  193 => :exclamdown, 194 => :logicalnot, 195 => :radical, 196 => :florin, 197 => :approxequal,
  198 => :Delta, 199 => :guillemotleft, 200 => :guillemotright, 201 => :ellipsis,
  202 => :nobreakspace, 203 => :Agrave, 204 => :Atilde, 205 => :Otilde, 206 => :OE,
  207 => :oe, 208 => :endash, 209 => :emdash, 210 => :quotedblleft, 211 => :quotedblright,
  212 => :quoteleft, 213 => :quoteright, 214 => :divide, 215 => :lozenge, 216 => :ydieresis,
  217 => :Ydieresis, 218 => :fraction, 219 => :currency, 220 => :guilsinglleft,
  221 => :guilsinglright, 222 => :fi, 223 => :fl, 224 => :daggerdbl, 225 => :periodcentered,
  226 => :quotesinglbase, 227 => :quotedblbase, 228 => :perthousane, 229 => :Acircumflex,
  230 => :Ecircumflex, 231 => :Aacute, 232 => :Edieresis, 233 => :Egrave, 234 => :Iacute,
  235 => :Icircumflex, 236 => :Idieresis, 237 => :Igrave, 238 => :Oacute, 239 => :Ocircumflex,
  240 => :apple, 241 => :Ograve, 242 => :Uacute, 243 => :Ucircumflex, 244 => :Ugrave,
  245 => :dotlessi, 246 => :circumflex, 247 => :tilde, 248 => :macron, 249 => :breve,
  250 => :dotaccent, 251 => :ring, 252 => :cedilla, 253 => :hungarumlaut, 254 => :ogonek, 255 => :caron, 
}
WIN_ANSI_ENCODING_TO_NAME =

NOTE: the windows encoding has some additional key/value pairs which are not included in the following hash:

nbspace             160/a0   <- same as no break space
sfthyphen           173/ad   <- same as hyphen
middot              183/b7   <- same as period center
{
  :space => 32, :exclam => 33, :quotedbl => 34, :numbersign => 35, :dollar => 36, :percent => 37,
  :ampersand => 38, :quotesingle => 39, :parenleft => 40, :parenright => 41, :asterisk => 42, :plus => 43,
  :comma => 44, :hyphen => 45, :period => 46, :slash => 47, :zero => 48, :one => 49,
  :two => 50, :three => 51, :four => 52, :five => 53, :six => 54, :seven => 55,
  :eight => 56, :nine => 57, :colon => 58, :semicolon => 59, :less => 60, :equal => 61,
  :greater => 62, :question => 63, :at => 64, :A => 65, :B => 66, :C => 67,
  :D => 68, :E => 69, :F => 70, :G => 71, :H => 72, :I => 73,
  :J => 74, :K => 75, :L => 76, :M => 77, :N => 78, :O => 79,
  :P => 80, :Q => 81, :R => 82, :S => 83, :T => 84, :U => 85,
  :V => 86, :W => 87, :X => 88, :Y => 89, :Z => 90, :bracketleft => 91,
  :backslash => 92, :bracketright => 93, :asciicircum => 94, :underscore => 95, :grave => 96, :a => 97,
  :b => 98, :c => 99, :d => 100, :e => 101, :f => 102, :g => 103,
  :h => 104, :i => 105, :j => 106, :k => 107, :l => 108, :m => 109,
  :n => 110, :o => 111, :p => 112, :q => 113, :r => 114, :s => 115,
  :t => 116, :u => 117, :v => 118, :w => 119, :x => 120, :y => 121,
  :z => 122, :braceleft => 123, :bar => 124, :braceright => 125, :asciitilde => 126, :Adieresis => 196,
  :Aring => 197, :Ccedilla => 199, :Eacute => 201, :Ntilde => 209, :Odieresis => 214, :Udieresis => 220,
  :aacute => 225, :agrave => 224, :acircumflex => 226, :adieresis => 228, :atilde => 227, :aring => 229,
  :ccedilla => 231, :eacute => 233, :egrave => 232, :ecircumflex => 234, :edieresis => 235, :iacute => 237,
  :igrave => 236, :icircumflex => 238, :idieresis => 239, :ntilde => 241, :oacute => 243, :ograve => 242,
  :ocircumflex => 244, :odieresis => 246, :otilde => 245, :uacute => 250, :ugrave => 249, :ucircumflex => 251,
  :udieresis => 252, :dagger => 134, :degree => 176, :cent => 162, :sterling => 163, :section => 167,
  :bullet => 149, :paragraph => 182, :germandbls => 223, :registered => 174, :copyright => 169, :trademark => 153,
  :acute => 180, :dieresis => 168, :AE => 198, :Oslash => 216, :plusminus => 177, :yen => 165,
  :mu => 181, :ordfeminine => 170, :ordmasculine => 186, :ae => 230, :oslash => 248, :questiondown => 191,
  :exclamdown => 161, :logicalnot => 172, :florin => 131, :guillemotleft => 171, :guillemotright => 187, :ellipsis => 133,
  :nobreakspace => 160, :Agrave => 192, :Atilde => 195, :Otilde => 213, :OE => 140, :oe => 156,
  :endash => 150, :emdash => 151, :quotedblleft => 147, :quotedblright => 148, :quoteleft => 145, :quoteright => 146,
  :divide => 247, :ydieresis => 255, :Ydieresis => 159, :currency => 164, :guilsinglleft => 139, :guilsinglright => 155,
  :daggerdbl => 135, :periodcentered => 183, :quotesinglbase => 130, :quotedblbase => 132, :perthousane => 137, :Acircumflex => 194,
  :Ecircumflex => 202, :Aacute => 193, :Edieresis => 203, :Egrave => 200, :Iacute => 205, :Icircumflex => 206,
  :Idieresis => 207, :Igrave => 204, :Oacute => 211, :Ocircumflex => 212, :Ograve => 210, :Uacute => 218,
  :Ucircumflex => 219, :Ugrave => 217, :circumflex => 136, :tilde => 152, :macron => 175, :cedilla => 184,
  :Scaron => 138, :scaron => 154, :brokenbar => 166, :Eth => 208, :eth => 240, :Yacute => 221,
  :yacute => 253, :Thorn => 222, :thorn => 254, :multiply => 215, :onesuperior => 185, :twosuperior => 178,
  :threesuperior => 179, :onehalf => 189, :onequarter => 188, :threequarters => 190,
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input = nil, opts = {}) ⇒ Reader

creates a new document reader for the provided PDF.

input can be an IO-ish object (StringIO, File, etc) containing a PDF or a filename

reader = PDF::Reader.new("somefile.pdf")

File.open("somefile.pdf","rb") do |file|
  reader = PDF::Reader.new(file)
end

If the source file is encrypted you can provide a password for decrypting

reader = PDF::Reader.new("somefile.pdf", :password => "apples")


113
114
115
116
117
118
119
# File 'lib/pdf/reader.rb', line 113

# Creates a new document reader for the provided PDF.
#
# input - the PDF source, handed unchanged to PDF::Reader::ObjectHash.new.
#         When nil (the deprecated Reader API builds readers without an
#         input) no cache or object hash is created.
# opts  - options forwarded to PDF::Reader::ObjectHash.new with a :cache
#         entry added. The caller's hash is left untouched.
def initialize(input = nil, opts = {})
  return unless input # support the deprecated Reader API

  @cache = PDF::Reader::ObjectCache.new
  # Use the non-destructive merge: merge! would leak the :cache key into the
  # caller's hash and raise if that hash is frozen.
  @objects = PDF::Reader::ObjectHash.new(input, opts.merge(:cache => @cache))
end

Instance Attribute Details

#objects ⇒ Object (readonly)

lowlevel hash-like access to all objects in the underlying PDF



96
97
98
# File 'lib/pdf/reader.rb', line 96

# Read-only accessor giving low-level, hash-like access to all objects in the
# underlying PDF. Returns whatever is currently held in @objects (nil if no
# object store has been assigned yet).
def objects
  @objects
end

Class Method Details

.file(name, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Parse the file with the given name, sending events to the given receiver.



169
170
171
172
173
# File 'lib/pdf/reader.rb', line 169

# DEPRECATED since 1.0.0.
#
# Opens the file called +name+ in binary read mode and hands the open handle,
# together with +receivers+ and +opts+, to #parse on a freshly built reader.
# Returns whatever #parse returns; the file is closed when the block exits.
def self.file(name, receivers, opts = {})
  File.open(name, "rb") { |io| new.parse(io, receivers, opts) }
end

.object_file(name, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Parse the file with the given name, returning an unmarshalled Ruby representation of the requested PDF object



192
193
194
195
196
# File 'lib/pdf/reader.rb', line 192

# DEPRECATED since 1.0.0.
#
# Opens the file called +name+ in binary read mode and asks a freshly built
# reader for a single object. +id+ and +gen+ are coerced with #to_i before
# being passed on. Returns whatever #object returns.
def self.object_file(name, id, gen = 0)
  File.open(name, "rb") do |io|
    new.object(io, id.to_i, gen.to_i)
  end
end

.object_string(str, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Parse the given string, returning an unmarshalled Ruby representation of the requested PDF object



204
205
206
207
208
# File 'lib/pdf/reader.rb', line 204

# DEPRECATED since 1.0.0.
#
# Wraps +str+ in a StringIO and asks a freshly built reader for a single
# object. +id+ and +gen+ are coerced with #to_i before being passed on.
# Returns whatever #object returns.
def self.object_string(str, id, gen = 0)
  StringIO.open(str) do |io|
    new.object(io, id.to_i, gen.to_i)
  end
end

.open(input, opts = {}) {|PDF::Reader.new(input, opts)| ... } ⇒ Object

syntactic sugar for opening a PDF file. Accepts the same arguments as new().

PDF::Reader.open("somefile.pdf") do |reader|
  puts reader.pdf_version
end

or

PDF::Reader.open("somefile.pdf", :password => "apples") do |reader|
  puts reader.pdf_version
end

Yields:



159
160
161
# File 'lib/pdf/reader.rb', line 159

# Syntactic sugar for opening a PDF: builds a PDF::Reader from +input+ and
# +opts+ (the same arguments accepted by new) and yields it to the block.
# Returns the block's result.
def self.open(input, opts = {}, &block)
  reader = PDF::Reader.new(input, opts)
  yield reader
end

.string(str, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Parse the given string, sending events to the given receiver.



180
181
182
183
184
# File 'lib/pdf/reader.rb', line 180

# DEPRECATED since 1.0.0.
#
# Wraps +str+ in a StringIO and hands it, together with +receivers+ and
# +opts+, to #parse on a freshly built reader. Returns whatever #parse
# returns.
def self.string(str, receivers, opts = {})
  StringIO.open(str) { |io| new.parse(io, receivers, opts) }
end

Instance Method Details

#info ⇒ Object



121
122
123
124
# File 'lib/pdf/reader.rb', line 121

# Looks up the :Info entry of the trailer, dereferences it through the
# object store and returns the result after passing it through
# doc_strings_to_utf8.
def info
  info_ref = @objects.trailer[:Info]
  doc_strings_to_utf8(@objects.deref(info_ref))
end

#metadata ⇒ Object



126
127
128
129
130
131
132
133
134
135
# File 'lib/pdf/reader.rb', line 126

# Returns the unfiltered data of the stream referenced by the :Metadata entry
# of the document root, or nil when that entry dereferences to nil.
# When the data responds to force_encoding it is tagged as UTF-8 in place
# (no transcoding is performed).
def metadata
  stream = @objects.deref(root[:Metadata])
  return nil if stream.nil?

  xml = stream.unfiltered_data
  # respond_to? guard keeps this working on objects without encoding support
  xml.force_encoding("utf-8") if xml.respond_to?(:force_encoding)
  xml
end

#object(io, id, gen) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Given an IO object that contains PDF data, return the contents of a single object



274
275
276
277
278
# File 'lib/pdf/reader.rb', line 274

# DEPRECATED since 1.0.0.
#
# Replaces @objects with a new ObjectHash built from +io+, then returns the
# result of dereferencing the object identified by +id+ and +gen+.
def object(io, id, gen)
  @objects = ObjectHash.new(io)
  reference = Reference.new(id, gen)
  @objects.deref(reference)
end

#page(num) ⇒ Object

returns a single PDF::Reader::Page for the specified page. Use this instead of the pages method when you need to access just a single page

reader = PDF::Reader.new("somefile.pdf")
page   = reader.page(10)

puts page.text

See the docs for PDF::Reader::Page to read more about the methods available on each page



242
243
244
245
246
247
248
# File 'lib/pdf/reader.rb', line 242

# Returns a PDF::Reader::Page for the 1-based page number +num+ (coerced
# with #to_i).
#
# Raises ArgumentError when the number is below 1 or above page_count.
def page(num)
  page_number = num.to_i
  raise ArgumentError, "valid pages are 1 .. #{self.page_count}" if page_number < 1 || page_number > self.page_count

  PDF::Reader::Page.new(@objects, page_number, :cache => @cache)
end

#page_count ⇒ Object



137
138
139
140
# File 'lib/pdf/reader.rb', line 137

# Returns the dereferenced :Count entry of the object referenced by the
# document root's :Pages entry. The result is memoized in @page_count.
def page_count
  # Keep the :Pages lookup inside the memoized block so repeat calls do not
  # dereference the page tree root again just to discard the result.
  @page_count ||= begin
    pages = @objects.deref(root[:Pages])
    @objects.deref(pages[:Count])
  end
end

#pages ⇒ Object

returns an array of PDF::Reader::Page objects, one for each page in the source PDF.

reader = PDF::Reader.new("somefile.pdf")

reader.pages.each do |page|
  puts page.fonts
  puts page.images
  puts page.text
end

See the docs for PDF::Reader::Page to read more about the methods available on each page



224
225
226
227
228
# File 'lib/pdf/reader.rb', line 224

# Builds one PDF::Reader::Page per page number from 1 up to page_count and
# returns them as an array, in page order. Every page receives the reader's
# object store and cache.
def pages
  total = self.page_count
  (1..total).map do |page_number|
    PDF::Reader::Page.new(@objects, page_number, :cache => @cache)
  end
end

#parse(io, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Given an IO object that contains PDF data, parse it.



256
257
258
259
260
261
262
263
264
265
266
267
# File 'lib/pdf/reader.rb', line 256

# DEPRECATED since 1.0.0.
#
# Builds an ObjectHash from +io+, then instantiates every class returned by
# strategies with that hash, +receivers+ and the effective options, calling
# #process on each in turn. Entries in +opts+ override the defaults
# (:pages => true, :raw_text => false, :metadata => true).
#
# Returns self.
def parse(io, receivers, opts = {})
  ohash   = ObjectHash.new(io)
  options = { :pages => true, :raw_text => false, :metadata => true }.merge(opts)

  strategies.each { |strategy| strategy.new(ohash, receivers, options).process }

  self
end

#pdf_version ⇒ Object



142
143
144
# File 'lib/pdf/reader.rb', line 142

# Returns whatever the underlying object store reports from its own
# #pdf_version; this method is a plain delegation to @objects.
# NOTE(review): raises NoMethodError when @objects is nil (e.g. a reader
# built without an input) — confirm that is acceptable.
def pdf_version
  @objects.pdf_version
end