Class: PDF::Reader
- Inherits:
-
Object
- Object
- PDF::Reader
- Defined in:
- lib/pdf/reader.rb,
lib/pdf/reader/lzw.rb,
lib/pdf/reader/cmap.rb,
lib/pdf/reader/font.rb,
lib/pdf/reader/page.rb,
lib/pdf/reader/xref.rb,
lib/pdf/reader/error.rb,
lib/pdf/reader/token.rb,
lib/pdf/reader/buffer.rb,
lib/pdf/reader/filter.rb,
lib/pdf/reader/parser.rb,
lib/pdf/reader/stream.rb,
lib/pdf/reader/encoding.rb,
lib/pdf/reader/text_run.rb,
lib/pdf/reader/reference.rb,
lib/pdf/reader/filter/lzw.rb,
lib/pdf/reader/glyph_hash.rb,
lib/pdf/reader/filter/null.rb,
lib/pdf/reader/object_hash.rb,
lib/pdf/reader/page_layout.rb,
lib/pdf/reader/filter/flate.rb,
lib/pdf/reader/form_xobject.rb,
lib/pdf/reader/object_cache.rb,
lib/pdf/reader/object_stream.rb,
lib/pdf/reader/page_receiver.rb,
lib/pdf/reader/text_receiver.rb,
lib/pdf/reader/filter/ascii85.rb,
lib/pdf/reader/pages_strategy.rb,
lib/pdf/reader/print_receiver.rb,
lib/pdf/reader/font_descriptor.rb,
lib/pdf/reader/filter/ascii_hex.rb,
lib/pdf/reader/filter/depredict.rb,
lib/pdf/reader/resource_methods.rb,
lib/pdf/reader/abstract_strategy.rb,
lib/pdf/reader/filter/run_length.rb,
lib/pdf/reader/metadata_strategy.rb,
lib/pdf/reader/register_receiver.rb,
lib/pdf/reader/page_text_receiver.rb,
lib/pdf/reader/synchronized_cache.rb,
lib/pdf/reader/transformation_matrix.rb,
lib/pdf/reader/standard_security_handler.rb,
lib/pdf/reader/width_calculator/built_in.rb,
lib/pdf/reader/width_calculator/composite.rb,
lib/pdf/reader/width_calculator/true_type.rb,
lib/pdf/reader/width_calculator/type_zero.rb,
lib/pdf/reader/width_calculator/type_one_or_three.rb
Overview
Copyright © 2010 James Healy ([email protected])
Defined Under Namespace
Modules: Filter, ResourceMethods, WidthCalculator Classes: AbstractStrategy, Buffer, CMap, Encoding, EncryptedPDFError, Error, Font, FontDescriptor, FormXObject, GlyphHash, InvalidObjectError, LZW, MalformedPDFError, MetadataStrategy, ObjectCache, ObjectHash, ObjectStream, Page, PageLayout, PageReceiver, PageTextReceiver, PagesStrategy, Parser, PrintReceiver, Reference, RegisterReceiver, StandardSecurityHandler, Stream, SynchronizedCache, TextReceiver, TextRun, Token, TransformationMatrix, UnsupportedFeatureError, XRef
Constant Summary collapse
- STANDARD_ENCODING_TO_NAME =
Definition of the StandardEncoding taken from the Adobe website
{ 32 => :space, 33 => :exclam, 34 => :quotedbl, 35 => :numbersign, 36 => :dollar, 37 => :percent, 38 => :ampersand, 39 => :quoteright, 40 => :parenleft, 41 => :parenright, 42 => :asterisk, 43 => :plus, 44 => :comma, 45 => :hyphen, 46 => :period, 47 => :slash, 48 => :zero, 49 => :one, 50 => :two, 51 => :three, 52 => :four, 53 => :five, 54 => :six, 55 => :seven, 56 => :eight, 57 => :nine, 58 => :colon, 59 => :semicolon, 60 => :less, 61 => :equal, 62 => :greater, 63 => :question, 64 => :at, 65 => :A, 66 => :B, 67 => :C, 68 => :D, 69 => :E, 70 => :F, 71 => :G, 72 => :H, 73 => :I, 74 => :J, 75 => :K, 76 => :L, 77 => :M, 78 => :N, 79 => :O, 80 => :P, 81 => :Q, 82 => :R, 83 => :S, 84 => :T, 85 => :U, 86 => :V, 87 => :W, 88 => :X, 89 => :Y, 90 => :Z, 91 => :bracketleft, 92 => :backslash, 93 => :bracketright, 94 => :asciicircum, 95 => :underscore, 96 => :quoteleft, 97 => :a, 98 => :b, 99 => :c, 100 => :d, 101 => :e, 102 => :f, 103 => :g, 104 => :h, 105 => :i, 106 => :j, 107 => :k, 108 => :l, 109 => :m, 110 => :n, 111 => :o, 112 => :p, 113 => :q, 114 => :r, 115 => :s, 116 => :t, 117 => :u, 118 => :v, 119 => :w, 120 => :x, 121 => :y, 122 => :z, 123 => :braceleft, 124 => :bar, 125 => :braceright, 126 => :asciitilde, 161 => :exclamdown, 162 => :cent, 163 => :sterling, 164 => :fraction, 165 => :yen, 166 => :florin, 167 => :section, 168 => :currency, 169 => :quotesingle, 170 => :quotedblleft, 171 => :guillemotleft, 172 => :guilsinglleft, 173 => :guilsinglright, 174 => :fi, 175 => :fl, 177 => :endash, 178 => :dagger, 179 => :daggerdbl, 180 => :periodcentered, 182 => :paragraph, 183 => :bullet, 184 => :quotesinglbase, 185 => :quotedblbase, 186 => :quotedblright, 187 => :guillemotright, 188 => :ellipsis, 189 => :perthousand, 191 => :questiondown, 193 => :grave, 194 => :acute, 195 => :circumflex, 196 => :tilde, 197 => :macron, 198 => :breve, 199 => :dotaccent, 200 => :dieresis, 202 => :ring, 203 => :cedilla, 205 => :hungarumlaut, 206 => :ogonek, 207 => :caron, 208 => :emdash, 225 => 
:AE, 227 => :ordfeminine, 232 => :Lslash, 233 => :Oslash, 234 => :OE, 235 => :ordmasculine, 241 => :ae, 245 => :dotlessi, 248 => :lslash, 249 => :oslash, 250 => :oe, 251 => :germandbls, }
- MAC_ROMAN_ENCODING_TO_NAME =
{ 32 => :space, 33 => :exclam, 34 => :quotedbl, 35 => :numbersign, 36 => :dollar, 37 => :percent, 38 => :ampersand, 39 => :quotesingle, 40 => :parenleft, 41 => :parenright, 42 => :asterisk, 43 => :plus, 44 => :comma, 45 => :hyphen, 46 => :period, 47 => :slash, 48 => :zero, 49 => :one, 50 => :two, 51 => :three, 52 => :four, 53 => :five, 54 => :six, 55 => :seven, 56 => :eight, 57 => :nine, 58 => :colon, 59 => :semicolon, 60 => :less, 61 => :equal, 62 => :greater, 63 => :question, 64 => :at, 65 => :A, 66 => :B, 67 => :C, 68 => :D, 69 => :E, 70 => :F, 71 => :G, 72 => :H, 73 => :I, 74 => :J, 75 => :K, 76 => :L, 77 => :M, 78 => :N, 79 => :O, 80 => :P, 81 => :Q, 82 => :R, 83 => :S, 84 => :T, 85 => :U, 86 => :V, 87 => :W, 88 => :X, 89 => :Y, 90 => :Z, 91 => :bracketleft, 92 => :backslash, 93 => :bracketright, 94 => :asciicircum, 95 => :underscore, 96 => :grave, 97 => :a, 98 => :b, 99 => :c, 100 => :d, 101 => :e, 102 => :f, 103 => :g, 104 => :h, 105 => :i, 106 => :j, 107 => :k, 108 => :l, 109 => :m, 110 => :n, 111 => :o, 112 => :p, 113 => :q, 114 => :r, 115 => :s, 116 => :t, 117 => :u, 118 => :v, 119 => :w, 120 => :x, 121 => :y, 122 => :z, 123 => :braceleft, 124 => :bar, 125 => :braceright, 126 => :asciitilde, 128 => :Adieresis, 129 => :Aring, 130 => :Ccedilla, 131 => :Eacute, 132 => :Ntilde, 133 => :Odieresis, 134 => :Udieresis, 135 => :aacute, 136 => :agrave, 137 => :acircumflex, 138 => :adieresis, 139 => :atilde, 140 => :aring, 141 => :ccedilla, 142 => :eacute, 143 => :egrave, 144 => :ecircumflex, 145 => :edieresis, 146 => :iacute, 147 => :igrave, 148 => :icircumflex, 149 => :idieresis, 150 => :ntilde, 151 => :oacute, 152 => :ograve, 153 => :ocircumflex, 154 => :odieresis, 155 => :otilde, 156 => :uacute, 157 => :ugrave, 158 => :ucircumflex, 159 => :udieresis, 160 => :dagger, 161 => :degree, 162 => :cent, 163 => :sterling, 164 => :section, 165 => :bullet, 166 => :paragraph, 167 => :germandbls, 168 => :registered, 169 => :copyright, 170 => :trademark, 171 => :acute, 172 => 
:dieresis, 173 => :notequal, 174 => :AE, 175 => :Oslash, 176 => :infinity, 177 => :plusminus, 178 => :lessequal, 179 => :greaterequal, 180 => :yen, 181 => :mu, 182 => :partialdiff, 183 => :summation, 184 => :product, 185 => :pi, 186 => :integral, 187 => :ordfeminine, 188 => :ordmasculine, 189 => :Omega, 190 => :ae, 191 => :oslash, 192 => :questiondown, 193 => :exclamdown, 194 => :logicalnot, 195 => :radical, 196 => :florin, 197 => :approxequal, 198 => :Delta, 199 => :guillemotleft, 200 => :guillemotright, 201 => :ellipsis, 202 => :nobreakspace, 203 => :Agrave, 204 => :Atilde, 205 => :Otilde, 206 => :OE, 207 => :oe, 208 => :endash, 209 => :emdash, 210 => :quotedblleft, 211 => :quotedblright, 212 => :quoteleft, 213 => :quoteright, 214 => :divide, 215 => :lozenge, 216 => :ydieresis, 217 => :Ydieresis, 218 => :fraction, 219 => :currency, 220 => :guilsinglleft, 221 => :guilsinglright, 222 => :fi, 223 => :fl, 224 => :daggerdbl, 225 => :periodcentered, 226 => :quotesinglbase, 227 => :quotedblbase, 228 => :perthousane, 229 => :Acircumflex, 230 => :Ecircumflex, 231 => :Aacute, 232 => :Edieresis, 233 => :Egrave, 234 => :Iacute, 235 => :Icircumflex, 236 => :Idieresis, 237 => :Igrave, 238 => :Oacute, 239 => :Ocircumflex, 240 => :apple, 241 => :Ograve, 242 => :Uacute, 243 => :Ucircumflex, 244 => :Ugrave, 245 => :dotlessi, 246 => :circumflex, 247 => :tilde, 248 => :macron, 249 => :breve, 250 => :dotaccent, 251 => :ring, 252 => :cedilla, 253 => :hungarumlaut, 254 => :ogonek, 255 => :caron, }
- WIN_ANSI_ENCODING_TO_NAME =
NOTE: the windows encoding has some additional key/value pairs which are not included in the following hash:
nbspace 160/a0 <- same as no-break space; sfthyphen 173/ad <- same as hyphen; middot 183/b7 <- same as periodcentered
{ :space => 32, :exclam => 33, :quotedbl => 34, :numbersign => 35, :dollar => 36, :percent => 37, :ampersand => 38, :quotesingle => 39, :parenleft => 40, :parenright => 41, :asterisk => 42, :plus => 43, :comma => 44, :hyphen => 45, :period => 46, :slash => 47, :zero => 48, :one => 49, :two => 50, :three => 51, :four => 52, :five => 53, :six => 54, :seven => 55, :eight => 56, :nine => 57, :colon => 58, :semicolon => 59, :less => 60, :equal => 61, :greater => 62, :question => 63, :at => 64, :A => 65, :B => 66, :C => 67, :D => 68, :E => 69, :F => 70, :G => 71, :H => 72, :I => 73, :J => 74, :K => 75, :L => 76, :M => 77, :N => 78, :O => 79, :P => 80, :Q => 81, :R => 82, :S => 83, :T => 84, :U => 85, :V => 86, :W => 87, :X => 88, :Y => 89, :Z => 90, :bracketleft => 91, :backslash => 92, :bracketright => 93, :asciicircum => 94, :underscore => 95, :grave => 96, :a => 97, :b => 98, :c => 99, :d => 100, :e => 101, :f => 102, :g => 103, :h => 104, :i => 105, :j => 106, :k => 107, :l => 108, :m => 109, :n => 110, :o => 111, :p => 112, :q => 113, :r => 114, :s => 115, :t => 116, :u => 117, :v => 118, :w => 119, :x => 120, :y => 121, :z => 122, :braceleft => 123, :bar => 124, :braceright => 125, :asciitilde => 126, :Adieresis => 196, :Aring => 197, :Ccedilla => 199, :Eacute => 201, :Ntilde => 209, :Odieresis => 214, :Udieresis => 220, :aacute => 225, :agrave => 224, :acircumflex => 226, :adieresis => 228, :atilde => 227, :aring => 229, :ccedilla => 231, :eacute => 233, :egrave => 232, :ecircumflex => 234, :edieresis => 235, :iacute => 237, :igrave => 236, :icircumflex => 238, :idieresis => 239, :ntilde => 241, :oacute => 243, :ograve => 242, :ocircumflex => 244, :odieresis => 246, :otilde => 245, :uacute => 250, :ugrave => 249, :ucircumflex => 251, :udieresis => 252, :dagger => 134, :degree => 176, :cent => 162, :sterling => 163, :section => 167, :bullet => 149, :paragraph => 182, :germandbls => 223, :registered => 174, :copyright => 169, :trademark => 153, :acute => 180, 
:dieresis => 168, :AE => 198, :Oslash => 216, :plusminus => 177, :yen => 165, :mu => 181, :ordfeminine => 170, :ordmasculine => 186, :ae => 230, :oslash => 248, :questiondown => 191, :exclamdown => 161, :logicalnot => 172, :florin => 131, :guillemotleft => 171, :guillemotright => 187, :ellipsis => 133, :nobreakspace => 160, :Agrave => 192, :Atilde => 195, :Otilde => 213, :OE => 140, :oe => 156, :endash => 150, :emdash => 151, :quotedblleft => 147, :quotedblright => 148, :quoteleft => 145, :quoteright => 146, :divide => 247, :ydieresis => 255, :Ydieresis => 159, :currency => 164, :guilsinglleft => 139, :guilsinglright => 155, :daggerdbl => 135, :periodcentered => 183, :quotesinglbase => 130, :quotedblbase => 132, :perthousane => 137, :Acircumflex => 194, :Ecircumflex => 202, :Aacute => 193, :Edieresis => 203, :Egrave => 200, :Iacute => 205, :Icircumflex => 206, :Idieresis => 207, :Igrave => 204, :Oacute => 211, :Ocircumflex => 212, :Ograve => 210, :Uacute => 218, :Ucircumflex => 219, :Ugrave => 217, :circumflex => 136, :tilde => 152, :macron => 175, :cedilla => 184, :Scaron => 138, :scaron => 154, :brokenbar => 166, :Eth => 208, :eth => 240, :Yacute => 221, :yacute => 253, :Thorn => 222, :thorn => 254, :multiply => 215, :onesuperior => 185, :twosuperior => 178, :threesuperior => 179, :onehalf => 189, :onequarter => 188, :threequarters => 190, }
Instance Attribute Summary collapse
-
#objects ⇒ Object
readonly
low-level hash-like access to all objects in the underlying PDF.
Class Method Summary collapse
-
.file(name, receivers, opts = {}) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.
-
.object_file(name, id, gen = 0) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.
-
.object_string(str, id, gen = 0) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.
-
.open(input, opts = {}) {|PDF::Reader.new(input, opts)| ... } ⇒ Object
syntactic sugar for opening a PDF file.
-
.string(str, receivers, opts = {}) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.
Instance Method Summary collapse
- #info ⇒ Object
-
#initialize(input = nil, opts = {}) ⇒ Reader
constructor
creates a new document reader for the provided PDF.
- #metadata ⇒ Object
-
#object(io, id, gen) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.
-
#page(num) ⇒ Object
returns a single PDF::Reader::Page for the specified page.
- #page_count ⇒ Object
-
#pages ⇒ Object
returns an array of PDF::Reader::Page objects, one for each page in the source PDF.
-
#parse(io, receivers, opts = {}) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.
- #pdf_version ⇒ Object
Constructor Details
#initialize(input = nil, opts = {}) ⇒ Reader
creates a new document reader for the provided PDF.
input can be either a filename or an IO-ish object (StringIO, File, etc) containing a PDF
reader = PDF::Reader.new("somefile.pdf")
File.open("somefile.pdf","rb") do |file|
reader = PDF::Reader.new(file)
end
If the source file is encrypted you can provide a password for decrypting
reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113 114 115 116 117 118 119 |
# File 'lib/pdf/reader.rb', line 113 def initialize(input = nil, opts = {}) if input # support the deprecated Reader API @cache = PDF::Reader::ObjectCache.new opts.merge!(:cache => @cache) @objects = PDF::Reader::ObjectHash.new(input, opts) end end |
Instance Attribute Details
#objects ⇒ Object (readonly)
low-level hash-like access to all objects in the underlying PDF
96 97 98 |
# File 'lib/pdf/reader.rb', line 96 def objects @objects end |
Class Method Details
.file(name, receivers, opts = {}) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will
eventually be removed
Parse the file with the given name, sending events to the given receiver.
169 170 171 172 173 |
# File 'lib/pdf/reader.rb', line 169 def self.file(name, receivers, opts = {}) File.open(name,"rb") do |f| new.parse(f, receivers, opts) end end |
.object_file(name, id, gen = 0) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will
eventually be removed
Parse the file with the given name, returning an unmarshalled ruby version of the requested pdf object
192 193 194 195 196 |
# File 'lib/pdf/reader.rb', line 192 def self.object_file(name, id, gen = 0) File.open(name,"rb") { |f| new.object(f, id.to_i, gen.to_i) } end |
.object_string(str, id, gen = 0) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will
eventually be removed
Parse the given string, returning an unmarshalled ruby version of the requested pdf object
204 205 206 207 208 |
# File 'lib/pdf/reader.rb', line 204 def self.object_string(str, id, gen = 0) StringIO.open(str) { |s| new.object(s, id.to_i, gen.to_i) } end |
.open(input, opts = {}) {|PDF::Reader.new(input, opts)| ... } ⇒ Object
159 160 161 |
# File 'lib/pdf/reader.rb', line 159 def self.open(input, opts = {}, &block) yield PDF::Reader.new(input, opts) end |
.string(str, receivers, opts = {}) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will
eventually be removed
Parse the given string, sending events to the given receiver.
180 181 182 183 184 |
# File 'lib/pdf/reader.rb', line 180 def self.string(str, receivers, opts = {}) StringIO.open(str) do |s| new.parse(s, receivers, opts) end end |
Instance Method Details
#info ⇒ Object
121 122 123 124 |
# File 'lib/pdf/reader.rb', line 121 def info dict = @objects.deref(@objects.trailer[:Info]) doc_strings_to_utf8(dict) end |
#metadata ⇒ Object
126 127 128 129 130 131 132 133 134 135 |
# File 'lib/pdf/reader.rb', line 126 def metadata stream = @objects.deref(root[:Metadata]) if stream.nil? nil else xml = stream.unfiltered_data xml.force_encoding("utf-8") if xml.respond_to?(:force_encoding) xml end end |
#object(io, id, gen) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will
eventually be removed
Given an IO object that contains PDF data, return the contents of a single object
274 275 276 277 278 |
# File 'lib/pdf/reader.rb', line 274 def object (io, id, gen) @objects = ObjectHash.new(io) @objects.deref(Reference.new(id, gen)) end |
#page(num) ⇒ Object
returns a single PDF::Reader::Page for the specified page. Use this instead of the pages method when you need to access just a single page
reader = PDF::Reader.new("somefile.pdf")
page = reader.page(10)
puts page.text
See the docs for PDF::Reader::Page to read more about the methods available on each page
242 243 244 245 246 247 248 |
# File 'lib/pdf/reader.rb', line 242 def page(num) num = num.to_i if num < 1 || num > self.page_count raise ArgumentError, "valid pages are 1 .. #{self.page_count}" end PDF::Reader::Page.new(@objects, num, :cache => @cache) end |
#page_count ⇒ Object
137 138 139 140 |
# File 'lib/pdf/reader.rb', line 137 def page_count pages = @objects.deref(root[:Pages]) @page_count ||= @objects.deref(pages[:Count]) end |
#pages ⇒ Object
224 225 226 227 228 |
# File 'lib/pdf/reader.rb', line 224 def pages (1..self.page_count).map { |num| PDF::Reader::Page.new(@objects, num, :cache => @cache) } end |
#parse(io, receivers, opts = {}) ⇒ Object
DEPRECATED: this method was deprecated in version 1.0.0 and will
eventually be removed
Given an IO object that contains PDF data, parse it.
256 257 258 259 260 261 262 263 264 265 266 267 |
# File 'lib/pdf/reader.rb', line 256 def parse(io, receivers, opts = {}) ohash = ObjectHash.new(io) options = {:pages => true, :raw_text => false, :metadata => true} options.merge!(opts) strategies.each do |s| s.new(ohash, receivers, options).process end self end |
#pdf_version ⇒ Object
142 143 144 |
# File 'lib/pdf/reader.rb', line 142 def pdf_version @objects.pdf_version end |