Class: PDF::Reader

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader.rb,
lib/pdf/reader/lzw.rb,
lib/pdf/reader/cmap.rb,
lib/pdf/reader/font.rb,
lib/pdf/reader/page.rb,
lib/pdf/reader/xref.rb,
lib/pdf/reader/error.rb,
lib/pdf/reader/point.rb,
lib/pdf/reader/token.rb,
lib/pdf/reader/buffer.rb,
lib/pdf/reader/filter.rb,
lib/pdf/reader/parser.rb,
lib/pdf/reader/stream.rb,
lib/pdf/reader/encoding.rb,
lib/pdf/reader/text_run.rb,
lib/pdf/reader/rectangle.rb,
lib/pdf/reader/reference.rb,
lib/pdf/reader/resources.rb,
lib/pdf/reader/cid_widths.rb,
lib/pdf/reader/filter/lzw.rb,
lib/pdf/reader/glyph_hash.rb,
lib/pdf/reader/page_state.rb,
lib/pdf/reader/type_check.rb,
lib/pdf/reader/filter/null.rb,
lib/pdf/reader/object_hash.rb,
lib/pdf/reader/page_layout.rb,
lib/pdf/reader/filter/flate.rb,
lib/pdf/reader/form_xobject.rb,
lib/pdf/reader/object_cache.rb,
lib/pdf/reader/object_stream.rb,
lib/pdf/reader/filter/ascii85.rb,
lib/pdf/reader/key_builder_v5.rb,
lib/pdf/reader/no_text_filter.rb,
lib/pdf/reader/pages_strategy.rb,
lib/pdf/reader/print_receiver.rb,
lib/pdf/reader/font_descriptor.rb,
lib/pdf/reader/filter/ascii_hex.rb,
lib/pdf/reader/filter/depredict.rb,
lib/pdf/reader/filter/run_length.rb,
lib/pdf/reader/register_receiver.rb,
lib/pdf/reader/page_text_receiver.rb,
lib/pdf/reader/synchronized_cache.rb,
lib/pdf/reader/validating_receiver.rb,
lib/pdf/reader/rc4_security_handler.rb,
lib/pdf/reader/standard_key_builder.rb,
lib/pdf/reader/null_security_handler.rb,
lib/pdf/reader/transformation_matrix.rb,
lib/pdf/reader/zero_width_runs_filter.rb,
lib/pdf/reader/aes_v2_security_handler.rb,
lib/pdf/reader/aes_v3_security_handler.rb,
lib/pdf/reader/overlapping_runs_filter.rb,
lib/pdf/reader/advanced_text_run_filter.rb,
lib/pdf/reader/security_handler_factory.rb,
lib/pdf/reader/width_calculator/built_in.rb,
lib/pdf/reader/width_calculator/composite.rb,
lib/pdf/reader/width_calculator/true_type.rb,
lib/pdf/reader/width_calculator/type_zero.rb,
lib/pdf/reader/bounding_rectangle_runs_filter.rb,
lib/pdf/reader/unimplemented_security_handler.rb,
lib/pdf/reader/width_calculator/type_one_or_three.rb

Overview

typed: strict frozen_string_literal: true

Defined Under Namespace

Modules: Filter, WidthCalculator Classes: AdvancedTextRunFilter, AesV2SecurityHandler, AesV3SecurityHandler, BoundingRectangleRunsFilter, Buffer, CMap, CidWidths, Encoding, EncryptedPDFError, Error, EventPoint, Font, FontDescriptor, FormXObject, GlyphHash, InvalidObjectError, InvalidPageError, KeyBuilderV5, LZW, MalformedPDFError, NoTextFilter, NullSecurityHandler, ObjectCache, ObjectHash, ObjectStream, OverlappingRunsFilter, Page, PageLayout, PageState, PageTextReceiver, PagesStrategy, Parser, Point, PrintReceiver, Rc4SecurityHandler, Rectangle, Reference, RegisterReceiver, Resources, SecurityHandlerFactory, StandardKeyBuilder, Stream, SynchronizedCache, TextRun, Token, TransformationMatrix, TypeCheck, UnimplementedSecurityHandler, UnsupportedFeatureError, ValidatingReceiver, XRef, ZeroWidthRunsFilter

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input, opts = {}) ⇒ Reader

creates a new document reader for the provided PDF.

input can be an IO-ish object (StringIO, File, etc) containing a PDF or a filename

reader = PDF::Reader.new("somefile.pdf")

File.open("somefile.pdf","rb") do |file|
  reader = PDF::Reader.new(file)
end

If the source file is encrypted you can provide a password for decrypting

reader = PDF::Reader.new("somefile.pdf", :password => "apples")

Using this method directly is supported, but it’s more common to use `PDF::Reader.open`.

: (String | Tempfile | IO | StringIO, ?Hash[untyped, untyped]) -> void



120
121
122
123
124
125
126
# File 'lib/pdf/reader.rb', line 120

# Creates a new document reader for the provided PDF.
#
# input - an IO-ish object (StringIO, File, etc) containing a PDF, or a
#         filename.
# opts  - optional Hash of options, e.g. :password for an encrypted file.
#         A copy with an added :cache entry is handed to
#         PDF::Reader::ObjectHash; the caller's hash is left untouched.
#
#   reader = PDF::Reader.new("somefile.pdf", :password => "apples")
def initialize(input, opts = {})
  @cache   = PDF::Reader::ObjectCache.new #: PDF::Reader::ObjectCache
  # Non-destructive merge: merge! would write :cache into the caller's hash
  # and raise FrozenError when that hash is frozen.
  opts = opts.merge(:cache => @cache)
  @objects = PDF::Reader::ObjectHash.new(input, opts) #: PDF::Reader::ObjectHash
  @page_count = nil #: Integer | nil
  @root = nil #: Hash[Symbol, untyped] | nil
end

Instance Attribute Details

#objects ⇒ Object (readonly)

Low-level, hash-like access to all objects in the underlying PDF.

: PDF::Reader::ObjectHash



99
100
101
# File 'lib/pdf/reader.rb', line 99

# Low-level, hash-like access to all objects in the underlying PDF.
# Returns the PDF::Reader::ObjectHash stored in @objects by the constructor.
def objects
  @objects
end

Class Method Details

.open(input, opts = {}) {|PDF::Reader.new(input, opts)| ... } ⇒ Object

syntactic sugar for opening a PDF file and the most common approach. Accepts the same arguments as new().

PDF::Reader.open("somefile.pdf") do |reader|
  puts reader.pdf_version
end

or

PDF::Reader.open("somefile.pdf", :password => "apples") do |reader|
  puts reader.pdf_version
end

: (String | Tempfile | IO, ?Hash[untyped, untyped]) { (PDF::Reader) -> void } -> untyped

Yields:



183
184
185
# File 'lib/pdf/reader.rb', line 183

# Convenience entry point and the most common way to read a PDF: builds a
# reader from the same arguments that new() accepts and yields it to the
# given block. The block's value becomes the return value.
def self.open(input, opts = {}, &block)
  reader = PDF::Reader.new(input, opts)
  yield reader
end

Instance Method Details

#info ⇒ Object

Return a Hash with some basic information about the PDF file

: () -> Hash[untyped, untyped]?



131
132
133
134
# File 'lib/pdf/reader.rb', line 131

# Returns a Hash with some basic information about the PDF file, taken from
# the trailer's :Info entry. An empty Hash is used when that entry does not
# dereference to anything; the result is passed through doc_strings_to_utf8.
def info
  info_ref = @objects.trailer[:Info]
  raw_info = @objects.deref_hash(info_ref)
  doc_strings_to_utf8(raw_info || {})
end

#metadata ⇒ Object

Return a String with extra XML metadata provided by the author of the PDF file. Not always present.

: () -> String?



140
141
142
143
144
145
146
147
148
149
# File 'lib/pdf/reader.rb', line 140

# Returns a String with the extra XML metadata provided by the author of the
# PDF file. Not always present: returns nil when the root's :Metadata entry
# does not dereference to a stream.
def metadata
  stream = @objects.deref_stream(root[:Metadata])
  return nil if stream.nil?

  # force_encoding only re-tags the existing bytes as UTF-8 (no transcoding)
  # and returns the same string object.
  stream.unfiltered_data.force_encoding("utf-8")
end

#page(num) ⇒ Object

returns a single PDF::Reader::Page for the specified page. Use this instead of pages method when you need to access just a single page

reader = PDF::Reader.new("somefile.pdf")
page   = reader.page(10)

puts page.text

See the docs for PDF::Reader::Page to read more about the methods available on each page

: (Integer) -> PDF::Reader::Page



227
228
229
230
231
232
233
# File 'lib/pdf/reader.rb', line 227

# Returns a single PDF::Reader::Page for the specified page number. Use this
# instead of #pages when only one page is needed.
#
# num is coerced with to_i. Raises InvalidPageError when the resulting number
# falls outside 1..page_count.
def page(num)
  page_number = num.to_i
  in_range = page_number >= 1 && page_number <= page_count
  raise InvalidPageError, "Valid pages are 1 .. #{page_count}" unless in_range

  PDF::Reader::Page.new(@objects, page_number, :cache => @cache)
end

#page_count ⇒ Object

The number of pages in this PDF

: () -> Integer



154
155
156
157
158
159
160
# File 'lib/pdf/reader.rb', line 154

# The number of pages in this PDF, read from the :Count entry of the root's
# :Pages dictionary (0 when :Count does not dereference to an integer).
#
# The value is memoised in @page_count. Raises MalformedPDFError when the
# :Pages entry does not dereference to a Hash.
def page_count
  # Return the cached value before touching the object hash; previously the
  # Pages dictionary was dereferenced and validated again on every call.
  return @page_count if @page_count

  pages = @objects.deref_hash(root[:Pages])
  unless pages.kind_of?(::Hash)
    raise MalformedPDFError, "Pages structure is missing #{pages.class}"
  end
  @page_count = @objects.deref_integer(pages[:Count]) || 0
end

#pages ⇒ Object

returns an array of PDF::Reader::Page objects, one for each page in the source PDF.

reader = PDF::Reader.new("somefile.pdf")

reader.pages.each do |page|
  puts page.fonts
  puts page.rectangles
  puts page.text
end

See the docs for PDF::Reader::Page to read more about the methods available on each page

: () -> Array



202
203
204
205
206
207
208
209
210
211
212
# File 'lib/pdf/reader.rb', line 202

# Returns an Array of PDF::Reader::Page objects, one for each page in the
# source PDF, or an empty Array when the page count is zero or negative.
#
# An InvalidPageError raised while building a page is re-raised as
# MalformedPDFError naming the page number.
def pages
  return [] unless page_count > 0

  1.upto(page_count).map do |page_number|
    begin
      PDF::Reader::Page.new(@objects, page_number, :cache => @cache)
    rescue InvalidPageError
      raise MalformedPDFError, "Missing data for page: #{page_number}"
    end
  end
end

#pdf_version ⇒ Object

The PDF version this file uses

: () -> Float



165
166
167
# File 'lib/pdf/reader.rb', line 165

# The PDF version this file uses (documented as a Float).
# Delegates to the object hash stored in @objects.
def pdf_version
  @objects.pdf_version
end