Class: HexaPDF::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/hexapdf/parser.rb

Overview

Parses an IO stream according to PDF2.0 to get at the contained objects.

This class also contains higher-level methods for getting indirect objects and revisions.

See: PDF2.0 s7

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(io, document) ⇒ Parser

Creates a new parser for the given IO object.

PDF references are resolved using the associated Document object.



57
58
59
60
61
62
63
64
65
66
# File 'lib/hexapdf/parser.rb', line 57

# Creates a new parser for the given IO object.
#
# The +document+ is stored on the parser and also supplies the configuration: the callable
# found under 'parser.on_correctable_error' is curried with the document as its first
# argument and handed to the tokenizer as its correctable-error callback.
def initialize(io, document)
  @document = document
  @io = io
  @object_stream_data = {}
  @in_reconstruct_revision = false
  @reconstructed_revision = nil

  error_callback = document.config['parser.on_correctable_error']
  @tokenizer = Tokenizer.new(io, on_correctable_error: error_callback.curry[document])

  # Kept as the final step, exactly as before; presumably it relies on the state set up above.
  retrieve_pdf_header_offset_and_version
end

Instance Attribute Details

#io ⇒ Object (readonly)

The IO stream which is parsed.



52
53
54
# File 'lib/hexapdf/parser.rb', line 52

# Returns the IO stream that is parsed, i.e. the object stored in @io by the constructor.
def io
  @io
end

Instance Method Details

#file_header_version ⇒ Object

Returns the PDF version number that is stored in the file header.

See: PDF2.0 s7.5.2



415
416
417
418
419
420
# File 'lib/hexapdf/parser.rb', line 415

# Returns the PDF version number that is stored in the file header.
#
# When no header version is available, the problem is reported through +raise_malformed+ at
# position 0 with the message "PDF file header is missing or corrupt".
#
# See: PDF2.0 s7.5.2
def file_header_version
  return @header_version if @header_version

  raise_malformed("PDF file header is missing or corrupt", pos: 0)
  @header_version
end

#linearized? ⇒ Boolean

Returns true if the PDF file is a linearized file.

Note: The method uses heuristics to determine whether a PDF file is linearized. In case of slightly invalid or damaged PDFs that HexaPDF can recover from it is possible that this method returns true even though the PDF isn’t actually linearized.

Returns:

  • (Boolean)


78
79
80
81
82
83
84
85
86
87
88
# File 'lib/hexapdf/parser.rb', line 78

# Returns +true+ if the PDF file is a linearized file.
#
# The check is a heuristic: the tokenizer is positioned at the header offset, the three tokens
# of the first indirect object header (oid, gen, 'obj') are skipped and the following object is
# inspected. The file counts as linearized if that object is a Hash with a :Linearized key. A
# MalformedPDFError during this inspection yields +false+.
#
# The result is computed once and cached. A +defined?+ guard is used instead of <tt>||=</tt>
# so that a +false+ result is cached too; otherwise every call on a non-linearized file would
# move the tokenizer and re-parse the first object.
def linearized?
  return @linearized if defined?(@linearized)

  @linearized =
    begin
      @tokenizer.pos = @header_offset
      3.times { @tokenizer.next_token } # parse: oid gen obj
      obj = @tokenizer.next_object
      obj.kind_of?(Hash) && obj.key?(:Linearized)
    rescue MalformedPDFError
      false
    end
end

#load_compressed_object(xref_entry) ⇒ Object

Loads the compressed object identified by the cross-reference entry.



225
226
227
228
229
230
231
232
233
234
235
# File 'lib/hexapdf/parser.rb', line 225

# Loads the compressed object identified by the cross-reference entry.
#
# The object stream named by +xref_entry.objstm+ is fetched from the document and parsed only
# the first time it is needed; the parsed data is kept in @object_stream_data for later calls.
# If the fetched object does not respond to +parse_stream+, this is reported through
# +raise_malformed+.
#
# Returns an array consisting of whatever +object_by_index+ yields for +xref_entry.pos+,
# followed by the entry's generation number and +nil+.
def load_compressed_object(xref_entry)
  stream_oid = xref_entry.objstm

  parsed_stream = @object_stream_data.fetch(stream_oid) do
    container = @document.object(stream_oid)
    unless container.respond_to?(:parse_stream)
      raise_malformed("Object with oid=#{stream_oid} is not an object stream")
    end
    @object_stream_data[stream_oid] = container.parse_stream
  end

  indexed = parsed_stream.object_by_index(xref_entry.pos)
  [*indexed, xref_entry.gen, nil]
end

#load_object(xref_entry) ⇒ Object

Loads the indirect (potentially compressed) object specified by the given cross-reference entry.

For information about the xref_entry argument, have a look at HexaPDF::XRefSection and HexaPDF::XRefSection::Entry.



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/hexapdf/parser.rb', line 95

# Loads the indirect (potentially compressed) object specified by the given cross-reference
# entry and returns it as processed by the document.
#
# Dispatch by entry type:
# * :free entries yield a +nil+ value with the oid/gen of the entry.
# * :compressed entries are delegated to #load_compressed_object.
# * :in_use entries are parsed at their position; an offset of 0 for a non-zero oid is
#   reported through +maybe_raise+ and then treated like a +nil+ value.
# * Any other type is reported through +raise_malformed+.
#
# Unless the entry's oid is 0, the parsed oid/gen must match the entry. A Reference result is
# dereferenced through the document; everything else is wrapped with oid, gen and stream.
#
# If a HexaPDF::MalformedPDFError is raised anywhere in this method, the object is looked up
# in the reconstructed revision instead; if nothing is found there, a wrapped +nil+ with the
# entry's oid/gen is returned.
def load_object(xref_entry)
  obj, oid, gen, stream =
    case xref_entry.type
    when :free
      [nil, xref_entry.oid, xref_entry.gen, nil]
    when :compressed
      load_compressed_object(xref_entry)
    when :in_use
      if xref_entry.pos != 0 || xref_entry.oid == 0
        parse_indirect_object(xref_entry.pos)
      else
        # Handle seen-in-the-wild objects with invalid offset 0
        maybe_raise("Indirect object (#{xref_entry.oid},#{xref_entry.gen}) has offset 0", pos: 0)
        [nil, xref_entry.oid, xref_entry.gen, nil]
      end
    else
      raise_malformed("Invalid cross-reference type '#{xref_entry.type}' encountered")
    end

  unless xref_entry.oid == 0 || (oid == xref_entry.oid && gen == xref_entry.gen)
    raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
                    "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
  end

  obj.kind_of?(Reference) ? @document.deref(obj) : @document.wrap(obj, oid: oid, gen: gen, stream: stream)
rescue HexaPDF::MalformedPDFError
  recovered = reconstructed_revision.object(xref_entry)
  recovered || @document.wrap(nil, oid: xref_entry.oid, gen: xref_entry.gen)
end

#load_revision(pos) ⇒ Object

Loads a single revision whose cross-reference section/stream is located at the given position.

Returns an HexaPDF::XRefSection object and the accompanying trailer dictionary.



241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/hexapdf/parser.rb', line 241

# Loads a single revision whose cross-reference section/stream is located at the given
# position.
#
# If a cross-reference section starts at +pos+, it is parsed together with its trailer.
# Otherwise the object at +pos+ is loaded and must respond to +xref_section+ (i.e. be a
# cross-reference stream); a MalformedPDFError raised while reading its section gets +pos+
# assigned before being re-raised. A cross-reference stream lacking an entry for itself is
# reported through +maybe_raise+ and the entry is then added.
#
# In both cases the entry for oid 0 is deleted before returning.
#
# Returns an HexaPDF::XRefSection object and the accompanying trailer dictionary.
def load_revision(pos)
  section, trailer_dict =
    if xref_section?(pos)
      parse_xref_section_and_trailer(pos)
    else
      stream_obj = load_object(XRefSection.in_use_entry(0, 0, pos))
      unless stream_obj.respond_to?(:xref_section)
        raise_malformed("Object is not a cross-reference stream", pos: pos)
      end

      stream_section =
        begin
          stream_obj.xref_section
        rescue MalformedPDFError => error
          error.pos = pos
          raise
        end
      stream_trailer = stream_obj.trailer

      unless stream_section.entry?(stream_obj.oid, stream_obj.gen)
        maybe_raise("Cross-reference stream doesn't contain entry for itself", pos: pos)
        stream_section.add_in_use_entry(stream_obj.oid, stream_obj.gen, pos)
      end

      [stream_section, stream_trailer]
    end

  section.delete(0)
  [section, trailer_dict]
end

#parse_indirect_object(offset = nil) ⇒ Object

Parses the indirect object at the specified offset.

This method is used by a PDF Document to load objects. It should not be used by any other object because invalid object positions lead to errors.

Returns an array containing [object, oid, gen, stream].

See: PDF2.0 s7.3.10, s7.3.8



137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# File 'lib/hexapdf/parser.rb', line 137

# Parses the indirect object at the specified offset.
#
# +offset+ is relative to the PDF header (@header_offset is added before positioning the
# tokenizer). When +offset+ is +nil+, the tokenizer is not repositioned and parsing continues
# at its current position.
#
# Returns an array containing [object, oid, gen, stream]. +stream+ is a StreamData instance
# when the object is followed by the keyword 'stream', otherwise it stays +nil+.
#
# Problems are reported through two helpers defined elsewhere: +raise_malformed+ and
# +maybe_raise+. NOTE(review): judging by its use here, +maybe_raise+ may return normally,
# since every call is followed by recovery code - confirm against its definition.
#
# See: PDF2.0 s7.3.10, s7.3.8
def parse_indirect_object(offset = nil)
  @tokenizer.pos = offset + @header_offset if offset
  # An indirect object starts with "<oid> <gen> obj".
  oid = @tokenizer.next_token
  gen = @tokenizer.next_token
  tok = @tokenizer.next_token
  unless oid.kind_of?(Integer) && gen.kind_of?(Integer) &&
      tok.kind_of?(Tokenizer::Token) && tok == 'obj'
    raise_malformed("No valid object found", pos: offset)
  end

  # Peek (without consuming) so that an empty "obj endobj" body can be detected; +tok+ is also
  # needed in the rescue branch below.
  if (tok = @tokenizer.peek_token) && tok.kind_of?(Tokenizer::Token) && tok == 'endobj'
    maybe_raise("No indirect object value between 'obj' and 'endobj'", pos: @tokenizer.pos)
    object = nil
  else
    begin
      object = @tokenizer.next_object
    rescue MalformedPDFError
      if tok.kind_of?(Tokenizer::Token) && tok =~ /\A\d+endobj\z/
        # Handle often found invalid indirect object with missing whitespace after number
        # NOTE(review): the message below ends with a stray apostrophe.
        maybe_raise("Missing whitespace after number'", pos: @tokenizer.pos)
        object = tok.to_i
        # Step back by 6, the length of 'endobj', presumably so that the keyword is read
        # again as a separate token below.
        @tokenizer.pos -= 6
      else
        maybe_raise("Invalid value after '#{oid} #{gen} obj', treating as null", pos: @tokenizer.pos)
        return [nil, oid, gen, nil]
      end
    end
  end

  # Either 'stream' (a stream object follows) or the closing 'endobj' is expected here.
  tok = @tokenizer.next_token

  if tok.kind_of?(Tokenizer::Token) && tok == 'stream'
    unless object.kind_of?(Hash)
      raise_malformed("A stream needs a dictionary, not a(n) #{object.class}", pos: offset)
    end
    # Check the end-of-line marker after the keyword: 10 is LF, 13 is CR, 32 is a space.
    tok1 = @tokenizer.next_byte
    if tok1 == 32 # space
      maybe_raise("Keyword stream followed by space instead of LF or CR/LF", pos: @tokenizer.pos)
      tok1 = @tokenizer.next_byte
    end
    tok2 = @tokenizer.next_byte if tok1 == 13 # CR
    if tok1 != 10 && tok1 != 13
      raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
    elsif tok1 == 13 && tok2 != 10
      maybe_raise("Keyword stream must be followed by LF or CR/LF, not CR alone",
                  pos: @tokenizer.pos)
      # The byte read after the lone CR was not an LF; step back one byte so that it is not
      # skipped.
      @tokenizer.pos -= 1
    end

    # Note that getting :Length might move the IO pointer (when resolving references)
    # +pos+ therefore remembers where the stream data starts.
    pos = @tokenizer.pos
    # Length is taken directly if it is an Integer, resolved through the document if it is a
    # Reference and otherwise assumed to be 0.
    length = if object[:Length].kind_of?(Integer)
               object[:Length]
             elsif object[:Length].kind_of?(Reference)
               @document.deref(object[:Length])&.value || 0
             else
               0
             end
    # Jump to the expected end of the stream data; errors are swallowed by the rescue modifier.
    @tokenizer.pos = pos + length rescue pos

    tok = @tokenizer.next_token rescue nil
    unless tok.kind_of?(Tokenizer::Token) && tok == 'endstream'
      # The declared length did not lead to 'endstream': search for the keyword from the start
      # of the stream data and derive the length from where it is found.
      maybe_raise("Invalid stream length, keyword endstream not found", pos: @tokenizer.pos)
      @tokenizer.pos = pos
      if @tokenizer.scan_until(/(?=\n?endstream)/)
        length = @tokenizer.pos - pos
        tok = @tokenizer.next_token
      else
        raise_malformed("Stream content must be followed by keyword endstream",
                        pos: @tokenizer.pos)
      end
    end
    # Read the token after 'endstream', which is checked against 'endobj' below.
    tok = @tokenizer.next_token

    # Store the length that was actually used, which may differ from the declared one.
    object[:Length] = length
    stream = StreamData.new(@tokenizer.io, offset: pos, length: length,
                            filter: @document.unwrap(object[:Filter]),
                            decode_parms: @document.unwrap(object[:DecodeParms]))
  end

  # A missing 'endobj' is only reported through maybe_raise; the parsed values are still
  # returned if that call returns.
  unless tok.kind_of?(Tokenizer::Token) && tok == 'endobj'
    maybe_raise("Indirect object must be followed by keyword endobj", pos: @tokenizer.pos)
  end

  [object, oid, gen, stream]
end

#parse_xref_section_and_trailer(offset) ⇒ Object

Parses the cross-reference section at the given position and the following trailer and returns them as an array consisting of an HexaPDF::XRefSection instance and a hash.

This method can only parse cross-reference sections, not cross-reference streams!

See: PDF2.0 s7.5.4, s7.5.5; ADB1.7 sH.3-3.4.3



279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
# File 'lib/hexapdf/parser.rb', line 279

# Parses the cross-reference section at the given position and the following trailer and
# returns them as an array consisting of an HexaPDF::XRefSection instance and a hash.
#
# +offset+ is relative to the PDF header (@header_offset is added before positioning the
# tokenizer).
#
# This method can only parse cross-reference sections, not cross-reference streams!
#
# Problems are reported through +raise_malformed+ and +maybe_raise+, both defined elsewhere.
#
# See: PDF2.0 s7.5.4, s7.5.5; ADB1.7 sH.3-3.4.3
def parse_xref_section_and_trailer(offset)
  @tokenizer.pos = offset + @header_offset
  token = @tokenizer.next_token
  unless token.kind_of?(Tokenizer::Token) && token == 'xref'
    raise_malformed("Xref section doesn't start with keyword xref", pos: @tokenizer.pos)
  end

  xref = XRefSection.new
  # Each subsection starts with two integers: the first object number and the number of
  # entries. The loop ends at the first token that is not an Integer (expected: 'trailer').
  start = @tokenizer.next_token
  while start.kind_of?(Integer)
    number_of_entries = @tokenizer.next_token
    unless number_of_entries.kind_of?(Integer)
      raise_malformed("Invalid cross-reference subsection start", pos: @tokenizer.pos)
    end

    @tokenizer.skip_whitespace
    start.upto(start + number_of_entries - 1) do |oid|
      # The block is invoked by the tokenizer for invalid entries; the error is forced when
      # the tokenizer signals that the entry is not recoverable.
      pos, gen, type = @tokenizer.next_xref_entry do |recoverable|
        maybe_raise("Invalid cross-reference entry", pos: @tokenizer.pos,
                    force: !recoverable)
      end
      if xref.entry?(oid)
        # An entry for this oid already exists; the first one seen is kept.
        next
      elsif type == 'n'
        # In-use entries with offset 0 or a generation above 65535 are reported and then
        # recorded as free entries.
        if pos == 0 || gen > 65535
          maybe_raise("Invalid in use cross-reference entry for object number #{oid}",
                      pos: @tokenizer.pos)
          xref.add_free_entry(oid, gen)
        else
          xref.add_in_use_entry(oid, gen, pos)
        end
      else
        xref.add_free_entry(oid, gen)
      end
    end
    start = @tokenizer.next_token
  end

  unless start.kind_of?(Tokenizer::Token) && start == 'trailer'
    raise_malformed("Trailer doesn't start with keyword trailer", pos: @tokenizer.pos)
  end

  trailer = @tokenizer.next_object
  unless trailer.kind_of?(Hash)
    raise_malformed("Trailer is #{trailer.class} instead of dictionary ", pos: @tokenizer.pos)
  end

  # Only for a section without a :Prev key that has entries (max_oid != 0) but none for oid 0:
  # the numbering is invalid, so try to correct it by shifting all object numbers.
  unless trailer[:Prev] || xref.max_oid == 0 || xref.entry?(0)
    first_entry = xref[xref.oids[0]]
    test_entry = xref[xref.oids[-1]]
    # Read the object number that is actually stored at the position of the last entry.
    @tokenizer.pos = test_entry.pos + @header_offset
    test_oid = @tokenizer.next_token
    first_oid = first_entry.oid

    # The correction is only attempted if the first entry is a free entry with generation
    # 65535 and the last entry's oid differs from the oid found in the file by exactly
    # first_oid; otherwise the error is forced.
    force_failure = !first_entry.free? || first_entry.gen != 65535 ||
      !test_oid.kind_of?(Integer) || xref.oids[-1] - test_oid != first_oid
    maybe_raise("Main cross-reference section has invalid numbering",
                pos: offset + @header_offset, force: force_failure)

    # Rebuild the section with every oid reduced by first_oid. The entries themselves are
    # modified and reused.
    new_xref = XRefSection.new
    xref.oids.each do |oid|
      entry = xref[oid]
      entry.oid -= first_oid
      # NOTE(review): []= is invoked through send, presumably because it is not public on
      # XRefSection - confirm.
      new_xref.send(:[]=, entry.oid, entry.gen, entry)
    end
    xref = new_xref
  end

  [xref, trailer]
end

#reconstructed? ⇒ Boolean

Returns true if the PDF file was damaged and could be reconstructed.

Returns:

  • (Boolean)


69
70
71
# File 'lib/hexapdf/parser.rb', line 69

# Returns +true+ if the PDF file was damaged and could be reconstructed, i.e. if a
# reconstructed revision has been stored in @reconstructed_revision.
def reconstructed?
  return false if @reconstructed_revision.nil?

  true
end

#reconstructed_revision ⇒ Object

Returns the reconstructed revision.



408
409
410
# File 'lib/hexapdf/parser.rb', line 408

# Returns the reconstructed revision.
#
# The revision is built through +reconstruct_revision+ on first use and stored in
# @reconstructed_revision; later calls return the stored value.
def reconstructed_revision
  @reconstructed_revision = reconstruct_revision unless @reconstructed_revision
  @reconstructed_revision
end

#startxref_offset ⇒ Object

Returns the offset of the main cross-reference section/stream.

Implementation note: Normally, the %%EOF marker has to be on the last line, however, Adobe viewers relax this restriction and so do we.

If strict parsing is disabled, the whole file is searched for the offset.

See: PDF2.0 s7.5.5, ADB1.7 sH.3-3.4.4



358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
# File 'lib/hexapdf/parser.rb', line 358

# Returns the offset of the main cross-reference section/stream.
#
# The file is scanned backwards from its end in chunks, looking for a line containing only
# '%%EOF' that is preceded by the 'startxref' keyword and the offset value. The scan stops at
# the first offset found or when the start of the file is reached.
#
# Problems (keyword and value on the same line, missing keyword, missing end-of-file marker)
# are reported through +maybe_raise+ (defined elsewhere); for the latter two the error is
# forced when no offset could be determined.
#
# The result is cached in @startxref_offset once the method finishes.
def startxref_offset
  # defined? is used so that a cached nil result is returned as well.
  return @startxref_offset if defined?(@startxref_offset)

  @io.seek(0, IO::SEEK_END)
  step_size = 1024
  pos = @io.pos
  # An empty file cannot contain an end-of-file marker.
  eof_not_found = pos == 0
  startxref_missing = startxref_mangled = false
  startxref_offset = nil

  while pos != 0
    # Move one step towards the start of the file and read slightly more than one step
    # (step_size + 40 bytes), presumably so that a trailer spanning two chunks is still seen
    # as a whole.
    @io.pos = [pos - step_size, 0].max
    pos = @io.pos
    lines = @io.read(step_size + 40).split(/[\r\n]+/)

    # Need to iterate through the whole lines array in case there are multiple %%EOF to try
    # With eof_index = 0 the first slice is lines[0..-1], i.e. all lines; afterwards only the
    # lines before the previously found marker are searched.
    eof_index = 0
    while (eof_index = lines[0..(eof_index - 1)].rindex {|l| l.strip == '%%EOF' })
      if eof_index > 0 && lines[eof_index - 1].strip =~ /\Astartxref\s(\d+)\z/
        startxref_offset = $1.to_i
        startxref_mangled = true
        break # we found it even if the syntax is not entirely correct
      elsif eof_index < 2
        # Fewer than two lines precede the marker, so keyword and value cannot both be there.
        startxref_missing = true
        break
      elsif lines[eof_index - 2].strip != "startxref"
        # Keep looking for an earlier %%EOF marker in this chunk.
        startxref_missing = true
      else
        startxref_offset = lines[eof_index - 1].to_i
        break # we found it
      end
    end
    # eof_index is nil here when the search above ran out of %%EOF markers in this chunk.
    eof_not_found ||= !eof_index
    break if startxref_offset
  end

  # Only the first matching problem is reported.
  if startxref_mangled
    maybe_raise("PDF file trailer keyword startxref on same line as value", pos: pos)
  elsif startxref_missing
    maybe_raise("PDF file trailer is missing startxref keyword", pos: pos,
                force: !startxref_offset)
  elsif eof_not_found
    maybe_raise("PDF file trailer with end-of-file marker not found", pos: pos,
                force: !startxref_offset)
  end

  @startxref_offset = startxref_offset
end

#xref_section?(offset) ⇒ Boolean

Looks at the given offset and returns true if there is a cross-reference section at that position.

Returns:

  • (Boolean)


267
268
269
270
271
# File 'lib/hexapdf/parser.rb', line 267

# Looks at the given offset and returns +true+ if there is a cross-reference section at that
# position, i.e. if the next token there is the keyword 'xref'.
#
# +offset+ is relative to the PDF header (@header_offset is added). The tokenizer is
# repositioned, but the token is only peeked at, not consumed.
def xref_section?(offset)
  @tokenizer.pos = offset + @header_offset
  candidate = @tokenizer.peek_token
  return false unless candidate.kind_of?(Tokenizer::Token)

  candidate == 'xref'
end