Module: Mindee::PDF::PDFTools

Defined in:
lib/mindee/pdf/pdf_tools.rb

Overview

Monkey-patching for Origami

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.add_content_to_page(page, xobject_name, width, height) ⇒ Object



104
105
106
107
108
# File 'lib/mindee/pdf/pdf_tools.rb', line 104

# Replaces the page's content stream with one that paints a single XObject.
#
# The generated stream saves the graphics state (+q+), applies a matrix that
# scales by +width+ x +height+ (+cm+), paints the named XObject (+Do+) and
# restores the graphics state (+Q+).
#
# @param page [#Contents=] Page receiving the new content stream.
# @param xobject_name [#to_s] Name of the XObject to paint, without the leading slash.
# @param width [Numeric] Horizontal scale written into the matrix.
# @param height [Numeric] Vertical scale written into the matrix.
# @return [Origami::Stream] The stream that was assigned to the page.
def self.add_content_to_page(page, xobject_name, width, height)
  drawing_ops = <<~CONTENT
    q
    #{width} 0 0 #{height} 0 0 cm
    /#{xobject_name} Do
    Q
  CONTENT
  page.Contents = Origami::Stream.new(drawing_ops)
end

.create_xobject(image) ⇒ Object



73
74
75
76
# File 'lib/mindee/pdf/pdf_tools.rb', line 73

# Builds an Origami image XObject from an image.
#
# The image is first converted to an IO object by
# +Mindee::Image::ImageUtils.image_to_stringio+, then handed to Origami with
# the +'jpg'+ format hint.
#
# @param image [Object] Image accepted by +ImageUtils.image_to_stringio+.
# @return [Object] Whatever +ImageXObject.from_image_file+ returns.
def self.create_xobject(image)
  Origami::Graphics::ImageXObject.from_image_file(
    Mindee::Image::ImageUtils.image_to_stringio(image),
    'jpg'
  )
end

.determine_colorspace(image) ⇒ Object



95
96
97
98
99
100
101
102
# File 'lib/mindee/pdf/pdf_tools.rb', line 95

# Maps the image's reported colorspace to a PDF device colorspace name.
#
# Reads +image.data['colorspace']+; +'CMYK'+ maps to +:DeviceCMYK+, +'Gray'+
# and +'PseudoClass Gray'+ map to +:DeviceGray+, and anything else
# (including a missing value) falls back to +:DeviceRGB+.
#
# @param image [#data] Object whose +data+ hash holds a +'colorspace'+ entry.
# @return [Symbol] +:DeviceCMYK+, +:DeviceGray+ or +:DeviceRGB+.
def self.determine_colorspace(image)
  source_space = image.data['colorspace']
  return :DeviceCMYK if source_space == 'CMYK'
  return :DeviceGray if ['Gray', 'PseudoClass Gray'].include?(source_space)

  :DeviceRGB
end

.determine_filter(image) ⇒ Object



86
87
88
89
90
91
92
93
# File 'lib/mindee/pdf/pdf_tools.rb', line 86

# Picks the PDF stream filter matching the image's reported compression.
#
# Reads +image.data['properties']['filter']+ and matches it
# case-insensitively: a value containing "Zip" selects +:FlateDecode+, one
# containing "LZW" selects +:LZWDecode+, and everything else (including a
# missing value) selects +:DCTDecode+.
#
# @param image [#data] Object whose +data['properties']+ hash holds a +'filter'+ entry.
# @return [Symbol] +:FlateDecode+, +:LZWDecode+ or +:DCTDecode+.
def self.determine_filter(image)
  case image.data['properties']['filter']
  when %r{Zip}i
    :FlateDecode
  when %r{LZW}i
    :LZWDecode
  else
    :DCTDecode
  end
end

.process_image_xobject(image_data, image_quality, width, height) ⇒ Object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/mindee/pdf/pdf_tools.rb', line 115

# Compresses image data and wraps the result in a new image XObject.
#
# The data is run through +Image::ImageCompressor.compress_image+ with the
# given quality and with +width+ / +height+ as maximum dimensions. The new
# XObject is always declared as 8 bits per component in +:DeviceRGB+, with
# +Width+ and +Height+ set to the values passed in.
#
# @param image_data [Object] Image data accepted by +ImageCompressor.compress_image+.
# @param image_quality [Object] Forwarded as the +quality:+ option.
# @param width [Numeric] Maximum width for compression; also written as the XObject width.
# @param height [Numeric] Maximum height for compression; also written as the XObject height.
# @return [Origami::Graphics::ImageXObject] The newly built XObject.
def self.process_image_xobject(image_data, image_quality, width, height)
  recompressed = Image::ImageCompressor.compress_image(
    image_data, quality: image_quality, max_width: width, max_height: height
  )

  Origami::Graphics::ImageXObject.new.tap do |xobject|
    xobject.data = recompressed
    xobject.Width = width
    xobject.Height = height
    xobject.ColorSpace = :DeviceRGB
    xobject.BitsPerComponent = 8
  end
end

.set_page_dimensions(page, width, height) ⇒ Object



110
111
112
113
# File 'lib/mindee/pdf/pdf_tools.rb', line 110

# Sets both the media box and the crop box of a page to the same rectangle.
#
# Each box becomes +[0, 0, width, height]+; every box gets its own array
# instance.
#
# @param page [#[]=] Page (or dictionary-like object) to update.
# @param width [Numeric] Upper-right x coordinate of both boxes.
# @param height [Numeric] Upper-right y coordinate of both boxes.
# @return [Array] The rectangle assigned to +:CropBox+.
def self.set_page_dimensions(page, width, height)
  %i[MediaBox CropBox].map { |box_name| page[box_name] = [0, 0, width, height] }.last
end

.set_xobject_properties(xobject, image) ⇒ Object



78
79
80
81
82
83
84
# File 'lib/mindee/pdf/pdf_tools.rb', line 78

# Fills in the image-related entries of an XObject's dictionary.
#
# Sets +:BitsPerComponent+ to 8, takes +:Width+ and +:Height+ from the image,
# and derives +:Filter+ and +:ColorSpace+ through +determine_filter+ and
# +determine_colorspace+.
#
# @param xobject [#dictionary] XObject whose dictionary is updated in place.
# @param image [#[], #data] Image providing +:width+, +:height+ and metadata.
# @return [Symbol] The colorspace value that was assigned last.
def self.set_xobject_properties(xobject, image)
  entries = xobject.dictionary
  entries[:BitsPerComponent] = 8
  entries[:Filter] = determine_filter(image)
  entries[:Width] = image[:width]
  entries[:Height] = image[:height]
  entries[:ColorSpace] = determine_colorspace(image)
end

.source_text?(pdf_data) ⇒ Boolean

Checks whether the PDF contains source text. Returns false if the file isn't a PDF.

Parameters:

  • pdf_data (StringIO)

Returns:

  • (Boolean)

    True if the pdf has source text, false otherwise.



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/mindee/pdf/pdf_tools.rb', line 48

# Checks whether a PDF contains source text.
#
# The input is rewound and parsed with Origami, then every content stream of
# every page is checked with +stream_has_text?+. Scanning stops at the first
# stream that contains a text operator.
#
# @param pdf_data [StringIO] IO holding the PDF bytes; it is rewound before reading.
# @return [Boolean] true if any page content stream contains a text operator,
#   false otherwise (including when Origami rejects the input as an invalid PDF).
def self.source_text?(pdf_data)
  pdf_data.rewind
  pdf = Origami::PDF.read(pdf_data)

  pdf.each_page do |page|
    next unless page[:Contents]

    # /Contents may resolve to a single stream or to an array of streams;
    # normalise to a list so both shapes are handled the same way.
    contents = page[:Contents].solve
    contents = [contents] unless contents.is_a?(Origami::Array)

    return true if contents.any? { |stream_ref| stream_has_text?(stream_ref.solve) }
  end

  false
rescue Origami::InvalidPDFError
  false
end

.stream_has_text?(stream) ⇒ Boolean

Checks a PDF's stream content for text operators. See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf, pages 243-251.

Parameters:

  • stream (StringIO)

    Stream object from a PDF's page.

Returns:

  • (Boolean)

    True if a text operator is found in the stream.



37
38
39
40
41
42
43
# File 'lib/mindee/pdf/pdf_tools.rb', line 37

# Checks a PDF content stream for text operators.
# See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf
# pages 243-251.
#
# This is a plain substring search over the stream data, so it is a heuristic:
# an operator sequence appearing inside a name or other data also counts.
#
# @param stream [#data] Stream object from a PDF page; +data+ may be nil or empty.
# @return [Boolean] true if a text operator is found in the stream.
def self.stream_has_text?(stream)
  data = stream.data
  return false if data.nil? || data.empty?

  # Operator names as listed in PDF 32000-1:2008. 'Th' and 'Tk' are text-state
  # *parameter* names, not operators; the matching operators are Tz
  # (horizontal scaling) and Ts (rise).
  text_operators = [
    'Tc', 'Tw', 'Tz', 'TL', 'Tf', 'Tr', 'Ts', # text state
    'Td', 'TD', 'Tm', 'T*',                   # text positioning
    'Tj', 'TJ', "'", '"' # text showing
  ]
  text_operators.any? { |op| data.include?(op) }
end

Instance Method Details

#to_io_stream(params = {}) ⇒ StringIO

Returns:

  • (StringIO)


8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/mindee/pdf/pdf_tools.rb', line 8

# Serialises the document into an in-memory binary stream.
#
# Defaults are +delinearize: true+, +recompile: true+ and +decrypt: false+;
# any key in +params+ overrides them. When +params[:intent]+ names PDF/A-1
# (e.g. "pdf/A1", "PDF-A", "pdfa"; matched case-insensitively),
# +intents_as_pdfa1+ is applied before output.
#
# NOTE(review): this appears to be mixed into Origami's PDF class and relies
# on its +load_all_objects+, +compile+ and +output+ methods — confirm against
# the including class.
#
# @param params [Hash] Output options merged over the defaults.
# @return [StringIO] Binary-encoded stream holding the generated output.
def to_io_stream(params = {})
  options = {
    delinearize: true,
    recompile: true,
    decrypt: false,
  }.merge(params)

  if frozen? # incompatible flags with frozen doc (signed)
    options[:recompile] = nil
    options[:rebuild_xrefs] = nil
    options[:noindent] = nil
    options[:obfuscate] = false
  end
  load_all_objects unless @loaded

  # The case-insensitive flag belongs after the closing delimiter; inside the
  # pattern it would require a literal "/i" suffix and never match.
  intents_as_pdfa1 if options[:intent] =~ %r{pdf[/-]?A1?}i
  delinearize! if options[:delinearize] && linearized?
  compile(options) if options[:recompile]

  io_stream = StringIO.new(output(options))
  io_stream.set_encoding Encoding::BINARY
  io_stream
end