Module: Mindee::PDF::PDFTools
- Defined in:
- lib/mindee/pdf/pdf_tools.rb
Overview
Monkey-patching for Origami
Class Method Summary collapse
- .add_content_to_page(page, xobject_name, width, height) ⇒ Object
- .create_xobject(image) ⇒ Object
- .determine_colorspace(image) ⇒ Object
- .determine_filter(image) ⇒ Object
- .process_image_xobject(image_data, image_quality, width, height) ⇒ Object
- .set_page_dimensions(page, width, height) ⇒ Object
- .set_xobject_properties(xobject, image) ⇒ Object
-
.source_text?(pdf_data) ⇒ Boolean
Checks whether the file has source text.
-
.stream_has_text?(stream) ⇒ Boolean
Checks a PDF's stream content for text operators. See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf, pages 243-251.
Instance Method Summary collapse
Class Method Details
.add_content_to_page(page, xobject_name, width, height) ⇒ Object
104 105 106 107 108 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 104
# Replaces the page's content stream with one that draws a single XObject
# scaled to +width+ x +height+ from the origin.
#
# @param page [#Contents=] page whose Contents entry is overwritten.
# @param xobject_name [String, Symbol] name of the XObject to paint with the +Do+ operator.
# @param width [Numeric] horizontal scale written into the transformation matrix.
# @param height [Numeric] vertical scale written into the transformation matrix.
# @return [Origami::Stream] the newly assigned content stream.
def self.add_content_to_page(page, xobject_name, width, height)
  # q/Q save and restore the graphics state around the matrix change and the paint.
  drawing_ops = <<~CONTENT
    q
    #{width} 0 0 #{height} 0 0 cm
    /#{xobject_name} Do
    Q
  CONTENT
  page.Contents = Origami::Stream.new(drawing_ops)
end
.create_xobject(image) ⇒ Object
73 74 75 76 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 73
# Builds an Origami image XObject from an image.
#
# The image is first serialised through ImageUtils.image_to_stringio and then
# handed to Origami, declared as a 'jpg' file.
#
# @param image [Object] image accepted by Mindee::Image::ImageUtils.image_to_stringio.
# @return [Object] whatever Origami::Graphics::ImageXObject.from_image_file returns.
def self.create_xobject(image)
  Origami::Graphics::ImageXObject.from_image_file(
    Mindee::Image::ImageUtils.image_to_stringio(image),
    'jpg'
  )
end
.determine_colorspace(image) ⇒ Object
95 96 97 98 99 100 101 102 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 95
# Maps the colorspace reported in the image's metadata to a PDF device colorspace name.
#
# @param image [#data] object whose +data+ hash may carry a 'colorspace' entry.
# @return [Symbol] :DeviceCMYK, :DeviceGray, or :DeviceRGB for anything else (including a missing entry).
def self.determine_colorspace(image)
  device_names = {
    'CMYK' => :DeviceCMYK,
    'Gray' => :DeviceGray,
    'PseudoClass Gray' => :DeviceGray,
  }
  device_names.fetch(image.data['colorspace'], :DeviceRGB)
end
.determine_filter(image) ⇒ Object
86 87 88 89 90 91 92 93 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 86
# Maps the compression filter reported in the image's metadata to a PDF stream filter name.
#
# @param image [#data] object whose +data+ hash may carry a 'properties' => { 'filter' => ... } entry.
# @return [Symbol] :FlateDecode for Zip, :LZWDecode for LZW, :DCTDecode otherwise
#   (including when the metadata has no 'properties' or no 'filter' entry).
def self.determine_filter(image)
  # dig tolerates a missing 'properties' hash instead of raising NoMethodError on nil.
  filter = image.data.dig('properties', 'filter')
  case filter
  when %r{Zip}i then :FlateDecode
  when %r{LZW}i then :LZWDecode
  else :DCTDecode
  end
end
.process_image_xobject(image_data, image_quality, width, height) ⇒ Object
115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 115
# Compresses image data and wraps the result in a fresh Origami image XObject.
#
# @param image_data [Object] image payload passed to Image::ImageCompressor.compress_image.
# @param image_quality [Object] forwarded as the compressor's +quality:+ option.
# @param width [Numeric] forwarded as +max_width:+ and stored as the XObject's Width.
# @param height [Numeric] forwarded as +max_height:+ and stored as the XObject's Height.
# @return [Origami::Graphics::ImageXObject] XObject declared as 8-bit DeviceRGB.
def self.process_image_xobject(image_data, image_quality, width, height)
  payload = Image::ImageCompressor.compress_image(
    image_data,
    quality: image_quality,
    max_width: width,
    max_height: height
  )

  Origami::Graphics::ImageXObject.new.tap do |xobject|
    xobject.data = payload
    xobject.Width = width
    xobject.Height = height
    xobject.ColorSpace = :DeviceRGB
    xobject.BitsPerComponent = 8
  end
end
.set_page_dimensions(page, width, height) ⇒ Object
110 111 112 113 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 110
# Sets both the MediaBox and the CropBox of a page to a rectangle anchored at the origin.
#
# @param page [#[]=] page (or any keyed container) receiving the two box entries.
# @param width [Numeric] right edge of the rectangle.
# @param height [Numeric] top edge of the rectangle.
# @return [Array] the rectangle assigned to the CropBox.
def self.set_page_dimensions(page, width, height)
  rectangle = [0, 0, width, height]
  page[:MediaBox] = rectangle
  # Each box gets its own array so the two entries never alias each other.
  page[:CropBox] = rectangle.dup
end
.set_xobject_properties(xobject, image) ⇒ Object
78 79 80 81 82 83 84 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 78
# Fills in the image-related entries of an XObject's dictionary from an image.
#
# @param xobject [#dictionary] XObject whose dictionary is updated in place.
# @param image [#data, #[]] image providing metadata (+data+) and its +:width+ / +:height+.
# @return [Symbol] the colorspace name written last.
def self.set_xobject_properties(xobject, image)
  properties = xobject.dictionary
  properties[:BitsPerComponent] = 8
  properties[:Filter] = determine_filter(image)
  properties[:Width] = image[:width]
  properties[:Height] = image[:height]
  properties[:ColorSpace] = determine_colorspace(image)
end
.source_text?(pdf_data) ⇒ Boolean
Checks whether the file has source text. Returns false if the file isn't a PDF.
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 48
# Checks whether any page of the PDF has a content stream containing text operators.
#
# The input is rewound, parsed with Origami, and every page's Contents entry
# (a single stream or an array of streams) is inspected with stream_has_text?.
#
# @param pdf_data [#rewind] IO-like object holding the document.
# @return [Boolean] true as soon as one stream reports text; false when none does
#   or when Origami raises InvalidPDFError.
def self.source_text?(pdf_data)
  pdf_data.rewind
  document = Origami::PDF.read(pdf_data)

  document.each_page do |page|
    raw_contents = page[:Contents]
    next unless raw_contents

    resolved = raw_contents.solve
    # A page may reference one stream or an array of them; normalise to a list.
    stream_refs = resolved.is_a?(Origami::Array) ? resolved : [resolved]
    stream_refs.each do |stream_ref|
      return true if stream_has_text?(stream_ref.solve)
    end
  end

  false
rescue Origami::InvalidPDFError
  false
end
.stream_has_text?(stream) ⇒ Boolean
Checks a PDF's stream content for text operators. See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf, pages 243-251.
37 38 39 40 41 42 43 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 37
# Checks a PDF content stream for text operators.
#
# Recognised operators (PDF 32000-1:2008, section 9.3-9.4): the text-state
# operators Tc Tw Tz TL Tf Tr Ts, the text-positioning operators Td TD Tm T*,
# and the text-showing operators Tj TJ ' ".
#
# Operators are matched as whole tokens: they must be preceded by whitespace,
# the end of an operand (")", "]", ">") or the start of the data, and followed
# by whitespace, the start of another token or the end of the data. A resource
# name such as /Thumb or /Tc1, or an apostrophe inside a string, therefore does
# not count as text.
#
# @param stream [#data] stream object exposing its content as a String (or nil).
# @return [Boolean] true when at least one text operator token is present.
def self.stream_has_text?(stream)
  data = stream.data
  return false if data.nil? || data.empty?

  # Match against a binary copy so arbitrary stream bytes cannot raise an
  # invalid-byte-sequence error in the regexp engine.
  data.b.match?(%r{(?<![^\s\x00)\]>])(?:T[cwzLfrsdDmjJ*]|['"])(?![^\s\x00(\[</%])})
end
Instance Method Details
#to_io_stream(params = {}) ⇒ StringIO
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/mindee/pdf/pdf_tools.rb', line 8
# Serialises the document into an in-memory, binary-encoded StringIO.
#
# NOTE(review): the receiver is presumably an Origami::PDF (this module is
# described as a monkey-patch for Origami) - confirm against the include site.
#
# @param params [Hash] overrides merged over the defaults
#   (+delinearize: true+, +recompile: true+, +decrypt: false+). An +:intent+
#   matching "pdf/A1", "pdf-A", "PDFA1", ... (case-insensitive) triggers
#   +intents_as_pdfa1+.
# @return [StringIO] stream over the generated output, encoded as BINARY.
def to_io_stream(params = {})
  options = {
    delinearize: true,
    recompile: true,
    decrypt: false,
  }
  options.update(params)

  if frozen? # incompatible flags with frozen doc (signed)
    options[:recompile] = nil
    options[:rebuild_xrefs] = nil
    options[:noindent] = nil
    options[:obfuscate] = false
  end
  load_all_objects unless @loaded

  # The +i+ flag must sit outside the braces; inside them it was matched as
  # the literal characters "/i" and the pattern stayed case-sensitive.
  intents_as_pdfa1 if options[:intent] =~ %r{pdf[/-]?A1?}i
  delinearize! if options[:delinearize] && linearized?
  compile(options) if options[:recompile]

  io_stream = StringIO.new(output(options))
  io_stream.set_encoding Encoding::BINARY
  io_stream
end