Class: PDF::Reader::Encoding

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader/encoding.rb

Overview

Util class for working with string encodings in PDF files. Mostly used to convert strings of various PDF-dialect encodings into UTF-8.

Constant Summary collapse

CONTROL_CHARS =

:nodoc:

[0,1,2,3,4,5,6,7,8,11,12,14,15,16,17,18,19,20,21,22,23,
24,25,26,27,28,29,30,31]
UNKNOWN_CHAR =

: Integer # ▯

0x25AF

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(enc) ⇒ Encoding

: (Hash[Symbol, untyped] | Symbol | nil) -> void



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/pdf/reader/encoding.rb', line 42

# Build an encoding from a PDF encoding description.
#
# enc may be:
#
# * a Hash - the encoding name is read from :Encoding, falling back to
#   :BaseEncoding and then to :StandardEncoding; a :Differences entry, if
#   present, is applied after the base mapping has been loaded
# * anything that responds to to_sym - used as the encoding name
# * nil - :StandardEncoding is used
def initialize(enc)
  # maps from character codes to Unicode codepoints
  @mapping  = default_mapping #: Hash[Integer, Integer]

  # maps from character codes to UTF-8 strings.
  @string_cache  = {} #: Hash[Integer, String]

  @enc_name = :StandardEncoding #: Symbol
  if enc.kind_of?(Hash)
    # A dictionary that names no encoding (e.g. one holding only
    # :Differences) keeps the default rather than replacing it with nil.
    @enc_name = enc[:Encoding] || enc[:BaseEncoding] || @enc_name
  elsif enc && enc.respond_to?(:to_sym)
    @enc_name = enc.to_sym
  end

  @unpack   = get_unpack(@enc_name) #: String
  @map_file = get_mapping_file(@enc_name) #: String | nil
  @differences = nil #: Hash[Integer, Integer] | nil
  @glyphlist = nil #: PDF::Reader::GlyphHash | nil

  load_mapping(@map_file) if @map_file

  # Differences are applied last so they override entries loaded from the
  # base mapping file.
  if enc.is_a?(Hash) && enc[:Differences]
    self.differences = enc[:Differences]
  end
end

Instance Attribute Details

#unpackObject (readonly)

: String



39
40
41
# File 'lib/pdf/reader/encoding.rb', line 39

# Reader for @unpack, the format string that to_utf8 passes to String#unpack
# when splitting a raw string into character codes. The value is assigned in
# initialize from get_unpack(@enc_name).
def unpack
  @unpack
end

Instance Method Details

#differencesObject

: () -> Hash[Integer, Integer]



96
97
98
99
# File 'lib/pdf/reader/encoding.rb', line 96

# Returns the table recorded by differences=, creating (and remembering) an
# empty Hash when nothing has been recorded yet.
#
# According to the original note, this reader is only used by the spec tests.
def differences
  @differences = {} unless @differences
  @differences
end

#differences=(diff) ⇒ Object

Set the differences table for this encoding. The argument should be an array in the following format:

[25, :A, 26, :B]

The array alternates between a decimal byte number and a glyph name to map to that byte

To save space, the following array is also valid and equivalent to the previous one

[25, :A, :B]

: (Array[Integer | Symbol]) -> Hash[Integer, Integer]



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/pdf/reader/encoding.rb', line 78

# Set the differences table for this encoding.
#
# diff is an Array that mixes character codes and glyph names. A number sets
# the current code; each glyph name that follows is assigned to the current
# code, which then advances by one. So [25, :A, 26, :B] and [25, :A, :B] are
# equivalent.
#
# For every name the glyph list can resolve, the name is stored in the
# differences table and the resolved codepoint overrides @mapping for that
# code. A name that cannot be resolved is not recorded, but it still occupies
# its code so that the names after it are not shifted onto the wrong codes.
#
# Type checking of diff is delegated to PDF::Reader::Error.validate_type.
# Returns the rebuilt differences table.
def differences=(diff)
  PDF::Reader::Error.validate_type(diff, "diff", Array)

  @differences = {}
  byte = 0
  diff.each do |val|
    if val.kind_of?(Numeric)
      # a number restarts the run at that character code
      byte = val.to_i
      next
    end

    codepoint = glyphlist.name_to_unicode(val)
    if codepoint
      @differences[byte] = val
      @mapping[byte] = codepoint
    end

    # every name consumes one code, whether or not it could be resolved
    byte += 1
  end
  @differences
end

#int_to_name(glyph_code) ⇒ Object

convert an integer glyph code into an Adobe glyph name.

int_to_name(65)
=> [:A]

: (Integer) -> Array



132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/pdf/reader/encoding.rb', line 132

# Convert an integer glyph code into Adobe glyph name(s).
#
#   int_to_name(65)
#   => [:A]
#
# Lookup order:
#
# 1. Identity-H / Identity-V encodings always yield an empty Array
# 2. an entry in the differences table yields that single name
# 3. otherwise the code is translated through @mapping and the resulting
#    codepoint is handed to glyphlist.unicode_to_name
#
# Returns an empty Array when the code is unknown.
def int_to_name(glyph_code)
  return [] if @enc_name == :"Identity-H" || @enc_name == :"Identity-V"

  override = differences[glyph_code]
  return [override] if override

  codepoint = @mapping[glyph_code]
  codepoint ? glyphlist.unicode_to_name(codepoint) : []
end

#int_to_utf8_string(glyph_code) ⇒ Object

: (Integer) -> String



122
123
124
# File 'lib/pdf/reader/encoding.rb', line 122

# Return the UTF-8 string for a single glyph code.
#
# Results are memoised in @string_cache, so internal_int_to_utf8_string is
# only consulted the first time a given code is requested.
def int_to_utf8_string(glyph_code)
  cached = @string_cache[glyph_code]
  return cached if cached

  @string_cache[glyph_code] = internal_int_to_utf8_string(glyph_code)
end

#to_utf8(str) ⇒ Object

convert the specified string to utf8

  • unpack raw bytes into codepoints

  • replace any that have entries in the differences table with a glyph name

  • convert codepoints from source encoding to Unicode codepoints

  • convert any glyph names to Unicode codepoints

  • replace characters that didn’t convert to Unicode nicely with something valid

  • pack the final array of Unicode codepoints into a utf-8 string

  • mark the string as utf-8 if we’re running on a M17N aware VM

: (String) -> String



113
114
115
116
117
118
119
# File 'lib/pdf/reader/encoding.rb', line 113

# Convert the specified string to UTF-8.
#
# In the normal case the work is delegated to convert_to_utf8. When
# utf8_conversion_impossible? reports true, the string is only unpacked
# (using the unpack format) to count its character codes, and that count is
# passed to little_boxes to build the result.
def to_utf8(str)
  return convert_to_utf8(str) unless utf8_conversion_impossible?

  codes = str.unpack(unpack)
  little_boxes(codes.size)
end