Module: Sterilizer

Defined in:
lib/sterilizer.rb,
lib/sterilizer/version.rb

Constant Summary collapse

VERSION =
"0.0.4"

Instance Method Summary collapse

Instance Method Details

#default_encoding ⇒ Object



76
77
78
# File 'lib/sterilizer.rb', line 76

# Encoding that sterilized strings are converted to.
#
# @return [Encoding, String] Encoding.default_internal when one is set,
#   otherwise the literal name "UTF-8"
def default_encoding
  configured = Encoding.default_internal
  return configured if configured

  "UTF-8"
end

#encoding_is_default? ⇒ Boolean

Returns:

  • (Boolean)


39
40
41
# File 'lib/sterilizer.rb', line 39

# Whether the receiver is currently labelled with the default encoding.
#
# default_encoding can return an encoding *name* (the String "UTF-8") instead
# of an Encoding object, and an Encoding never compares equal to a String.
# The value is therefore resolved through Encoding.find before comparing;
# Encoding.find accepts both names and Encoding objects.
#
# @return [Boolean]
def encoding_is_default?
  encoding == Encoding.find(default_encoding)
end

#find_a_valid_encoding(ignoring = [default_encoding], guessed_already = false) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/sterilizer.rb', line 51

# Finds an encoding under which the receiver's bytes form a valid string.
#
# Unless +guessed_already+ is truthy, the encoding reported by guess_encoding
# is tried first and returned if it validates. Otherwise Encoding.list is
# scanned in order and the first encoding that is neither ignored nor
# rejected by valid_when_forced? is returned.
#
# @param ignoring [Array<Encoding, String>, Encoding, String] encodings
#   (objects or names) to skip during the scan; a single value is accepted as
#   well as an array. The argument itself is never modified.
# @param guessed_already [Object] truthy to skip the guess and scan directly
# @return [Encoding, nil] the first acceptable encoding, or nil when every
#   encoding was ignored or rejected
def find_a_valid_encoding(ignoring = [default_encoding], guessed_already = false)
  # Compare by lower-cased name so "UTF-8" and Encoding::UTF_8 are treated
  # alike. Building a fresh list also leaves the caller's array untouched.
  ignored_names = Array(ignoring).map { |enc| enc.to_s.downcase }

  unless guessed_already
    guessed = guess_encoding
    return guessed if valid_when_forced?(guessed)

    # The guess did not validate, so do not offer it again during the scan.
    ignored_names << guessed.to_s.downcase
  end

  Encoding.list.find do |candidate|
    # Encoding#names includes aliases, so an ignored alias is honoured too.
    next false if candidate.names.any? { |name| ignored_names.include?(name.downcase) }

    valid_when_forced?(candidate)
  end
end

#force_encoding_with(encoding) ⇒ Object



72
73
74
# File 'lib/sterilizer.rb', line 72

# Relabels the receiver's bytes as +encoding+ and then transcodes the
# receiver, in place, to default_encoding. Invalid byte sequences and
# characters undefined in the target encoding are replaced instead of raising.
#
# encode! (rather than encode) is used so the receiver and the return value
# are the same, fully converted string; relabelling alone would leave the
# receiver mutated but not transcoded.
#
# @param encoding [Encoding, String] encoding the raw bytes should be read as
# @return [String] the receiver, now in default_encoding
def force_encoding_with(encoding)
  force_encoding(encoding)
  encode!(default_encoding, :invalid => :replace, :undef => :replace)
end

#guess_encoding(guesser = CharDet) ⇒ Object

Use an external library to attempt to (silently) guess the encoding



68
69
70
# File 'lib/sterilizer.rb', line 68

# Asks an external detector to guess the receiver's encoding, passing the
# detector its :silent option.
#
# @param guesser [#detect] detector object; its result is indexed with
#   the key "encoding"
# @return [Encoding] the encoding Encoding.find resolves the reported name to
def guess_encoding(guesser = CharDet)
  detection = guesser.detect(self, :silent => true)
  reported_name = detection["encoding"]
  Encoding.find(reported_name)
end

#sterilize! ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/sterilizer.rb', line 5

# Turns the receiver into a valid string in default_encoding.
#
# Strategy, in order:
# 1. Valid in its current encoding: transcode to default_encoding.
# 2. Invalid, but the bytes are valid as default_encoding: relabel only.
# 3. Otherwise: pick another encoding with find_a_valid_encoding and convert
#    from it with force_encoding_with.
# Any StandardError raised along the way falls back to reading the bytes as
# ASCII and converting from that.
#
# @return [String] the sterilized string
def sterilize!
  # Nothing can be done on a runtime without Encoding support.
  return self unless defined?(Encoding)

  # Already valid: a plain transcode is enough. The options are passed as
  # keywords, not as a braced Hash: on Ruby 3+ a positional Hash counts as a
  # third positional argument, encode! raises ArgumentError, and the rescue
  # below would silently degrade every valid string to the ASCII fallback.
  return encode!(default_encoding, encoding, :undef => :replace, :invalid => :replace) if valid_encoding?

  # From here on the bytes are known to be invalid in the current encoding.
  # If they are valid as the default encoding, relabelling is all it takes.
  return force_encoding(default_encoding) if valid_when_forced?

  # Neither the current nor the default encoding fits, so both are passed
  # (as an Array) to be skipped while looking for one that does.
  source_encoding = find_a_valid_encoding([encoding, default_encoding])
  force_encoding_with(source_encoding)
rescue
  # Best-effort last resort: never raise out of sterilize!.
  force_encoding_with("ASCII")
end

#valid_and_default? ⇒ Boolean

Returns:

  • (Boolean)


43
44
45
# File 'lib/sterilizer.rb', line 43

# Whether the receiver is valid in its current encoding and, in addition,
# encoding_is_default? holds for it.
#
# @return [Boolean]
def valid_and_default?
  return false unless valid_encoding?

  encoding_is_default?
end

#valid_when_forced?(encoding = default_encoding) ⇒ Boolean

Returns:

  • (Boolean)


47
48
49
# File 'lib/sterilizer.rb', line 47

# Checks whether the receiver's bytes would be valid if read as +encoding+.
# The check runs on a copy, so the receiver keeps its current encoding.
#
# @param encoding [Encoding, String] encoding to test; defaults to
#   default_encoding
# @return [Boolean]
def valid_when_forced?(encoding = default_encoding)
  relabelled = dup
  relabelled.force_encoding(encoding)
  relabelled.valid_encoding?
end