Module: Sterilizer
- Defined in:
- lib/sterilizer.rb,
lib/sterilizer/version.rb
Constant Summary collapse
- VERSION = "0.0.4"
Instance Method Summary collapse
- #default_encoding ⇒ Object
- #encoding_is_default? ⇒ Boolean
- #find_a_valid_encoding(ignoring = [default_encoding], guessed_already = false) ⇒ Object
- #force_encoding_with(encoding) ⇒ Object
- #guess_encoding(guesser = CharDet) ⇒ Object — Uses an external library to attempt to (silently) guess the encoding.
- #sterilize! ⇒ Object
- #valid_and_default? ⇒ Boolean
- #valid_when_forced?(encoding = default_encoding) ⇒ Boolean
Instance Method Details
#default_encoding ⇒ Object
76 77 78 |
# File 'lib/sterilizer.rb', line 76
# Returns the encoding that strings are sterilized to.
#
# This is Encoding.default_internal when one is configured, otherwise UTF-8.
# The fallback is an Encoding object rather than the String "UTF-8" so the
# result always has the same type: comparing it with String#encoding or with
# members of Encoding.list then behaves the same whether or not
# default_internal is set.
#
# @return [Encoding]
def default_encoding
  Encoding.default_internal || Encoding::UTF_8
end
#encoding_is_default? ⇒ Boolean
39 40 41 |
# File 'lib/sterilizer.rb', line 39
# Reports whether the receiver is already labelled with the default encoding.
#
# default_encoding may yield either an Encoding object or an encoding name.
# An Encoding never equals a String, so the value is normalised through
# Encoding.find (which accepts both) before comparing.
#
# @return [Boolean]
def encoding_is_default?
  encoding == Encoding.find(default_encoding)
end
#find_a_valid_encoding(ignoring = [default_encoding], guessed_already = false) ⇒ Object
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/sterilizer.rb', line 51
# Finds an encoding under which the receiver's bytes are valid.
#
# Unless a guess has already been made, guess_encoding is tried first. If the
# guess is unavailable or does not validate, Encoding.list is scanned in order
# and the first encoding that is not ignored and validates is returned.
#
# @param ignoring [Array<Encoding>, Encoding] encoding(s) to skip during the
#   scan; a single encoding is accepted as well as an array, and the caller's
#   array is never modified
# @param guessed_already [Object] any truthy value skips the guessing step
# @return [Encoding, nil] a valid encoding, or nil when every candidate fails
def find_a_valid_encoding(ignoring = [default_encoding], guessed_already = false)
  # Work on a private copy so the argument (or a bare Encoding) is left untouched.
  rejected = Array(ignoring).dup

  unless guessed_already
    guess = begin
      guess_encoding
    rescue ArgumentError, TypeError
      # Encoding.find raises these for an unknown or missing encoding name;
      # a failed guess should fall through to the scan, not abort the search.
      nil
    end

    if guess
      return guess if valid_when_forced?(guess)
      rejected << guess
    end
  end

  Encoding.list.find do |candidate|
    !rejected.include?(candidate) && valid_when_forced?(candidate)
  end
end
#force_encoding_with(encoding) ⇒ Object
72 73 74 |
# File 'lib/sterilizer.rb', line 72
# Relabels the receiver with +encoding+ and transcodes the result to the
# default encoding, substituting any invalid or undefined characters.
#
# Note that String#force_encoding changes the receiver's encoding label in
# place, while String#encode returns a new string: the receiver keeps its
# original bytes under the new label and the transcoded copy is returned.
#
# @param encoding [Encoding, String] encoding to interpret the bytes as
# @return [String] transcoded copy in the default encoding
def force_encoding_with(encoding)
  relabelled = force_encoding(encoding)
  relabelled.encode(default_encoding, :invalid => :replace, :undef => :replace)
end
#guess_encoding(guesser = CharDet) ⇒ Object
Use an external library to attempt to (silently) guess the encoding
68 69 70 |
# File 'lib/sterilizer.rb', line 68
# Uses an external library to attempt to (silently) guess the encoding.
#
# The guesser is called as <tt>guesser.detect(self, :silent => true)</tt> and
# its result is indexed with "encoding" to obtain an encoding name.
# NOTE(review): CharDet is not defined in this file; presumably it comes from
# a character-detection gem loaded elsewhere - confirm.
#
# @param guesser [#detect] detector object
# @return [Encoding, nil] the detected encoding, or nil when the detector
#   reports no name or a name Ruby does not recognise
def guess_encoding(guesser = CharDet)
  detected_name = guesser.detect(self, :silent => true)["encoding"]
  # No name means detection failed; Encoding.find(nil) would raise TypeError.
  return nil if detected_name.nil?

  begin
    Encoding.find(detected_name)
  rescue ArgumentError
    # Encoding.find raises ArgumentError for names it does not know.
    nil
  end
end
#sterilize! ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/sterilizer.rb', line 5
# Converts the receiver, in place, into a valid string in the default encoding.
#
# Steps, in order:
# 1. Nothing is done when the Encoding constant is not defined, or when the
#    string is already valid in the default encoding.
# 2. A string that is valid in its current encoding is transcoded.
# 3. A string whose bytes are valid default-encoding data under the wrong
#    label is simply relabelled.
# 4. Otherwise another encoding under which the bytes validate is looked up
#    and the string is transcoded from it.
# If any step raises, the bytes are treated as ASCII and every invalid or
# undefined character is replaced (deliberate best effort).
#
# @return [String] the receiver
def sterilize!
  return self unless defined?(Encoding)

  return self if valid_and_default?

  # The options must be bare keywords: wrapped in braces they become a third
  # positional argument on Ruby 3+, which raises ArgumentError and would send
  # perfectly valid strings down the lossy ASCII fallback below.
  return encode!(default_encoding, encoding, :undef => :replace, :invalid => :replace) if valid_encoding?

  return force_encoding(default_encoding) if valid_when_forced?

  # Both the current and the default encoding are known to be invalid here,
  # so neither is worth retrying during the search.
  fallback = find_a_valid_encoding([default_encoding, encoding])

  # force_encoding_with returns a transcoded copy; copy it back so the
  # receiver itself ends up sterilized, as the bang name promises.
  replace(force_encoding_with(fallback))
rescue StandardError
  replace(force_encoding_with("ASCII"))
end
#valid_and_default? ⇒ Boolean
43 44 45 |
# File 'lib/sterilizer.rb', line 43
# Reports whether the receiver needs no work at all: its bytes are valid for
# its current encoding and that encoding is the default one.
#
# @return [Boolean]
def valid_and_default?
  return false unless valid_encoding?

  encoding_is_default?
end
#valid_when_forced?(encoding = default_encoding) ⇒ Boolean
47 48 49 |
# File 'lib/sterilizer.rb', line 47
# Reports whether the receiver's bytes would be valid if they were labelled
# with +encoding+. The check runs on a copy, so the receiver keeps its
# current encoding.
#
# @param encoding [Encoding, String] encoding to test; defaults to the
#   result of default_encoding
# @return [Boolean]
def valid_when_forced?(encoding = default_encoding)
  probe = dup
  probe.force_encoding(encoding)
  probe.valid_encoding?
end