Class: String

Inherits:

Object

Object
String

show all

Defined in: lib/rcs-common/sanitize.rb,
lib/rcs-common/binary.rb,
lib/rcs-common/utf16le.rb,
lib/rcs-common/keywords.rb,
lib/rcs-common/pascalize.rb

Overview

Here we re-open the Ruby String class, so no namespace must be specified.

Direct Known Subclasses

RCS::Common::WinFirewall::AdvfirewallResponse

Constant Summary collapse

# Matches every run of characters that are neither alphanumeric, graphical,
# a line feed nor a carriage return.
REMOVE_INVALID_CHARS_REGEXP = /([^[:alnum:][:graph:]\n\r])+/u

Instance Method Summary collapse

#binary_add_at_offset(offset, value) ⇒ Object
#binary_patch(match, replace) ⇒ Object
#binary_patch_at_offset(offset, replace) ⇒ Object
#force_utf8(modify_self = false) ⇒ Object
#force_utf8! ⇒ Object
#keywords ⇒ Object
#pascalize ⇒ Object

returns a string encoded into a pascalized form.
#remove_invalid_chars ⇒ Object
#safe_utf8_encode ⇒ Object
#safe_utf8_encode_invalid ⇒ Object
#strip_html_tags ⇒ Object
#terminate_utf16le ⇒ Object
#to_binary ⇒ Object
#to_utf16le ⇒ Object
#to_utf16le_binary ⇒ Object
#to_utf16le_binary_null ⇒ Object
#unpascalize ⇒ Object

returns a string decoded from its pascalized form.
#unpascalize_ary ⇒ Object

returns an array containing all the concatenated pascalized strings.
#utf16le_to_utf8 ⇒ Object

Instance Method Details

#binary_add_at_offset(offset, value) ⇒ `Object`

Raises:

(OutOfBounds)

# File 'lib/rcs-common/binary.rb', line 46

# Adds +value+ to the unsigned integer stored at byte +offset+ (read and
# written with the native 'I' pack directive) and stores the sum back in place.
#
# @param offset [Integer] byte offset of the integer to modify
# @param value [Integer] amount to add to the stored integer
# @return [String] self, modified in place
# @raise [OutOfBounds] if +offset+ is negative or fewer than 4 bytes are
#   available starting at +offset+
def binary_add_at_offset(offset, value)
  io = StringIO.new(self)

  # A full 4-byte word must be readable at offset; otherwise io.read would
  # return nil or a short string and the arithmetic below would fail on nil.
  raise OutOfBounds if offset < 0
  raise OutOfBounds if offset + 4 > io.size

  io.pos = offset
  current = io.read(4).unpack('I').first

  # rewind to the same position and overwrite the word with the new value
  io.pos = offset
  io.write [current + value].pack('I')
  self
ensure
  io.close if io
end

#binary_patch(match, replace) ⇒ `Object`

Raises:

(MatchNotFound)

# File 'lib/rcs-common/binary.rb', line 24

# Replaces, in place, every occurrence of the byte sequence +match+ with
# +replace+.
#
# Both arguments are compared and inserted as ASCII-8BIT byte strings. They
# are copied before being re-tagged, so the caller's strings keep their
# encoding and frozen arguments are accepted.
#
# @param match [String] byte sequence to look for
# @param replace [String] byte sequence to insert instead
# @return [String] self, modified in place
# @raise [MatchNotFound] if +match+ does not occur in the string
def binary_patch(match, replace)
  pattern = match.dup.force_encoding('ASCII-8BIT')
  substitute = replace.dup.force_encoding('ASCII-8BIT')

  # check with the same binary pattern that gsub! is going to search for
  raise MatchNotFound unless self[pattern]

  # use the block form so backslash sequences in the replacement are taken
  # literally instead of being interpreted as back-references
  gsub!(pattern) { substitute }
end

#binary_patch_at_offset(offset, replace) ⇒ `Object`

Raises:

(OutOfBounds)

# File 'lib/rcs-common/binary.rb', line 32

# Overwrites the bytes starting at +offset+ with +replace+, in place.
#
# @param offset [Integer] byte offset where writing starts
# @param replace [String] bytes to write
# @return [String] self, modified in place
# @raise [OutOfBounds] if +offset+ is negative or past the end of the string
# @raise [OutOfBoundsString] if +replace+ would extend past the end of the string
def binary_patch_at_offset(offset, replace)
  buffer = StringIO.new(self)
  limit = buffer.size

  # the offset itself must lie inside the string ...
  raise OutOfBounds if offset < 0 || offset > limit
  # ... and so must the whole replacement
  raise OutOfBoundsString if offset + replace.bytesize > limit

  buffer.pos = offset
  buffer.write(replace)
  buffer.close
  self
end

#force_utf8(modify_self = false) ⇒ `Object`

# File 'lib/rcs-common/sanitize.rb', line 13

# Converts the string to UTF-8, dropping every byte sequence that is invalid
# or has no UTF-8 equivalent.
#
# When the string is valid in its current encoding that encoding is used as
# the conversion source; otherwise the content is treated as BINARY.
#
# @param modify_self [Boolean] convert in place (true) or return a converted
#   copy (false)
# @return [String] the UTF-8 string (self when +modify_self+ is true)
def force_utf8(modify_self = false)
  src_encoding = valid_encoding? ? encoding.to_s : 'BINARY'
  dst_encoding = 'UTF-8'

  # The options must reach encode/encode! as keyword arguments: a hash passed
  # positionally is counted as a third positional argument on Ruby 3.
  options = { invalid: :replace, undef: :replace, replace: '' }

  if modify_self
    encode!(dst_encoding, src_encoding, **options)
  else
    encode(dst_encoding, src_encoding, **options)
  end
end

#force_utf8! ⇒ `Object`



22
23
24

# File 'lib/rcs-common/sanitize.rb', line 22

# In-place variant of #force_utf8: calls it with +modify_self+ set to true
# and returns its result.
def force_utf8!
  in_place = true
  force_utf8(in_place)
end

#keywords ⇒ `Object`

# File 'lib/rcs-common/keywords.rb', line 10

# Extracts the list of distinct keywords contained in the string.
#
# The receiver is left untouched; a copy is sanitized to UTF-8, every run of
# non-alphanumeric characters becomes a separator, and the remaining words
# are lowercased, de-duplicated and sorted.
#
# @return [Array<String>] sorted unique lowercase words of at most 25
#   characters; an empty array if anything goes wrong while processing
def keywords
  # work on a copy to preserve the original
  text = dup

  # sanitize the input UTF-8
  text.force_utf8!

  # everything that is not alphanumeric acts as a word separator
  words = text.gsub(/([^[:alnum:]])+/u, ' ').downcase.split(' ')

  # a very long word is with high probability meaningless (encoded data or
  # something similar), so drop it
  words.reject { |word| word.size > 25 }.uniq.sort
rescue StandardError
  # best-effort fallback; signals and exit requests are deliberately not
  # rescued here
  []
end

#pascalize ⇒ `Object`

returns a string encoded into a pascalized form

# File 'lib/rcs-common/pascalize.rb', line 7

# Returns the string encoded into its pascalized form, which is laid out as:
# - a length prefix packed with the 'I' directive, counting the encoded
#   string plus its terminator
# - the string encoded as UTF-16LE
# - a UTF-16LE null terminator
#
# @return [String] the pascalized bytes, tagged ASCII-8BIT (alias BINARY)
def pascalize
  payload = encode('UTF-16LE').force_encoding('ASCII-8BIT')
  length_prefix = [payload.bytesize + 2].pack('I')

  packed = length_prefix + payload + "\x00\x00"
  packed.force_encoding('ASCII-8BIT')
end

#remove_invalid_chars ⇒ `Object`



9
10
11

# File 'lib/rcs-common/sanitize.rb', line 9

# Returns a UTF-8 sanitized copy (via #force_utf8) in which every match of
# REMOVE_INVALID_CHARS_REGEXP is replaced by a single space.
def remove_invalid_chars
  sanitized = force_utf8
  sanitized.gsub(REMOVE_INVALID_CHARS_REGEXP, ' ')
end

#safe_utf8_encode ⇒ `Object`

# File 'lib/rcs-common/utf16le.rb', line 79

# Re-tags the string as UTF-8 in place and then removes every byte sequence
# that is not valid UTF-8.
#
# @return [String] self
def safe_utf8_encode
  target = 'UTF-8'
  force_encoding(target)
  encode!(target, target, invalid: :replace, undef: :replace, replace: '')
end

#safe_utf8_encode_invalid ⇒ `Object`

# File 'lib/rcs-common/utf16le.rb', line 71

# Makes the string UTF-8 in place, with a fallback if the first attempt still
# leaves it invalid.
#
# @return [String] self
def safe_utf8_encode_invalid
  # nothing to do for a string that already is valid UTF-8
  already_clean = encoding == Encoding::UTF_8 && valid_encoding?
  return self if already_clean

  # first attempt: drop the invalid sequences
  safe_utf8_encode
  return self if valid_encoding?

  # fallback: re-tag as BINARY and transcode, replacing with '?'
  # NOTE(review): encode! receives 'BINARY' as destination and 'UTF-8' as
  # source here, so this path yields a BINARY-tagged string - confirm that
  # this argument order is intended.
  force_encoding('BINARY')
  encode!('BINARY', 'UTF-8', invalid: :replace, undef: :replace, replace: '?')
end

#strip_html_tags ⇒ `Object`

# File 'lib/rcs-common/sanitize.rb', line 26

# Returns a copy of the string with HTML markup removed.
#
# Three passes are applied to the copy, in order:
# 1. literal tags (<...>) are deleted
# 2. entity-encoded tags are deleted, whether encoded once (&lt;b&gt;) or
#    repeatedly (&amp;lt;b&amp;gt;, &amp;amp;lt;b&amp;amp;gt;, ...)
# 3. remaining named, decimal or hexadecimal entities, also possibly
#    repeatedly encoded, are replaced by a space
#
# @return [String] the stripped copy; the receiver is not modified
def strip_html_tags
  copy = dup

  # Strip HTML tags
  copy.gsub!(/<[^>]*>/, '')

  # Strip encoded and repeatedly encoded HTML tags. The leading "&" may be
  # followed by any number of "amp;" so that every encoding depth matches.
  copy.gsub!(/&(amp;)*lt;.*?&(amp;)*gt;/im, '')

  # Strip HTML entities and repeatedly encoded entities
  # Or decode with http://htmlentities.rubyforge.org/
  copy.gsub!(/&(amp;)*((#x?)?[a-f0-9]+|[a-z]+);/i, ' ')

  copy
end

#terminate_utf16le ⇒ `Object`



59
60
61

# File 'lib/rcs-common/utf16le.rb', line 59

# Re-tags the receiver as UTF-16LE (in place) and returns a new string made
# of the receiver followed by a UTF-16LE null character.
def terminate_utf16le
  null_char = "\0".encode('UTF-16LE')
  force_encoding('UTF-16LE') + null_char
end

#to_binary ⇒ `Object`



46
47
48

# File 'lib/rcs-common/utf16le.rb', line 46

# Returns a new string holding the same bytes as the receiver, tagged
# ASCII-8BIT. The receiver is not modified.
def to_binary
  b
end

#to_utf16le ⇒ `Object`



63
64
65

# File 'lib/rcs-common/utf16le.rb', line 63

# Returns a copy of the string transcoded to UTF-16LE.
def to_utf16le
  target = 'UTF-16LE'
  encode(target)
end

#to_utf16le_binary ⇒ `Object`



50
51
52

# File 'lib/rcs-common/utf16le.rb', line 50

# Transcodes the string to UTF-16LE and returns the result of #to_binary on
# the transcoded copy.
def to_utf16le_binary
  utf16 = encode('UTF-16LE')
  utf16.to_binary
end

#to_utf16le_binary_null ⇒ `Object`

# File 'lib/rcs-common/utf16le.rb', line 54

# Same as #to_utf16le_binary, but a null character is appended to the string
# before the conversion so the result is null terminated.
def to_utf16le_binary_null
  terminated = self + "\0"
  terminated.to_utf16le_binary
end

#unpascalize ⇒ `Object`

returns a string decoded from its pascalized form

# File 'lib/rcs-common/pascalize.rb', line 21

# Returns the string decoded from its pascalized form (see #pascalize).
#
# The length prefix is a byte count, so every size check and slice is done in
# bytes; this keeps the method correct when the receiver is tagged with a
# multibyte encoding instead of BINARY.
#
# @return [String, nil] the decoded UTF-8 string, or nil when the data is too
#   short, the declared length exceeds the available bytes, or the payload
#   cannot be converted
def unpascalize
  # the length prefix itself needs 4 bytes
  return nil if bytesize < 4

  # get the len (unsigned int 4 bytes)
  len = unpack('I').first
  # sanity check: the declared payload must fit in what follows the prefix
  return nil unless len <= bytesize - 4

  # get the payload and interpret it as UTF-16LE
  unpascalized = byteslice(4, len).force_encoding('UTF-16LE')
  # convert to UTF-8
  unpascalized.encode!('UTF-8')
  # remove the trailing zero
  unpascalized.chop!

  unpascalized
rescue StandardError
  nil
end

#unpascalize_ary ⇒ `Object`

returns an array containing all the concatenated pascalized strings

# File 'lib/rcs-common/pascalize.rb', line 41

# Returns an array containing all the concatenated pascalized strings.
#
# The receiver is walked token by token: each token is a length prefix
# followed by that many payload bytes. Tokens that #unpascalize cannot decode
# are skipped. Scanning stops as soon as fewer than 4 bytes (the size of a
# length prefix) remain, so empty or truncated input yields what was decoded
# so far instead of raising.
#
# @return [Array<String>] the decoded strings, in order of appearance
def unpascalize_ary
  many = []
  offset = 0

  while offset + 4 <= bytesize
    # everything from the current token to the end of the data
    buffer = byteslice(offset, bytesize - offset)

    # unpascalize the token and add it to the result array
    str = buffer.unpascalize
    many << str unless str.nil?

    # move the pointer after the token (length prefix + payload)
    offset += buffer.unpack('I').first + 4
  end

  many
end

#utf16le_to_utf8 ⇒ `Object`



67
68
69

# File 'lib/rcs-common/utf16le.rb', line 67

# Re-tags the receiver as UTF-16LE (in place), transcodes it to UTF-8 and
# returns the result without one trailing null character, if present.
def utf16le_to_utf8
  decoded = force_encoding('UTF-16LE').encode('UTF-8')
  decoded.chomp("\0")
end

Class: String

Overview

Direct Known Subclasses

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#binary_add_at_offset(offset, value) ⇒ Object

#binary_patch(match, replace) ⇒ Object

#binary_patch_at_offset(offset, replace) ⇒ Object

#force_utf8(modify_self = false) ⇒ Object

#force_utf8! ⇒ Object

#keywords ⇒ Object

#pascalize ⇒ Object

#remove_invalid_chars ⇒ Object

#safe_utf8_encode ⇒ Object

#safe_utf8_encode_invalid ⇒ Object

#strip_html_tags ⇒ Object

#terminate_utf16le ⇒ Object

#to_binary ⇒ Object

#to_utf16le ⇒ Object

#to_utf16le_binary ⇒ Object

#to_utf16le_binary_null ⇒ Object

#unpascalize ⇒ Object

#unpascalize_ary ⇒ Object

#utf16le_to_utf8 ⇒ Object

#binary_add_at_offset(offset, value) ⇒ `Object`

#binary_patch(match, replace) ⇒ `Object`

#binary_patch_at_offset(offset, replace) ⇒ `Object`

#force_utf8(modify_self = false) ⇒ `Object`

#force_utf8! ⇒ `Object`

#keywords ⇒ `Object`

#pascalize ⇒ `Object`

#remove_invalid_chars ⇒ `Object`

#safe_utf8_encode ⇒ `Object`

#safe_utf8_encode_invalid ⇒ `Object`

#strip_html_tags ⇒ `Object`

#terminate_utf16le ⇒ `Object`

#to_binary ⇒ `Object`

#to_utf16le ⇒ `Object`

#to_utf16le_binary ⇒ `Object`

#to_utf16le_binary_null ⇒ `Object`

#unpascalize ⇒ `Object`

#unpascalize_ary ⇒ `Object`

#utf16le_to_utf8 ⇒ `Object`