Class: String

Inherits:
Object
  • Object
show all
Defined in:
lib/rcs-common/sanitize.rb,
lib/rcs-common/binary.rb,
lib/rcs-common/utf16le.rb,
lib/rcs-common/keywords.rb,
lib/rcs-common/pascalize.rb

Overview

Here we re-open Ruby's core String class, so no namespace must be specified.

Constant Summary collapse

# Matches every run of characters that are neither alphanumeric, nor
# printable (graph), nor a newline / carriage return.
REMOVE_INVALID_CHARS_REGEXP = /([^[:alnum:][:graph:]\n\r])+/u

Instance Method Summary collapse

Instance Method Details

#binary_add_at_offset(offset, value) ⇒ Object

Raises:

  • (OutOfBounds)

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/rcs-common/binary.rb', line 46

# Adds +value+ to the unsigned integer stored at byte +offset+ (read and
# written with the 'I' pack directive, i.e. native byte order) and stores
# the sum back in place.
#
# @param offset [Integer] byte offset of the 4-byte integer to patch
# @param value [Integer] amount to add to the stored integer
# @return [String] self, modified in place
# @raise [OutOfBounds] if +offset+ is negative or the 4 bytes starting at
#   +offset+ do not fit entirely inside the string
def binary_add_at_offset(offset, value)
  # check for boundaries: the whole 4-byte window must lie inside the buffer,
  # otherwise the read below would come back short (or nil)
  raise OutOfBounds if offset < 0
  raise OutOfBounds if offset + 4 > bytesize

  io = StringIO.new(self)

  io.pos = offset
  current = io.read(4).unpack('I').first

  # rewind to the same spot and overwrite the integer with the new value
  io.pos = offset
  io.write [current + value].pack('I')
  io.close
  self
end

#binary_patch(match, replace) ⇒ Object

Raises:

  • (MatchNotFound)

24
25
26
27
28
29
30
# File 'lib/rcs-common/binary.rb', line 24

# Replaces every occurrence of the byte sequence +match+ with +replace+,
# in place. Both arguments are compared as raw bytes (ASCII-8BIT).
#
# The arguments are copied before being re-tagged as binary, so the
# caller's strings are left untouched and may be frozen.
#
# @param match [String] byte sequence to look for
# @param replace [String] byte sequence to put in its place
# @return [String] self, modified in place
# @raise [MatchNotFound] if +match+ does not occur in the string
def binary_patch(match, replace)
  pattern = match.dup.force_encoding('ASCII-8BIT')
  substitute = replace.dup.force_encoding('ASCII-8BIT')

  raise MatchNotFound unless include?(pattern)

  # use the block form to avoid the regexp in the replace string
  # (back-references such as \1 would otherwise be interpreted)
  gsub!(pattern) { substitute }
end

#binary_patch_at_offset(offset, replace) ⇒ Object

Raises:

  • (OutOfBounds)
  • (OutOfBoundsString)

32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/rcs-common/binary.rb', line 32

# Overwrites the bytes starting at +offset+ with +replace+, in place.
#
# @param offset [Integer] byte offset where the overwrite starts
# @param replace [String] bytes to write
# @return [String] self, modified in place
# @raise [OutOfBounds] if +offset+ is negative or beyond the end
# @raise [OutOfBoundsString] if +replace+ would run past the end
def binary_patch_at_offset(offset, replace)
  writer = StringIO.new(self)
  limit = writer.size

  # the start must be inside the buffer...
  raise OutOfBounds if offset < 0 || offset > limit
  # ...and the replacement must not extend past its end
  raise OutOfBoundsString if offset + replace.bytesize > limit

  writer.pos = offset
  writer.write(replace)
  writer.close
  self
end

#force_utf8(modify_self = false) ⇒ Object



13
14
15
16
17
18
19
20
# File 'lib/rcs-common/sanitize.rb', line 13

# Transcodes the string to UTF-8, dropping every byte sequence that is
# invalid or has no UTF-8 equivalent.
#
# If the string is valid in its current encoding, that encoding is used as
# the source; otherwise the bytes are treated as BINARY.
#
# @param modify_self [Boolean] when true the receiver is transcoded in place
#   (encode!), otherwise a new string is returned (encode)
# @return [String] the UTF-8 string (self when +modify_self+ is true)
def force_utf8(modify_self = false)
  source_encoding = valid_encoding? ? encoding.to_s : 'BINARY'
  replace_opts = { invalid: :replace, undef: :replace, replace: '' }

  # the options must be passed as keywords (**): since Ruby 3.0 a trailing
  # positional Hash is no longer converted to keyword arguments
  if modify_self
    encode!('UTF-8', source_encoding, **replace_opts)
  else
    encode('UTF-8', source_encoding, **replace_opts)
  end
end

#force_utf8! ⇒ Object



22
23
24
# File 'lib/rcs-common/sanitize.rb', line 22

# In-place variant of force_utf8: delegates to it with modify_self = true.
def force_utf8!
  modify_self = true
  force_utf8(modify_self)
end

#keywords ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/rcs-common/keywords.rb', line 10

# Extracts the list of keywords contained in the string.
#
# The receiver is left untouched: a copy is sanitized to UTF-8, every run of
# non-alphanumeric characters becomes a separator, and the text is
# lowercased and split into words. Words longer than 25 characters are
# dropped, duplicates are removed and the result is sorted.
#
# @return [Array<String>] sorted unique lowercase words; an empty array if
#   any StandardError is raised while processing
def keywords
  # work on a copy to preserve the original
  text = dup

  # sanitize the input UTF-8
  text.force_utf8!

  # everything that is not alphanumeric becomes a single space
  words = text.gsub(/([^[:alnum:]])+/u, ' ').strip.downcase.split(' ')

  # a very long word is with a very high probability meaningless
  # (like encoded data or something)
  words.reject { |word| word.size > 25 }.uniq.sort
rescue StandardError
  # fallback case: best effort, but do not swallow Interrupt, SystemExit,
  # NoMemoryError and friends as a rescue of Exception would
  []
end

#pascalize ⇒ Object

returns a string encoded into a pascalized form



7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/rcs-common/pascalize.rb', line 7

# Returns the string encoded into its pascalized form:
# - length prefix packed with the 'I' directive (native byte order); it
#   counts the payload bytes plus the 2-byte terminator
# - the UTF-16LE encoded string
# - a UTF-16LE null terminator
#
# @return [String] the pascalized buffer, tagged ASCII-8BIT (BINARY)
def pascalize
  # encode once and view the result as raw bytes
  payload = encode('UTF-16LE').force_encoding('ASCII-8BIT')
  header = [payload.bytesize + 2].pack('I')

  # BINARY is an alias for ASCII-8BIT
  (header + payload + "\x00\x00").force_encoding('ASCII-8BIT')
end

#remove_invalid_chars ⇒ Object



9
10
11
# File 'lib/rcs-common/sanitize.rb', line 9

# Returns a UTF-8 copy of the string (via force_utf8) in which every match
# of REMOVE_INVALID_CHARS_REGEXP is replaced by a single space.
def remove_invalid_chars
  sanitized = force_utf8
  sanitized.gsub(REMOVE_INVALID_CHARS_REGEXP, ' ')
end

#safe_utf8_encode ⇒ Object



79
80
81
82
# File 'lib/rcs-common/utf16le.rb', line 79

# Re-tags the receiver as UTF-8 and then runs a UTF-8 to UTF-8 encode! with
# invalid/undefined sequences replaced by the empty string.
# Both steps modify the receiver in place.
#
# @return [String] the result of encode! on the receiver
def safe_utf8_encode
  replace_opts = { invalid: :replace, undef: :replace, replace: '' }
  force_encoding('UTF-8')
  encode!('UTF-8', 'UTF-8', **replace_opts)
end

#safe_utf8_encode_invalid ⇒ Object



71
72
73
74
75
76
77
# File 'lib/rcs-common/utf16le.rb', line 71

# Makes the receiver safe to use as UTF-8, in place.
#
# A string that is already valid UTF-8 is returned as is. Otherwise
# safe_utf8_encode is applied; if the result is still not valid, the string
# is re-tagged as BINARY and encode! is run with '?' as the replacement.
#
# NOTE(review): the last encode! passes 'BINARY' as destination and 'UTF-8'
# as source; confirm this argument order is the intended one.
#
# @return [String] the receiver
def safe_utf8_encode_invalid
  already_clean = encoding == Encoding::UTF_8 && valid_encoding?
  return self if already_clean

  safe_utf8_encode
  return self if valid_encoding?

  force_encoding('BINARY')
  encode!('BINARY', 'UTF-8', invalid: :replace, undef: :replace, replace: '?')
end

#strip_html_tags ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/rcs-common/sanitize.rb', line 26

# Returns a copy of the string with HTML markup removed:
# - literal tags (<...>) are deleted
# - entity-encoded tags (&lt;...&gt;), including repeatedly encoded ones
#   (&amp;lt;...&amp;gt;, &amp;amp;lt;... and so on), are deleted
# - remaining entities (named, decimal or hexadecimal, optionally
#   repeatedly encoded) are replaced by a single space
#
# The receiver is not modified.
#
# @return [String] the stripped copy
def strip_html_tags
  copy = dup

  # Strip HTML tags
  copy.gsub!(/<[^>]*>/, '')

  # Strip encoded and repetitively encoded HTML tags; the "amp;" part is
  # optional so that singly encoded tags (&lt;b&gt;) are covered as well
  copy.gsub!(/&(amp;)*lt;.*?&(amp;)*gt;/im, '')

  # Strip HTML entities and repetitively encoded entities
  # Or decode with http://htmlentities.rubyforge.org/
  copy.gsub!(/&(amp;)*((#x?)?[a-f0-9]+|[a-z]+);/i, ' ')

  copy
end

#terminate_utf16le ⇒ Object



59
60
61
# File 'lib/rcs-common/utf16le.rb', line 59

# Re-tags the receiver as UTF-16LE (in place, bytes unchanged) and returns a
# new string made of the receiver followed by a UTF-16LE null character.
def terminate_utf16le
  terminator = "\0".encode('UTF-16LE')
  force_encoding('UTF-16LE') + terminator
end

#to_binary ⇒ Object



46
47
48
# File 'lib/rcs-common/utf16le.rb', line 46

# Returns a copy of the receiver's bytes, obtained by unpacking them to a
# hex string and packing that hex string again.
def to_binary
  hex_digits = unpack('H*')
  hex_digits.pack('H*')
end

#to_utf16le ⇒ Object



63
64
65
# File 'lib/rcs-common/utf16le.rb', line 63

# Returns a copy of the string transcoded to UTF-16LE.
def to_utf16le
  target = 'UTF-16LE'
  encode(target)
end

#to_utf16le_binary ⇒ Object



50
51
52
# File 'lib/rcs-common/utf16le.rb', line 50

# Transcodes the string to UTF-16LE and returns the result of to_binary
# on that transcoded copy.
def to_utf16le_binary
  wide = encode('UTF-16LE')
  wide.to_binary
end

#to_utf16le_binary_null ⇒ Object



54
55
56
57
# File 'lib/rcs-common/utf16le.rb', line 54

# Same as to_utf16le_binary, but a null character is appended to the string
# before the conversion (null termination).
def to_utf16le_binary_null
  null_terminated = self + "\0"
  null_terminated.to_utf16le_binary
end

#unpascalize ⇒ Object

returns a string decoded from its pascalized form



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/rcs-common/pascalize.rb', line 21

# Returns a string decoded from its pascalized form: a length prefix
# (unpacked with the 'I' directive) followed by that many bytes of UTF-16LE
# text whose last character is the terminator.
#
# All offsets are computed in bytes, so the result does not depend on the
# encoding the buffer happens to be tagged with.
#
# @return [String, nil] the decoded UTF-8 string without its last character
#   (the terminator), or nil if the buffer is too short for the prefix, the
#   declared length exceeds the available data, or decoding fails
def unpascalize
  # get the len (unsigned int 4 bytes); nil when fewer than 4 bytes exist
  declared = unpack('I').first
  return nil if declared.nil?

  # sanity check to avoid reading past the end of the buffer
  return nil unless declared <= bytesize - 4

  # get the string and convert it to UTF-8
  decoded = byteslice(4, declared).force_encoding('UTF-16LE').encode('UTF-8')

  # remove the trailing zero
  decoded.chop!
  decoded
rescue StandardError
  # best effort: any decoding problem yields nil
  nil
end

#unpascalize_ary ⇒ Object

returns an array containing all the concatenated pascalized strings



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/rcs-common/pascalize.rb', line 41

# Returns an array containing all the concatenated pascalized strings.
#
# The buffer is walked token by token: each token starts with a length
# prefix (unpacked with the 'I' directive) and is decoded with unpascalize.
# Tokens that fail to decode are skipped. Walking stops at the end of the
# buffer, or as soon as fewer than 4 bytes remain (no complete prefix).
#
# @return [Array<String>] the decoded strings, in buffer order; empty for an
#   empty buffer
def unpascalize_ary
  decoded = []
  offset = 0
  total = bytesize

  while offset < total
    token = byteslice(offset, total - offset)

    # len of the current token; nil means a truncated prefix
    declared = token.unpack('I').first
    break if declared.nil?

    # unpascalize the token and add it to the result array
    text = token.unpascalize
    decoded << text unless text.nil?

    # move the pointer after the token (prefix + payload)
    offset += declared + 4
  end

  decoded
end

#utf16le_to_utf8 ⇒ Object



67
68
69
# File 'lib/rcs-common/utf16le.rb', line 67

# Re-tags the receiver as UTF-16LE (in place), transcodes it to UTF-8 and
# returns the result with one trailing null character removed, if present.
def utf16le_to_utf8
  force_encoding('UTF-16LE')
  decoded = encode('UTF-8')
  decoded.chomp("\0")
end