Class: String

Inherits:
Object show all
Defined in:
lib/itrigga/core_ext/string.rb

Instance Method Summary collapse

Instance Method Details

#blank?Boolean

The encoding must be forced on Ruby 1.9, otherwise the regex fails when comparing strings of two different encodings. TODO: String#blank? - do we need to force the encoding? Is UTF-8 a good default?

Returns:

  • (Boolean)


70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/itrigga/core_ext/string.rb', line 70

# Reports whether the string is empty or contains only whitespace.
#
# On Ruby 1.9+ a copy of the string is re-tagged as UTF-8 before matching so
# that the regex comparison does not trip over mixed encodings. When the match
# itself fails because of an invalid byte sequence or incompatible encodings,
# the answer falls back to +empty?+.
#
# @return [Boolean] true when no non-whitespace character is found
# @raise [ArgumentError] any ArgumentError that is not encoding related
def blank?
  # Only Ruby 1.9+ strings carry an encoding that can be forced.
  encoding_aware = Gem::Version.new('' + RUBY_VERSION) >= Gem::Version.new("1.9.0")
  subject = encoding_aware ? dup.force_encoding("UTF-8") : self
  subject !~ /\S/
rescue ArgumentError => e
  # Anything other than an encoding problem is not ours to swallow.
  raise e unless e.message =~ /^(invalid\ byte\ sequence|incompatible\ character\ encodings)/

  empty?
end

#char_countObject



41
42
43
# File 'lib/itrigga/core_ext/string.rb', line 41

# Number of characters left once the string has been passed through +trim+.
#
# @return [Integer] size of the trimmed string
def char_count
  trimmed = trim
  trimmed.size
end

#cjk_regexObject



45
46
47
# File 'lib/itrigga/core_ext/string.rb', line 45

# Pattern matching a single character from the Han, Katakana, Hiragana or
# Hangul scripts.
#
# @return [Regexp]
def cjk_regex
  %r{\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}}
end

#contains_cjk?Boolean

Returns:

  • (Boolean)


49
50
51
# File 'lib/itrigga/core_ext/string.rb', line 49

# Reports whether the string contains at least one Han, Katakana, Hiragana
# or Hangul character.
#
# The pattern comes from +cjk_regex+ (the same helper +word_count+ uses)
# rather than a private copy, so the definition of "CJK" lives in one place.
#
# @return [Boolean]
def contains_cjk?
  !!(self =~ cjk_regex)
end

#is_i?Boolean

Returns:

  • (Boolean)


18
19
20
# File 'lib/itrigga/core_ext/string.rb', line 18

# Reports whether the whole string is an optionally signed run of decimal
# digits, e.g. "42", "-7" or "+0".
#
# The pattern is anchored with \A and \z so the entire string must match.
# The earlier ^ / $ anchors matched per line, which let multi-line input such
# as "abc\n123" pass as an integer. A trailing newline is rejected as well.
#
# @return [Boolean]
def is_i?
  !!(self =~ /\A[-+]?[0-9]+\z/)
end

#remove_whitespaceObject



14
15
16
# File 'lib/itrigga/core_ext/string.rb', line 14

# Returns a copy of the string with every space character (" ") removed.
# Only the literal space is targeted; tabs and newlines are left in place.
#
# @return [String]
def remove_whitespace
  space = " "
  nothing = ""
  gsub(space, nothing)
end

#to_active_record_conditionObject



64
65
66
# File 'lib/itrigga/core_ext/string.rb', line 64

# Builds a wildcard pattern from the string: the text is trimmed, each run of
# whitespace becomes a single "%", and the result is wrapped in "%" on both
# sides. Presumably intended for SQL LIKE conditions - confirm with callers.
#
# @return [String]
def to_active_record_condition
  wildcarded = trim.gsub(/[[:space:]]+/, '%')
  "%#{wildcarded}%"
end

#to_plain_textObject



9
10
11
# File 'lib/itrigga/core_ext/string.rb', line 9

# Decodes HTML entities, runs the result through Sanitize.clean and returns
# the stripped text.
#
# @return [String]
def to_plain_text
  decoded = HTMLEntities.new.decode(self)
  cleaned = ::Sanitize.clean(decoded)
  cleaned.to_s.strip
end


5
6
7
# File 'lib/itrigga/core_ext/string.rb', line 5

# Runs the string through Sanitize.clean with only <a> elements and their
# href attribute whitelisted, then returns the stripped result.
#
# @return [String]
def to_plain_text_preserving_links
  cleaned = ::Sanitize.clean(self, :elements => ['a'], :attributes => { 'a' => ['href'] })
  cleaned.to_s.strip
end

#to_utf16leObject



60
61
62
# File 'lib/itrigga/core_ext/string.rb', line 60

# Transcodes the string's bytes, read as UTF-8, into UTF-16LE.
#
# Iconv is used when it is loaded, so existing setups behave exactly as
# before. Iconv was removed from the standard library in Ruby 2.0, so when it
# is absent the built-in String#encode performs the same conversion instead of
# the method failing with a NameError.
#
# @return [String] the UTF-16LE encoded text
def to_utf16le
  return Iconv.conv('utf-16le', 'UTF-8', self) if defined?(::Iconv)

  # Explicit source encoding: treat the bytes as UTF-8 whatever the tag says,
  # mirroring what Iconv.conv does.
  encode('UTF-16LE', 'UTF-8')
end

#to_utf8Object

Converts the string to UTF-8, regardless of its current encoding.



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/itrigga/core_ext/string.rb', line 87

# Returns a UTF-8 copy of the string, working out the source encoding when
# the string is not already valid UTF-8. The receiver is never modified.
#
# Candidate source encodings are tried in this order:
#
# 1. the declared encoding when it is not ASCII-compatible (UTF-16/UTF-32
#    family) - such a tag is only ever set deliberately, and the raw bytes
#    of that text often look like valid UTF-8 by accident;
# 2. UTF-8 - covers UTF-8 bytes that carry the wrong tag;
# 3. the declared encoding, unless it is binary;
# 4. ISO-8859-1 - accepts any byte sequence, so it is the last resort.
#
# The previous version ignored the declared encoding and walked a fixed list
# in which ISO-8859-1 came second. Every byte string is valid ISO-8859-1, so
# the remaining entries were unreachable and correctly tagged text such as
# Windows-1251 or UTF-16 came back as mojibake.
#
# @return [String] UTF-8 encoded copy of the string
def to_utf8
  text = dup
  return text if text.encoding == Encoding::UTF_8 && text.valid_encoding? # already UTF-8

  declared = text.encoding
  candidates = []
  candidates << declared unless declared.ascii_compatible?
  candidates << Encoding::UTF_8
  candidates << declared unless declared == Encoding::ASCII_8BIT
  candidates << Encoding::ISO_8859_1

  candidates.uniq.each do |encoding|
    begin
      attempt = text.dup.force_encoding(encoding)
      return attempt.encode("UTF-8") if attempt.valid_encoding?
    rescue EncodingError
      # No usable conversion from this candidate; move on to the next one.
    end
  end

  # Out of suggestions: hand back the untouched copy.
  text
end

#trimObject

Removes leading and trailing whitespace. Note: despite the original mention of double spaces, runs of spaces inside the text are left unchanged.



56
57
58
# File 'lib/itrigga/core_ext/string.rb', line 56

# Returns a copy of the string with trailing and then leading whitespace
# removed, using two line-anchored regex passes. Whitespace between words is
# not touched.
#
# @return [String]
def trim
  # Pass 1: drop whitespace that follows the last non-space character of a line.
  without_trailing = gsub(/^(.*[^\s])\s+$/, '\1')
  # Pass 2: drop whitespace that precedes the content of a line.
  without_trailing.gsub(/^\s*(.*)$/, '\1')
end

#word_countObject

How many words are in this string. Includes duplicates.



26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/itrigga/core_ext/string.rb', line 26

# Counts the words in the string, duplicates included.
#
# The string is split on whitespace. A token without CJK characters counts as
# one word; a token containing CJK characters contributes one word per
# character matched by +cjk_regex+.
#
# Earlier attempts kept for reference:
#   re = /[\p{Word}\p{Punct}]/u  # the only form found to work with Cyrillic AND Arabic script
#   re = /\s+/u
#   to_utf8.scan(re).size
#
# @return [Integer]
def word_count
  split.inject(0) do |total, token|
    # Scanning for \p{...} script classes ONLY works in Ruby 1.9;
    # another approach would be needed for 1.8.
    words_in_token = token.contains_cjk? ? token.scan(cjk_regex).size : 1
    total + words_in_token
  end
end