Module: StringUtils
- Extended by:
- StringUtils
- Included in:
- StringUtils
- Defined in:
- lib/string_utils.rb,
lib/string_utils/version.rb,
lib/string_utils/transliteration.rb
Overview
StringUtils is a library that provides various handy string manipulation methods. Example usage:
* StringUtils.truncate("hello world", 10, "...") #=> "hello..."
* StringUtils.normalize_name "\302\240 Gran Via/Avda.de Asturias " #=> "Gran Via / Avda. de Asturias"
* StringUtils.urlify("waßer") #=> "wasser"
* StringUtils.normalize_punctuation(" , a,,b ,") #=> "a, b"
Constant Summary collapse
- NBSP =
"\302\240"
- WHITESPACE_MATCHER =
"(?:\s|#{NBSP})"
- WHITESPACE =
/#{WHITESPACE_MATCHER}/
- NOT_WHITESPACE =
"[^\s#{NBSP}]"
- WHITESPACES =
/#{WHITESPACE_MATCHER}+/
- VERSION =
"1.0.8"
- TRANSLITERATIONS =
Based on transliteration table from i18n v0.5.0
{ # Latin "À" =>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE", "Ç" =>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I", "Î" =>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O", "Õ" =>"O", "Ö"=>"O", "×"=>"x", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U", "Ü" =>"U", "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a", "ã" =>"a", "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e", "ê" =>"e", "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d", "ñ" =>"n", "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o", "ù" =>"u", "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y", "Ā" =>"A", "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C", "ć" =>"c", "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c", "Ď" =>"D", "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E", "ĕ" =>"e", "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e", "Ĝ" =>"G", "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G", "ģ" =>"g", "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i", "Ī" =>"I", "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I", "ı" =>"i", "IJ"=>"IJ", "ij"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k", "ĸ" =>"k", "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l", "Ŀ" =>"L", "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N", "ņ" =>"n", "Ň"=>"N", "ň"=>"n", "ʼn"=>"'n", "Ŋ"=>"NG", "ŋ"=>"ng", "Ō" =>"O", "ō"=>"o", "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE", "œ" =>"oe", "Ŕ"=>"R", "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r", "Ś" =>"S", "ś"=>"s", "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S", "š" =>"s", "Ţ"=>"T", "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t", "Ũ" =>"U", "ũ"=>"u", "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U", "ů" =>"u", "Ű"=>"U", "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w", "Ŷ" =>"Y", "ŷ"=>"y", "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z", "Ž" =>"Z", "ž"=>"z", # Cyrillic "Ґ" =>"G", "Ё"=>"YO", "Є"=>"E", 
"Ї"=>"YI", "І"=>"I", "А" =>"A", "Б"=>"B", "В"=>"V", "Г"=>"G", "Д" =>"D", "Е"=>"E", "Ж"=>"ZH", "З"=>"Z", "И"=>"I", "Й" =>"Y", "К"=>"K", "Л"=>"L", "М"=>"M", "Н"=>"N", "О" =>"O", "П"=>"P", "Р"=>"R", "С"=>"S", "Т"=>"T", "У" =>"U", "Ф"=>"F", "Х"=>"H", "Ц"=>"TS", "Ч"=>"CH", "Ш" =>"SH", "Щ"=>"SCH", "Ъ"=>"'", "Ы"=>"Y", "Ь"=>"", "Э" =>"E", "Ю"=>"YU", "Я"=>"YA", "і"=>"i", "ґ" =>"g", "ё"=>"yo", "№"=>"#", "є"=>"e", "ї" =>"yi", "а"=>"a", "б"=>"b", "в" =>"v", "г"=>"g", "д"=>"d", "е"=>"e", "ж"=>"zh", "з" =>"z", "и"=>"i", "й"=>"y", "к"=>"k", "л"=>"l", "м" =>"m", "н"=>"n", "о"=>"o", "п"=>"p", "р"=>"r", "с" =>"s", "т"=>"t", "у"=>"u", "ф"=>"f", "х"=>"h", "ц" =>"ts", "ч"=>"ch", "ш"=>"sh", "щ"=>"sch", "ъ"=>"'", "ы" =>"y", "ь"=>"", "э"=>"e", "ю"=>"yu", "я"=>"ya", # Greek 'α' => 'a', 'η' => 'h', 'ν' => 'n', 'τ' => 't', 'β' => 'b', 'θ' => 'th', 'ξ' => 'x', 'υ' => 'y', 'γ' => 'g', 'ι' => 'i', 'ο' => 'o', 'φ' => 'f', 'δ' => 'd', 'κ' => 'k', 'π' => 'p', 'χ' => 'ch', 'ε' => 'e', 'λ' => 'l', 'ρ' => 'r', 'ψ' => 'ps', 'ζ' => 'z', 'μ' => 'm', 'σ' => 's', 'ω' => 'w', 'Θ' => 'Th', 'Ξ' => 'X', 'Γ' => 'G', 'Φ' => 'F', 'Δ' => 'D', 'Π' => 'P', 'Λ' => 'L', 'Ρ' => 'R', 'Ψ' => 'Ps', 'Σ' => 'S', 'Ω' => 'W' }
Instance Method Summary collapse
-
#mb_charify(text) ⇒ Object
Returns a unicode compatible version of the string.
-
#normalize_name(value, options = {}) ⇒ Object
Normalizes whitespace: “a , a” => “a, a”; “a ,a” => “a, a”; “a,a” => “a, a”; “a/b” => “a / b”; “a/ b” => “a / b”; “a /b” => “a / b”. Removes trailing and leading [.,]. options: :titleize => true (default false).
-
#normalize_punctuation(str) ⇒ Object
Collapses spaces and commas. Fixes spacing around the following characters: ,.;:& Removes consecutive character dupes. Removes trailing and leading commas.
-
#truncate(text, *args) ⇒ Object
Truncates the string. The result will be
:length
or shorter, and the words will not be cut in the middle. Arguments: :length => Integer (default: 30), :omission => String (default: “…”).
-
#urlify(string, opts = {}) ⇒ Object
Converts a string to a nicely readable URL. opts: :default_replacement – string to use for unknown characters (default: “”); :whitespace_replacement – string to use to replace each run of whitespace (default: “-”).
Instance Method Details
#mb_charify(text) ⇒ Object
Returns a unicode compatible version of the string.
Supports any of:
* ruby 1.9 sane utf8 support
* rails 2.1 workaround for crappy ruby 1.8 utf8 support
* rails 2.2 workaround for crappy ruby 1.8 utf8 support
hooray!
166 167 168 169 170 171 172 173 174 |
# File 'lib/string_utils.rb', line 166

# Returns a unicode-capable working copy of +text+.
#
# * On Ruby >= 1.9 strings are already encoding-aware, so a plain
#   duplicate is returned (callers mutate it with bang methods).
# * On older Rubies, if +text+ responds to +mb_chars+, that proxy is
#   returned; a frozen +text+ is duplicated first.
# * Otherwise there is nothing to fall back on and an error is raised.
#
# @param text [String] the string to wrap or copy
# @return [Object] a duplicate String, or whatever +text.mb_chars+ returns
# @raise [RuntimeError] when neither mechanism is available
def mb_charify(text)
  if RUBY_VERSION >= '1.9'
    text.dup
  elsif text.respond_to?(:mb_chars)
    text.frozen? ? text.dup.mb_chars : text.mb_chars
  else
    raise "StringUtils: No unicode support for strings"
  end
end
#normalize_name(value, options = {}) ⇒ Object
Normalizes whitespace: “a , a” => “a, a”; “a ,a” => “a, a”; “a,a” => “a, a”; “a/b” => “a / b”; “a/ b” => “a / b”; “a /b” => “a / b”. Removes trailing and leading [.,]. options: :titleize => true (default false)
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/string_utils.rb', line 73

# Normalizes whitespace and punctuation spacing in a name.
#
#   "a , a" => "a, a"    "a ,a" => "a, a"    "a,a" => "a, a"
#   "a/b"   => "a / b"   "a/ b" => "a / b"   "a /b" => "a / b"
#
# One leading and one trailing "." or "," is removed, as is a pair of
# wrapping quotes.
#
# @param value [String] the name to normalize (not modified; work is done
#   on the copy returned by +mb_charify+)
# @param options [Hash] :titleize => true to titleize the result and
#   lower-case small words between other words (default: false).
#   Relies on +titleize+ being available on the string — presumably
#   provided by ActiveSupport; confirm against the gem's dependencies.
# @return [String] the normalized name
def normalize_name(value, options = {})
  value = mb_charify(value)

  # Normalize whitespace
  value.gsub!("\n", ' ')
  value.gsub!(WHITESPACES, ' ')
  value.strip!

  # Remove trailing and leading .,
  value.gsub!(/^[.,]/, '')
  value.gsub!(/[.,]$/, '')
  # Removing the punctuation can expose a space (", a" / "a ."), so trim again
  value.strip!

  # Remove quote pairs. Imperfect, but good enough
  value.gsub!(/\A['"]+(.*)['"]+\z/, '\1')

  # "a ,a" => "a, a"
  # "a,a" => "a, a"
  # "a , a" => "a, a"
  value.gsub!(/#{WHITESPACE_MATCHER}([,.])/, '\1')
  value.gsub!(/([,.])(#{NOT_WHITESPACE})/, '\1 \2')

  # "//" => "/"
  value.gsub!(/\/+/, '/')

  # "a/b" => "a / b", "a/ b" => "a / b", "a /b" => "a / b"
  value.gsub!(/(#{NOT_WHITESPACE})\//, '\1 /')
  value.gsub!(/\/(#{NOT_WHITESPACE})/, '/ \1')

  if options[:titleize]
    value = value.titleize
    # The trailing whitespace is only looked at, not consumed, so that it can
    # serve as the leading whitespace of the next small word
    # ("Lord Of The Rings" => "Lord of the Rings").
    value.gsub!(/#{WHITESPACE_MATCHER}(Of|And|A|An|The|To)(?=#{WHITESPACE_MATCHER})/) { |m| m.downcase }
  end

  value.to_s
end
#normalize_punctuation(str) ⇒ Object
Collapses spaces and commas. Fixes spacing around the following characters:
,.;:&
Removes consecutive character dupes. Removes trailing and leading commas.
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/string_utils.rb', line 31

# Collapses whitespace and tidies spacing around the characters ,.;:&
#
# * runs of whitespace become a single space
# * a run of punctuation collapses to one character (the last of the run)
# * one leading and one trailing punctuation character is dropped
# * ",", ".", ";" and ":" are followed by a single space
# * "&" gets a space on both sides when it sits between two words
#
#   normalize_punctuation(" , a,,b ,") #=> "a, b"
#   normalize_punctuation("a&b&c")     #=> "a & b & c"
#
# @param str [String] input text; it is not modified
# @return [String] a new, normalized string
def normalize_punctuation(str)
  s = str.dup
  s.gsub!(/\s+/, ' ')

  # Collapse w/s around all punctuation
  s.gsub!(/\s*([:,&.;])\s*/, '\1')

  # Collapse consecutive dupes
  s.gsub!(/([.,;&:])+/, '\1')

  # Collapse leading and trailing punctuation
  s.gsub!(/^\s*[,:&;.]|[.;&:,]\s*$/, '')

  # Add whitespaces
  s.gsub!(/([,.;:])(\S)/, '\1 \2')
  # Look ahead instead of consuming the next character, otherwise every
  # second "&" in a chain like "a&b&c" is left unspaced.
  s.gsub!(/(\S)&(?=\S)/, '\1 & ')

  s.strip!
  s
end
#truncate(text, *args) ⇒ Object
Truncates the string. The result will be :length
or shorter, and the words will not be cut in the middle. Arguments: :length => Integer (default: 30), :omission => String (default: “…”)
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
# File 'lib/string_utils.rb', line 113

# Truncates +text+ so the result is at most :length characters long,
# never cutting a word in the middle, and appends the omission marker
# whenever something was cut.
#
# Both calling conventions are accepted:
#
#   truncate(text, 10, "...")
#   truncate(text, :length => 10, :omission => "...")
#
# Positional values win over the hash; keys missing from both fall back
# to the defaults.
#
# @param text [String, nil] the text to shorten
# @param args [Array] optional length, optional omission, optional
#   trailing options Hash with :length (default: 30) and
#   :omission (default: "…")
# @return [String] "" when +text+ is nil/false or when the omission alone
#   does not fit into :length; +text+ itself when it already fits;
#   otherwise the shortened text plus the omission
def truncate(text, *args)
  # Work on a copy so the caller's hash is never modified
  options = args.last.is_a?(Hash) ? args.pop.dup : {}

  # support either old or Rails 2.2 calling convention:
  # only override what was actually passed positionally, so that
  # truncate(text, 10, :omission => "--") keeps its omission
  options[:length] = args[0] if args[0]
  options[:omission] = args[1] if args[1]

  options = {:length => 30, :omission => "…"}.merge(options)
  options[:length] = options[:length].to_i

  return "" if !text
  chars = mb_charify(text)

  # If we can return it straight away or rstrip it and return it, we do it here
  # NOTE(review): rstrip_with_nbsp is defined elsewhere; presumably it strips
  # trailing whitespace including NBSP — confirm.
  if chars.length <= options[:length]
    return text
  elsif (chars = rstrip_with_nbsp(chars)).length <= options[:length]
    return chars.to_s
  end

  omission = mb_charify(options[:omission])

  # Here we know we have to remove at least 1 word
  # 1. Get the first l characters
  # 2. Remove the last word if it's a part
  # 3. Add omission
  length_wo_omission = options[:length] - omission.length

  return '' if length_wo_omission < 0

  result = rstrip_with_nbsp(chars[0...length_wo_omission] || "")

  # Remove the last word unless we happened to trim it exactly already
  unless chars[length_wo_omission] =~ WHITESPACE || result.length < length_wo_omission
    len = result.split(WHITESPACES).last
    len &&= len.length
    result = rstrip_with_nbsp(result[0...(result.length - (len || 0))])
  end

  result += options[:omission]
  result.to_s
end
#urlify(string, opts = {}) ⇒ Object
Converts a string to a nicely readable URL. opts: :default_replacement – string to use for unknown characters (default: “”); :whitespace_replacement – string to use to replace each run of whitespace (default: “-”)
57 58 59 60 61 62 63 64 |
# File 'lib/string_utils.rb', line 57

# Converts a string into a URL-friendly slug.
#
# Steps: trim surrounding whitespace, replace each run of whitespace
# matched by WHITESPACES with :whitespace_replacement, transliterate
# non-ASCII characters through TRANSLITERATIONS, then replace every
# character outside a-z, 0-9, "-", "+" and "_" with :default_replacement.
#
# NOTE(review): the final filter is case-sensitive, so upper-case ASCII
# letters are replaced rather than downcased — confirm this is intended.
#
# @param string [String] source text; it is not modified
# @param opts [Hash]
#   :default_replacement    – string to use for unknown characters (default: "")
#   :whitespace_replacement – string to use for each whitespace run (default: "-")
# @return [String] the slug
def urlify(string, opts = {})
  opts = {:whitespace_replacement => '-', :default_replacement => ""}.merge(opts)

  # Trim first: once whitespace has been turned into the replacement string
  # there is nothing left at the edges to strip, and " a b " would become
  # "-a-b-".
  slug = string.gsub(/\A(?:\s|#{NBSP})+|(?:\s|#{NBSP})+\z/, '')
  slug.gsub!(WHITESPACES, opts[:whitespace_replacement])
  # Still needed when the replacement itself contains whitespace
  slug.strip!

  slug.gsub!(/[^\x00-\x7f]/u) { |char| TRANSLITERATIONS[char] || opts[:default_replacement] }
  slug.gsub!(/[^a-z0-9\-+_]/, opts[:default_replacement])
  slug
end