Class: String

Inherits:
Object
  • Object
show all
Defined in:
lib/hebrew.rb

Overview

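Extends the String class with helpers for Hebrew text (detecting and stripping Hebrew characters and nikkud) in UTF-8 and Windows-1255 encoded strings.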

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.is_codepoint_nikkud_cp1255(cp) ⇒ Object



143
144
145
146
# File 'lib/hebrew.rb', line 143

# Tests whether a Windows-1255 codepoint is treated as a nikkud mark.
#
# @param cp [Integer] codepoint, e.g. as yielded by String#each_codepoint
# @return [Boolean] true for 192..204 and for 209 or 210, false otherwise
def self.is_codepoint_nikkud_cp1255(cp)
  # Plain comparisons are deliberate: NIKKUD_CP1255.include?(cp) would be
  # cleaner, but much slower.
  return true if cp > 191 && cp < 205

  cp == 209 || cp == 210
end

.is_codepoint_nikkud_utf8(cp) ⇒ Object

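Returns true if the given codepoint is a nikkud mark in UTF-8 (U+05B0–U+05BC, U+05C1 or U+05C2). Note: a constant lookup such as NIKKUD_UTF8.include?(cp) would be cleaner, but much slower.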



147
148
149
150
# File 'lib/hebrew.rb', line 147

# Tests whether a Unicode codepoint is treated as a nikkud mark.
#
# @param cp [Integer] codepoint, e.g. as yielded by String#each_codepoint
# @return [Boolean] true for U+05B0..U+05BC and for U+05C1 or U+05C2
def self.is_codepoint_nikkud_utf8(cp)
  # Plain comparisons are deliberate: NIKKUD_UTF8.include?(cp) would be
  # cleaner, but much slower.
  return true if cp > 0x05af && cp < 0x05bd

  cp == 0x05c1 || cp == 0x05c2
end

.is_final_by_encoding(c, encoding) ⇒ Object

this will return true if the first parameter is a final letter in the encoding of the second parameter



162
163
164
165
166
167
168
169
# File 'lib/hebrew.rb', line 162

# Tests whether +c+ is listed as a final letter for the given encoding.
#
# @param c [Object] value looked up in the finals table for +encoding+
#   (presumably a one-character String - confirm against the constants)
# @param encoding [Encoding] encoding whose finals table is consulted
# @return [Boolean, nil] result of the table lookup, or nil when +encoding+
#   is neither UTF-8 nor Windows-1255
def self.is_final_by_encoding(c, encoding)
  case encoding
  when Encoding::UTF_8
    # NOTE(review): this constant is spelled FIANLS (not FINALS); confirm it
    # matches the spelling used where the constant is defined.
    FIANLS_UTF8.include?(c)
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma lists alternatives; `A || B` would only ever be compared to A.
    FINALS_CP1255.include?(c)
  end
end

.is_nikkud_by_encoding(c, encoding) ⇒ Object

this will return true if the first parameter is a nikkud character in the encoding of the second parameter



152
153
154
155
156
157
158
159
160
# File 'lib/hebrew.rb', line 152

# Tests whether the first codepoint of +c+ is a nikkud mark in +encoding+.
#
# @param c [String] character to test; only its first codepoint is examined
# @param encoding [Encoding] encoding that selects which nikkud check is used
# @return [Boolean, nil] result of the encoding-specific check, or nil when
#   +encoding+ is neither UTF-8 nor Windows-1255
def self.is_nikkud_by_encoding(c, encoding)
  case encoding
  when Encoding::UTF_8
    is_codepoint_nikkud_utf8(c.codepoints.first)
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma lists alternatives; `A || B` would only ever be compared to A.
    is_codepoint_nikkud_cp1255(c.codepoints.first)
  # TODO: add Mac encoding?
  end
end

Instance Method Details

#any_hebrew?Boolean

this will return true if the string contains any Hebrew character (short circuit)

Returns:

  • (Boolean)


77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/hebrew.rb', line 77

# Reports whether the string contains at least one Hebrew codepoint.
# Scanning stops at the first match.
#
# @return [Boolean] true if any codepoint passes the encoding-specific Hebrew
#   check; false otherwise, including for encodings other than UTF-8 and
#   Windows-1255
def any_hebrew?
  case encoding
  when Encoding::UTF_8
    each_codepoint.any? { |cp| is_hebrew_codepoint_utf8(cp) }
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma lists alternatives; `A || B` would only ever be compared to A.
    each_codepoint.any? { |cp| is_hebrew_codepoint_cp1255(cp) }
  else
    false
  end
end

#any_nikkud?Boolean

Returns:

  • (Boolean)


116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/hebrew.rb', line 116

# Reports whether the string contains at least one nikkud codepoint.
# Scanning stops at the first match.
#
# Strings in an encoding other than UTF-8 or Windows-1255 yield false.
# (Dispatching a one-argument call to a zero-argument instance method through
# the String class object raised NoMethodError for such strings instead.)
#
# @return [Boolean] true if any codepoint passes the encoding-specific nikkud
#   check, false otherwise
def any_nikkud?
  case encoding
  when Encoding::UTF_8
    each_codepoint.any? { |cp| String.is_codepoint_nikkud_utf8(cp) }
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma lists alternatives; `A || B` would only ever be compared to A.
    each_codepoint.any? { |cp| String.is_codepoint_nikkud_cp1255(cp) }
  else
    false
  end
end

#falsehoodObject



90
91
92
# File 'lib/hebrew.rb', line 90

# Constant predicate: ignores the receiver and always answers false.
#
# @return [false]
def falsehood
  false
end

#is_hebrew_codepoint_cp1255(cp) ⇒ Object



129
130
131
# File 'lib/hebrew.rb', line 129

# Tests whether a Windows-1255 codepoint is a Hebrew character.
#
# Accepts the Hebrew letters (224..250, i.e. 0xE0..0xFA in Windows-1255) in
# addition to the point values 192..201, 203, 204, 209 and 210. Without the
# letter range, unpointed Hebrew text was never recognised as Hebrew.
#
# NOTE(review): other Hebrew-related Windows-1255 positions (e.g. 202,
# 205..208, 211..216) are still not accepted here - confirm whether they
# should be, for parity with the UTF-8 range check.
#
# @param cp [Integer] codepoint, e.g. as yielded by String#each_codepoint
# @return [Boolean]
def is_hebrew_codepoint_cp1255(cp)
  return true if cp >= 224 && cp <= 250 # letters

  (cp > 191 && cp < 202) || [203, 204, 209, 210].include?(cp)
end

#is_hebrew_codepoint_utf8(cp) ⇒ Object



132
133
134
# File 'lib/hebrew.rb', line 132

# Tests whether a Unicode codepoint lies in the inclusive range
# HEB_UTF8_START..HEB_UTF8_END (both constants are defined elsewhere).
#
# @param cp [Integer] codepoint, e.g. as yielded by String#each_codepoint
# @return [Boolean]
def is_hebrew_codepoint_utf8(cp)
  cp.between?(HEB_UTF8_START, HEB_UTF8_END)
end

#is_nikkud(c) ⇒ Object

this will return true if the parameter is a nikkud character



139
140
141
# File 'lib/hebrew.rb', line 139

# Tests whether +c+ is a nikkud character, judged in the receiver's encoding.
#
# @param c [String] character to test
# @return [Boolean, nil] whatever the class-level is_nikkud_by_encoding returns
def is_nikkud(c)
  own_encoding = encoding
  # The class method picks the check that matches this string's encoding.
  self.class.is_nikkud_by_encoding(c, own_encoding)
end

#naive_full_nikkudObject

this will add matres lectionis (yods and vavs as vowels) after diacritics that denote those vowels. The result won’t always be morphologically correct Hebrew, but is useful for generating the most likely variants users may search for when typing inputs (almost no Hebrew users know how to produce diacritics on the keyboard).



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/hebrew.rb', line 95

# Adds matres lectionis (yods and vavs used as vowels) after the diacritics
# that denote those vowels. The result is not always morphologically correct
# Hebrew, but is useful for generating likely spelling variants.
#
# Per character: HEB_UTF8_QUBBUTS is replaced by a vav-based sequence,
# HEB_UTF8_XIRIK is followed by a yod, and HEB_UTF8_XOLAM is followed by a vav
# unless the previous character is already a vav.
#
# @return [String, nil] the expanded string, or nil when the receiver is not
#   UTF-8 encoded (not implemented for other encodings for now)
def naive_full_nikkud
  return nil unless Encoding::UTF_8 == encoding

  expanded = +''
  prev_char = nil
  each_char do |c|
    cp = c.ord
    # Append in place; `+=` would allocate a new string for every character.
    expanded << (cp == HEB_UTF8_QUBBUTS ? 'וּ' : c) # replace Qubbuts with vav and shuruk
    expanded << 'י' if cp == HEB_UTF8_XIRIK
    expanded << 'ו' if cp == HEB_UTF8_XOLAM && prev_char != 'ו'
    prev_char = c
  end
  # get rid of extraneous yods possibly added because we weren't looking ahead
  expanded.gsub("\u05b4יי", "\u05b4י").gsub("\u05b4י\u05bcי", "\u05b4\u05bcי")
end

#strip_hebrewObject



23
24
25
26
27
28
29
30
# File 'lib/hebrew.rb', line 23

# Dispatches to the encoding-specific strip_hebrew_* method.
#
# @return [String, nil] result of strip_hebrew_utf8 or strip_hebrew_cp1255, or
#   nil when the receiver's encoding is neither UTF-8 nor Windows-1255
def strip_hebrew
  case encoding
  when Encoding::UTF_8
    strip_hebrew_utf8
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma lists alternatives; `A || B` would only ever be compared to A.
    strip_hebrew_cp1255
  end
end

#strip_hebrew_cp1255Object



40
41
42
43
44
45
46
47
48
# File 'lib/hebrew.rb', line 40

# Builds a Windows-1255 copy of the string without the codepoints that the
# CP1255 nikkud check or the CP1255 Hebrew check accepts.
#
# @return [String] new Windows-1255 encoded string
def strip_hebrew_cp1255
  each_codepoint.each_with_object(String.new(encoding: 'windows-1255')) do |cp, kept|
    next if self.class.is_codepoint_nikkud_cp1255(cp) || is_hebrew_codepoint_cp1255(cp)

    kept << cp.chr(Encoding::CP1255) # is there a neater way?
  end
end

#strip_hebrew_utf8Object



31
32
33
34
35
36
37
38
39
# File 'lib/hebrew.rb', line 31

# Builds a copy of the string without the codepoints that the UTF-8 nikkud
# check or the UTF-8 Hebrew check accepts.
#
# @return [String] new string holding the remaining characters
def strip_hebrew_utf8
  each_codepoint.each_with_object(+'') do |cp, kept|
    next if self.class.is_codepoint_nikkud_utf8(cp) || is_hebrew_codepoint_utf8(cp)

    kept << cp.chr(Encoding::UTF_8)
  end
end

#strip_nikkudObject

this will return the string, stripped of any Hebrew nikkud characters



50
51
52
53
54
55
56
57
# File 'lib/hebrew.rb', line 50

# Returns the string stripped of Hebrew nikkud characters by dispatching to
# the encoding-specific strip_nikkud_* method.
#
# @return [String, nil] result of strip_nikkud_utf8 or strip_nikkud_cp1255, or
#   nil when the receiver's encoding is neither UTF-8 nor Windows-1255
def strip_nikkud
  case encoding
  when Encoding::UTF_8
    strip_nikkud_utf8
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma lists alternatives; `A || B` would only ever be compared to A.
    strip_nikkud_cp1255
  end
end

#strip_nikkud_cp1255Object



58
59
60
61
62
63
64
65
66
# File 'lib/hebrew.rb', line 58

# Builds a Windows-1255 copy of the string without the codepoints that the
# CP1255 nikkud check accepts.
#
# @return [String] new Windows-1255 encoded string
def strip_nikkud_cp1255
  each_codepoint.each_with_object(String.new(encoding: 'windows-1255')) do |cp, kept|
    next if self.class.is_codepoint_nikkud_cp1255(cp)

    kept << cp.chr(Encoding::CP1255) # is there a neater way?
  end
end

#strip_nikkud_utf8Object



67
68
69
70
71
72
73
74
75
# File 'lib/hebrew.rb', line 67

# Builds a copy of the string without the codepoints that the UTF-8 nikkud
# check accepts.
#
# @return [String] new string holding the remaining characters
def strip_nikkud_utf8
  each_codepoint.each_with_object(+'') do |cp, kept|
    next if self.class.is_codepoint_nikkud_utf8(cp)

    kept << cp.chr(Encoding::UTF_8)
  end
end