Class: String

Inherits:

Object

Object
String

show all

Defined in: lib/hebrew.rb

Overview

Extends the String class with helpers for detecting and stripping Hebrew letters and nikkud (diacritics).

Class Method Summary collapse

.is_codepoint_nikkud_cp1255(cp) ⇒ Object
.is_codepoint_nikkud_utf8(cp) ⇒ Object

Returns true if the given code point is a nikkud character in UTF-8 (NIKKUD_UTF8.include?(cp) would be cleaner, but much slower).
.is_final_by_encoding(c, encoding) ⇒ Object

this will return true if the first parameter is a final letter in the encoding of the second parameter.
.is_nikkud_by_encoding(c, encoding) ⇒ Object

this will return true if the first parameter is a nikkud character in the encoding of the second parameter.

Instance Method Summary collapse

#any_hebrew? ⇒ Boolean

this will return true if the string contains any Hebrew character (short circuit).
#any_nikkud? ⇒ Boolean
#falsehood ⇒ Object
#is_hebrew_codepoint_cp1255(cp) ⇒ Object
#is_hebrew_codepoint_utf8(cp) ⇒ Object
#is_nikkud(c) ⇒ Object

this will return true if the parameter is a nikkud character.
#naive_full_nikkud ⇒ Object

this will add matres lectionis (yods and vavs as vowels) after diacritics that denote those vowels.
#strip_hebrew ⇒ Object
#strip_hebrew_cp1255 ⇒ Object
#strip_hebrew_utf8 ⇒ Object
#strip_nikkud ⇒ Object

this will return the string, stripped of any Hebrew nikkud characters.
#strip_nikkud_cp1255 ⇒ Object
#strip_nikkud_utf8 ⇒ Object

Class Method Details

.is_codepoint_nikkud_cp1255(cp) ⇒ `Object`

# File 'lib/hebrew.rb', line 143

# Tells whether a Windows-1255 (CP1255) code point counts as nikkud.
#
# Matches the code points 192 through 204, plus 209 and 210.
#
# @param cp [Integer] code point to classify
# @return [Boolean] true when cp is one of the matched code points
def self.is_codepoint_nikkud_cp1255(cp)
  # Hand-written comparisons on purpose: NIKKUD_CP1255.include?(cp) would be
  # cleaner, but much slower.
  in_contiguous_run = cp > 191 && cp < 205
  in_contiguous_run || cp == 209 || cp == 210
end

.is_codepoint_nikkud_utf8(cp) ⇒ `Object`

Returns true if the given code point is a nikkud character in UTF-8 (NIKKUD_UTF8.include?(cp) would be cleaner, but much slower).

# File 'lib/hebrew.rb', line 147

# Tells whether a Unicode code point counts as nikkud.
#
# Matches the code points U+05B0 through U+05BC, plus U+05C1 and U+05C2.
#
# @param cp [Integer] code point to classify
# @return [Boolean] true when cp is one of the matched code points
def self.is_codepoint_nikkud_utf8(cp)
  # Hand-written comparisons on purpose: NIKKUD_UTF8.include?(cp) would be
  # cleaner, but much slower.
  in_contiguous_run = cp > 0x05af && cp < 0x05bd
  in_contiguous_run || cp == 0x05c1 || cp == 0x05c2
end

.is_final_by_encoding(c, encoding) ⇒ `Object`

this will return true if the first parameter is a final letter in the encoding of the second parameter

# File 'lib/hebrew.rb', line 162

# Tells whether +c+ is a final letter in the given encoding, by looking it up
# in the finals list that belongs to that encoding.
#
# @param c candidate character, passed to +include?+ on the finals list
# @param encoding [Encoding] encoding that selects which finals list is used
# @return the result of the +include?+ lookup, or nil when +encoding+ is
#   neither UTF-8 nor Windows-1255
def self.is_final_by_encoding(c, encoding)
  case encoding
  when Encoding::UTF_8
    # NOTE(review): the constant is spelled FIANLS (not FINALS) here; its
    # definition is not visible from this method — confirm the names agree.
    FIANLS_UTF8.include?(c)
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma list is how `when` takes alternatives; `A || B` would be
    # evaluated first and collapse to A alone.
    FINALS_CP1255.include?(c)
  end
end

.is_nikkud_by_encoding(c, encoding) ⇒ `Object`

this will return true if the first parameter is a nikkud character in the encoding of the second parameter

# File 'lib/hebrew.rb', line 152

# Tells whether +c+ is a nikkud character in the given encoding.
#
# Only the first code point of +c+ is examined; it is handed to the
# code-point predicate that matches +encoding+.
#
# @param c [String] character to classify
# @param encoding [Encoding] encoding in which +c+ should be interpreted
# @return [Boolean, nil] the predicate's answer, or nil when +encoding+ is
#   neither UTF-8 nor Windows-1255
def self.is_nikkud_by_encoding(c, encoding)
  case encoding
  when Encoding::UTF_8
    is_codepoint_nikkud_utf8(c.codepoints.first)
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma list is how `when` takes alternatives; `A || B` would be
    # evaluated first and collapse to A alone.
    is_codepoint_nikkud_cp1255(c.codepoints.first)
  # TODO: add Mac encoding?
  end
end

Instance Method Details

#any_hebrew? ⇒ `Boolean`

this will return true if the string contains any Hebrew character (short circuit)

Returns:

(Boolean)

# File 'lib/hebrew.rb', line 77

# Tells whether the string contains at least one Hebrew code point.
#
# The scan short-circuits on the first match. Strings whose encoding is
# neither UTF-8 nor Windows-1255 are reported as containing no Hebrew.
#
# @return [Boolean]
def any_hebrew?
  case encoding
  when Encoding::UTF_8
    each_codepoint.any? { |cp| is_hebrew_codepoint_utf8(cp) }
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma list is how `when` takes alternatives; `A || B` would be
    # evaluated first and collapse to A alone.
    each_codepoint.any? { |cp| is_hebrew_codepoint_cp1255(cp) }
  else
    false
  end
end

#any_nikkud? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/hebrew.rb', line 116

# Tells whether the string contains at least one nikkud code point.
#
# The scan short-circuits on the first match. Strings whose encoding is
# neither UTF-8 nor Windows-1255 are reported as containing no nikkud.
#
# @return [Boolean]
def any_nikkud?
  predicate = case encoding
              when Encoding::UTF_8
                :is_codepoint_nikkud_utf8
              when Encoding::WINDOWS_1255, Encoding::CP1255
                :is_codepoint_nikkud_cp1255
              end
  # Unsupported encoding: answer directly. Dispatching a zero-argument
  # instance method through String.send with a code point argument would raise
  # for any non-empty string.
  return false if predicate.nil?

  each_codepoint.any? { |cp| String.send(predicate, cp) }
end

#falsehood ⇒ `Object`



90
91
92

# File 'lib/hebrew.rb', line 90

# Constant predicate: takes no arguments and always answers false.
#
# @return [false]
def falsehood
  false
end

#is_hebrew_codepoint_cp1255(cp) ⇒ `Object`



129
130
131

# File 'lib/hebrew.rb', line 129

# Tells whether a Windows-1255 (CP1255) code point is Hebrew.
#
# Matches the Hebrew letters alef..tav (224..250 in Windows-1255) as well as
# the code points 192..201, 203, 204, 209 and 210.
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def is_hebrew_codepoint_cp1255(cp)
  # Letters alef..tav occupy 0xE0..0xFA in Windows-1255; without this range
  # no actual Hebrew letter would ever be recognised.
  return true if cp >= 224 && cp <= 250

  (cp > 191 && cp < 202) || [203, 204, 209, 210].include?(cp)
end

#is_hebrew_codepoint_utf8(cp) ⇒ `Object`



132
133
134

# File 'lib/hebrew.rb', line 132

# Tells whether a Unicode code point lies within the inclusive range
# HEB_UTF8_START..HEB_UTF8_END.
#
# @param cp [Integer] code point to classify
# @return [Boolean]
def is_hebrew_codepoint_utf8(cp)
  cp.between?(HEB_UTF8_START, HEB_UTF8_END)
end

#is_nikkud(c) ⇒ `Object`

this will return true if the parameter is a nikkud character



139
140
141

# File 'lib/hebrew.rb', line 139

# Tells whether +c+ is a nikkud character, interpreting it in this string's
# own encoding.
#
# @param c [String] character to classify
# @return whatever the class-level is_nikkud_by_encoding returns for +c+
def is_nikkud(c)
  # The class method does the per-encoding dispatch; the receiver only
  # contributes its encoding.
  klass = self.class
  klass.is_nikkud_by_encoding(c, encoding)
end

#naive_full_nikkud ⇒ `Object`

This will add matres lectionis (yods and vavs as vowels) after diacritics that denote those vowels. The result won’t always be morphologically correct Hebrew, but it is useful for generating the most likely variants users may search for when typing inputs (almost no Hebrew users know how to produce diacritics on the keyboard).

# File 'lib/hebrew.rb', line 95

# Adds matres lectionis (yods and vavs used as vowels) after the diacritics
# that denote those vowels.
#
# Per character of a UTF-8 string:
# * HEB_UTF8_QUBBUTS is replaced by a vav with shuruk;
# * HEB_UTF8_XIRIK is kept and followed by a yod;
# * HEB_UTF8_XOLAM is kept and followed by a vav, unless the preceding
#   character already was a vav.
# A final pass collapses the doubled yods this can produce.
#
# @return [String, nil] the expanded string, or nil when the receiver is not
#   UTF-8 encoded
def naive_full_nikkud
  return nil unless encoding == Encoding::UTF_8 # not implemented for other encodings for now.

  ret = +''
  prev_char = nil
  each_char do |c|
    cp = c.codepoints.first # computed once per character
    # Append in place (<<) so the accumulator is not reallocated per character.
    if cp == HEB_UTF8_QUBBUTS
      ret << 'וּ' # replace Qubbuts with vav and shuruk
    else
      ret << c
    end
    ret << 'י' if cp == HEB_UTF8_XIRIK
    ret << 'ו' if cp == HEB_UTF8_XOLAM && prev_char != 'ו'
    prev_char = c
  end
  # get rid of extraneous yods possibly added because we weren't looking ahead
  ret.gsub("\u05b4יי","\u05b4י").gsub("\u05b4י\u05bcי", "\u05b4\u05bcי")
end

#strip_hebrew ⇒ `Object`

# File 'lib/hebrew.rb', line 23

# Returns a copy of the string with Hebrew characters removed, delegating to
# the variant that matches the receiver's encoding.
#
# @return [String, nil] the stripped copy, or nil when the encoding is neither
#   UTF-8 nor Windows-1255
def strip_hebrew
  case encoding
  when Encoding::UTF_8
    strip_hebrew_utf8
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma list is how `when` takes alternatives; `A || B` would be
    # evaluated first and collapse to A alone.
    strip_hebrew_cp1255
  end
end

#strip_hebrew_cp1255 ⇒ `Object`

# File 'lib/hebrew.rb', line 40

# Builds a Windows-1255 copy of the string that omits every code point
# classified as nikkud or as Hebrew by the CP1255 predicates.
#
# @return [String] new string tagged with the windows-1255 encoding
def strip_hebrew_cp1255
  each_codepoint.each_with_object(''.force_encoding('windows-1255')) do |cp, kept|
    next if self.class.is_codepoint_nikkud_cp1255(cp) || is_hebrew_codepoint_cp1255(cp)

    # Re-materialise the surviving code point as a one-character CP1255 string.
    kept << cp.chr(Encoding::CP1255)
  end
end

#strip_hebrew_utf8 ⇒ `Object`

# File 'lib/hebrew.rb', line 31

# Builds a UTF-8 copy of the string that omits every code point classified as
# nikkud or as Hebrew by the UTF-8 predicates.
#
# @return [String] new string holding the remaining characters
def strip_hebrew_utf8
  each_codepoint.each_with_object(+'') do |cp, kept|
    next if self.class.is_codepoint_nikkud_utf8(cp) || is_hebrew_codepoint_utf8(cp)

    kept << cp.chr(Encoding::UTF_8)
  end
end

#strip_nikkud ⇒ `Object`

this will return the string, stripped of any Hebrew nikkud characters

# File 'lib/hebrew.rb', line 50

# Returns a copy of the string with Hebrew nikkud characters removed,
# delegating to the variant that matches the receiver's encoding.
#
# @return [String, nil] the stripped copy, or nil when the encoding is neither
#   UTF-8 nor Windows-1255
def strip_nikkud
  case encoding
  when Encoding::UTF_8
    strip_nikkud_utf8
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # A comma list is how `when` takes alternatives; `A || B` would be
    # evaluated first and collapse to A alone.
    strip_nikkud_cp1255
  end
end

#strip_nikkud_cp1255 ⇒ `Object`

# File 'lib/hebrew.rb', line 58

# Builds a Windows-1255 copy of the string that omits every code point the
# CP1255 nikkud predicate accepts.
#
# @return [String] new string tagged with the windows-1255 encoding
def strip_nikkud_cp1255
  each_codepoint.each_with_object(''.force_encoding('windows-1255')) do |cp, kept|
    next if self.class.is_codepoint_nikkud_cp1255(cp)

    # Re-materialise the surviving code point as a one-character CP1255 string.
    kept << cp.chr(Encoding::CP1255)
  end
end

#strip_nikkud_utf8 ⇒ `Object`

# File 'lib/hebrew.rb', line 67

# Builds a UTF-8 copy of the string that omits every code point the UTF-8
# nikkud predicate accepts.
#
# @return [String] new string holding the remaining characters
def strip_nikkud_utf8
  each_codepoint.each_with_object(+'') do |cp, kept|
    next if self.class.is_codepoint_nikkud_utf8(cp)

    kept << cp.chr(Encoding::UTF_8)
  end
end

Class: String

Overview

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.is_codepoint_nikkud_cp1255(cp) ⇒ Object

.is_codepoint_nikkud_utf8(cp) ⇒ Object

.is_final_by_encoding(c, encoding) ⇒ Object

.is_nikkud_by_encoding(c, encoding) ⇒ Object

Instance Method Details

#any_hebrew? ⇒ Boolean

#any_nikkud? ⇒ Boolean

#falsehood ⇒ Object

#is_hebrew_codepoint_cp1255(cp) ⇒ Object

#is_hebrew_codepoint_utf8(cp) ⇒ Object

#is_nikkud(c) ⇒ Object

#naive_full_nikkud ⇒ Object

#strip_hebrew ⇒ Object

#strip_hebrew_cp1255 ⇒ Object

#strip_hebrew_utf8 ⇒ Object

#strip_nikkud ⇒ Object

#strip_nikkud_cp1255 ⇒ Object

#strip_nikkud_utf8 ⇒ Object