Class: Langusta::LangProfile
- Inherits:
-
Object
- Object
- Langusta::LangProfile
- Defined in:
- lib/langusta/lang_profile.rb
Constant Summary collapse
- MINIMUM_FREQ =
2
- LESS_FREQ_RATIO =
100_000
Instance Attribute Summary collapse
-
#freq ⇒ Object
readonly
Returns the value of attribute freq.
-
#n_words ⇒ Object
readonly
Returns the value of attribute n_words.
-
#name ⇒ Object
readonly
Returns the value of attribute name.
Class Method Summary collapse
-
.load_from_file(filename) ⇒ LangProfile
Constructs a language profile from a file.
Instance Method Summary collapse
-
#add(gram) ⇒ Object
Adds a given NGram to this language profile.
-
#initialize(name, freq = {}, n_words = Array.new(NGram::N_GRAM, 0)) ⇒ LangProfile
constructor
A new instance of LangProfile.
- #omit_less_freq ⇒ Object
Constructor Details
#initialize(name, freq = {}, n_words = Array.new(NGram::N_GRAM, 0)) ⇒ LangProfile
Returns a new instance of LangProfile.
26 27 28 29 |
# File 'lib/langusta/lang_profile.rb', line 26
# Builds a profile from a language name plus optional pre-computed counts.
#
# @param name [String] profile name; validated with Guard.klass against String
# @param freq [Hash] n-gram => count table (a fresh empty Hash by default)
# @param n_words [Array<Integer>] running totals indexed by n-gram length - 1;
#   defaults to NGram::N_GRAM zeros
def initialize(name, freq = {}, n_words = Array.new(NGram::N_GRAM, 0))
  Guard.klass(name, String, __method__)
  @name = name
  @freq = freq
  @n_words = n_words
end
Instance Attribute Details
#freq ⇒ Object (readonly)
Returns the value of attribute freq.
7 8 9 |
# File 'lib/langusta/lang_profile.rb', line 7
# Read-only accessor for the n-gram frequency table.
#
# @return [Object] the current value of @freq
def freq
  @freq
end
#n_words ⇒ Object (readonly)
Returns the value of attribute n_words.
7 8 9 |
# File 'lib/langusta/lang_profile.rb', line 7
# Read-only accessor for the per-length n-gram totals.
#
# @return [Object] the current value of @n_words
def n_words
  @n_words
end
#name ⇒ Object (readonly)
Returns the value of attribute name.
7 8 9 |
# File 'lib/langusta/lang_profile.rb', line 7
# Read-only accessor for the profile name.
#
# @return [Object] the current value of @name
def name
  @name
end
Class Method Details
.load_from_file(filename) ⇒ LangProfile
Constructs a language profile from a file. Converts all NGrams from UTF-8 to Unicode codepoints.
12 13 14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/langusta/lang_profile.rb', line 12
# Constructs a language profile from a file parsed with Yajl, re-keying every
# entry of the 'freq' table through Langusta.utf82cp.
#
# @param filename [String] path of the profile file to read
# @return [LangProfile] profile built from the 'name', 'freq' and 'n_words'
#   entries of the parsed document
# @raise [CorruptProfileError] if 'freq', 'name' or 'n_words' is missing
def self.load_from_file(filename)
  # Block form guarantees the handle is closed, even when parsing raises;
  # a bare File.new would stay open until garbage collection.
  json = File.open(filename) { |io| Yajl::Parser.parse(io) }

  # Report a missing table the same way as the other missing fields instead
  # of failing with NoMethodError on nil.
  raw_freq = json['freq']
  raise CorruptProfileError, "Missing frequency table" unless raw_freq

  freq = raw_freq.inject({}) do |acc, (key, value)|
    acc[Langusta.utf82cp(key)] = value
    acc
  end

  name = json['name']
  raise CorruptProfileError, "Missing profile name" unless name

  n_words = json['n_words']
  raise CorruptProfileError, "Missing number of words value" unless n_words

  new(name, freq, n_words)
end
Instance Method Details
#add(gram) ⇒ Object
Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.
33 34 35 36 37 38 39 40 41 42 |
# File 'lib/langusta/lang_profile.rb', line 33
# Adds a given NGram to this language profile. Expected to be called many
# times with the same argument; each call bumps the counters by one.
#
# @param gram [Array, nil] the n-gram; nil is ignored, anything else is
#   checked with Guard.klass against Array
# @return [Integer, nil] the updated count for gram, or nil when the call was
#   ignored (nil gram, or size outside 1..NGram::N_GRAM)
def add(gram)
  return if gram.nil?

  Guard.klass(gram, Array, __method__)
  length = gram.size
  return if length < 1 || length > NGram::N_GRAM

  # Totals are indexed by n-gram length minus one.
  @n_words[length - 1] += 1
  @freq[gram] = (@freq[gram] || 0) + 1
end
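#omit_less_freq ⇒ Object
Removes every n-gram whose count is at or below the frequency threshold. Then, if n-grams matching RegexHelper::ROMAN_REGEX account for less than a third of the remaining length-1 n-gram total, also removes every n-gram matching RegexHelper::INCL_ROMAN_REGEX.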
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/langusta/lang_profile.rb', line 44
# Prunes the profile in two passes, keeping @n_words in step with @freq.
#
# Pass 1 deletes every n-gram whose count is at or below the threshold
# (@n_words[0] / LESS_FREQ_RATIO, but never less than MINIMUM_FREQ) and sums
# the counts of surviving n-grams that match RegexHelper::ROMAN_REGEX.
#
# Pass 2 runs only when that sum is below a third of @n_words[0]; it deletes
# every n-gram matching RegexHelper::INCL_ROMAN_REGEX.
#
# @return [nil]
def omit_less_freq
  threshold = [@n_words[0] / LESS_FREQ_RATIO, MINIMUM_FREQ].max
  roman = 0

  # Hash#keys returns a fresh array, so deleting from @freq while walking it
  # is safe; the keys are already unique, so no Set is needed.
  @freq.keys.each do |key|
    count = @freq[key]
    if count <= threshold
      @n_words[key.size - 1] -= count
      @freq.delete(key)
    elsif RegexHelper::ROMAN_REGEX.match(Langusta.cp2utf8(key)) # temp workaround
      roman += count
    end
  end

  return nil unless roman < @n_words[0] / 3

  @freq.keys.each do |key|
    # temp workaround
    next unless RegexHelper::INCL_ROMAN_REGEX.match(Langusta.cp2utf8(key))

    @n_words[key.size - 1] -= @freq[key]
    @freq.delete(key)
  end
  nil
end