Class: LanguageDetector::Profile

Inherits:
Object
Defined in:
lib/language_detector/profile.rb

Constant Summary

IGNORE_CHARACTERS = [?., ?\,, ?:, ?;, ?\w, ?\n]
LIMIT = 2000

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(name) ⇒ Profile

Returns a new instance of Profile.



# File 'lib/language_detector/profile.rb', line 21

def initialize(name)
  @name = name
  @ignore_characters = {}
  IGNORE_CHARACTERS.each {|p| @ignore_characters[p] = 1}
  @ngrams = {}
end
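
A minimal usage sketch (not taken from the gem's own documentation): a freshly built profile only carries its name and an empty n-gram table until one of the init_with_* methods populates it.

profile = LanguageDetector::Profile.new('en')
profile.name    # => "en"
profile.ngrams  # => {} until init_with_string or init_with_training_file is called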

Instance Attribute Details

#name ⇒ Object (readonly)

Returns the value of attribute name.



# File 'lib/language_detector/profile.rb', line 19

def name
  @name
end

#ngrams ⇒ Object (readonly)

Returns the value of attribute ngrams.



# File 'lib/language_detector/profile.rb', line 19

def ngrams
  @ngrams
end

Instance Method Details

#_init_with_string(str, ngram_count) ⇒ Object



# File 'lib/language_detector/profile.rb', line 88

def _init_with_string str, ngram_count
  tokens = tokenize(str)
  tokens.each {|token|
    count_ngram token, 2, ngram_count
    count_ngram token, 3, ngram_count
    count_ngram token, 4, ngram_count
    count_ngram token, 5, ngram_count
  }
end
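
An illustrative sketch of what this helper accumulates, assuming the sample string below: every token contributes its 2- to 5-gram counts to the hash passed in.

profile = LanguageDetector::Profile.new('demo')
counts = {}
profile._init_with_string('cat dog', counts)
counts['cat']  # => 1 (trigram taken from the padded token "_cat__")
counts['_d']   # => 1 (leading bigram of the padded token "_dog_")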

#append_next_token(tokens, new_token) ⇒ Object



# File 'lib/language_detector/profile.rb', line 45

def append_next_token(tokens, new_token)
  if !new_token.empty?
    tokens << new_token
  end
end
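
A tiny illustrative example: only non-empty fragments are appended, so the separators encountered by #tokenize never produce blank tokens.

profile = LanguageDetector::Profile.new('demo')
tokens = []
profile.append_next_token(tokens, 'fox')  # tokens == ["fox"]
profile.append_next_token(tokens, '')     # ignored; tokens still == ["fox"]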

#compute_distance(profile) ⇒ Object



# File 'lib/language_detector/profile.rb', line 6

def compute_distance profile
  distance = 0
  profile.ngrams.each {|k, v|
    n = @ngrams[k]
    if n
      distance += (v - n).abs
    else
      distance += LanguageDetector::Profile::LIMIT
    end
  }
  return distance
end
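
A hedged sketch of this out-of-place distance, with purely illustrative sample text: every n-gram of the other profile adds the absolute difference of its two ranks, or the LIMIT penalty when the receiver has never seen it, so a smaller total means a closer match.

english = LanguageDetector::Profile.new('en')
english.init_with_string('the quick brown fox jumps over the lazy dog')

unknown = LanguageDetector::Profile.new('unknown')
unknown.init_with_string('the lazy dog sleeps')

english.compute_distance(unknown)  # => Integer; lower values indicate more similar profiles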

#count_ngram(token, n, counts) ⇒ Object



# File 'lib/language_detector/profile.rb', line 98

def count_ngram token, n, counts
  if n > 1 && token.length >= n
    token = "_#{token}#{'_' * (n-1)}"
  end

  i = 0
  while i + n <= token.length

    s = ''
    j = 0

    while j < n
      s << token[i+j]
      j += 1
    end

    if counts[s]
      counts[s] = counts[s] + 1
    else
      counts[s] = 1
    end
    i += 1
  end
  return counts
end
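
A small sketch of the padding and the sliding window, using an illustrative token: the token is framed with one leading underscore and n-1 trailing underscores before each n-character slice is counted.

profile = LanguageDetector::Profile.new('demo')
counts = {}
profile.count_ngram('cat', 2, counts)
# the token is padded to "_cat_", so counts is now
# { "_c" => 1, "ca" => 1, "at" => 1, "t_" => 1 }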

#init_with_string(str) ⇒ Object



# File 'lib/language_detector/profile.rb', line 74

def init_with_string str
  ngram_count = {}

  _init_with_string str, ngram_count

  a = ngram_count.sort {|a,b| b[1] <=> a[1]}
  i = 1
  a.each {|t|
    @ngrams[t[0]] = i
    i += 1
    break if i > LIMIT
  }
end
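
A hedged example of building a profile straight from text: n-grams are ranked by descending frequency (rank 1 is the most frequent) and at most LIMIT entries are kept.

profile = LanguageDetector::Profile.new('en')
profile.init_with_string('the quick brown fox jumps over the lazy dog')
profile.ngrams['the']  # => Integer rank of the trigram "the"
profile.ngrams.size <= LanguageDetector::Profile::LIMIT  # => true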

#init_with_training_file(filename) ⇒ Object



# File 'lib/language_detector/profile.rb', line 55

def init_with_training_file filename
  ngram_count = {}

  path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))

  File.open(path).each_line{ |line|
    _init_with_string line, ngram_count
  }

  a = ngram_count.sort {|a,b| b[1] <=> a[1]}

  i = 1
  a.each {|t|
    @ngrams[t[0]] = i
    i += 1
    break if i > LIMIT
  }
end
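
A hedged sketch only: 'en.txt' below is an assumed file name for illustration, and the real names depend on what ships in lib/language_detector/training_data/. Each line of the file is fed through #_init_with_string and the counts are ranked exactly as in #init_with_string.

profile = LanguageDetector::Profile.new('en')
profile.init_with_training_file('en.txt')  # resolved relative to training_data/
profile.ngrams.size                        # at most LIMIT (2000) ranked n-grams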

#is_valid_character?(char) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/language_detector/profile.rb', line 51

def is_valid_character? char
  char.match(/[^a-z]/).nil?
end
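
Illustrative values: only unaccented lowercase ASCII letters pass, so digits, punctuation, and accented characters all act as token separators in #tokenize.

profile = LanguageDetector::Profile.new('demo')
profile.is_valid_character?('a')  # => true
profile.is_valid_character?('1')  # => false
profile.is_valid_character?('é')  # => false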

#tokenize(line) ⇒ Object



# File 'lib/language_detector/profile.rb', line 28

def tokenize line
  tokens = []
  new_token = ''

  line.downcase.each_char {|c|
    if is_valid_character?(c)
      new_token << c
    else
      append_next_token(tokens, new_token)
      new_token = ''
    end
  }

  append_next_token(tokens, new_token)
  return tokens
end
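
A hedged example with an illustrative input: the line is lowercased, split at every character rejected by #is_valid_character?, and empty fragments are dropped.

profile = LanguageDetector::Profile.new('demo')
profile.tokenize('Hello, world 42!')  # => ["hello", "world"]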