Class: LanguageDetector::Profile

Inherits:

Object

Object
LanguageDetector::Profile

show all

Defined in: lib/language_detector.rb

Constant Summary collapse

LIMIT =

PUNCTUATION_REGEX =

/[\W^_\d]+/

Instance Attribute Summary collapse

#name ⇒ Object

Returns the value of attribute name.
#ngrams ⇒ Object

Returns the value of attribute ngrams.

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ `Profile`

Returns a new instance of Profile.

# File 'lib/language_detector.rb', line 150

# Builds a profile and optionally trains it from inline text and/or a file.
#
# Only the first positional argument is consulted. It is treated as an options
# hash with the optional keys +:name+, +:text+ and +:file+.
#
# @param args [Array] argument list; +args.first+ is the options hash
# @return [Profile] a new instance of Profile
def initialize(*args)
  # Fall back to an empty hash so that calling the constructor with no
  # arguments yields an empty, unnamed profile instead of raising
  # NoMethodError on nil.
  options = args.first || {}

  @name = options[:name] || ""
  @ngrams = {}

  init_with_string(options[:text]) if options[:text]
  init_with_file(options[:file]) if options[:file]
end

Instance Attribute Details

#name ⇒ `Object`

Returns the value of attribute name.



148
149
150

# File 'lib/language_detector.rb', line 148

# Reader for the +@name+ instance variable.
#
# @return [Object] the current value of +@name+
def name
  @name
end

#ngrams ⇒ `Object`

Returns the value of attribute ngrams.



148
149
150

# File 'lib/language_detector.rb', line 148

# Reader for the +@ngrams+ instance variable.
#
# @return [Object] the current value of +@ngrams+
def ngrams
  @ngrams
end

Instance Method Details

#compute_distance(other_profile) ⇒ `Object`

# File 'lib/language_detector.rb', line 160

# Computes how far this profile is from +other_profile+.
#
# For every n-gram in +other_profile.ngrams+, adds the absolute difference
# between its value there and its value in this profile's +@ngrams+. N-grams
# that this profile does not contain add +LIMIT+ instead.
#
# @param other_profile [#ngrams] object whose +ngrams+ yields key/value pairs
# @return [Integer] accumulated distance; 0 when +other_profile+ has no n-grams
def compute_distance(other_profile)
  other_profile.ngrams.reduce(0) do |distance, (ngram, rank)|
    own_rank = @ngrams[ngram] # looked up once; nil/false means "not in this profile"
    distance + (own_rank ? (rank - own_rank).abs : LIMIT)
  end
end

#count_ngram(token, n, counts) ⇒ `Object`

# File 'lib/language_detector.rb', line 207

# Tallies every n-gram of length +n+ found in +token+ into +counts+.
#
# When +n+ is greater than 1 and the token has at least +n+ characters, the
# token is first wrapped as "_token" followed by +n - 1+ underscores, so that
# n-grams touching the start and end of the token are counted as well.
# A token shorter than +n+ contributes nothing.
#
# @param token [String] the token to slice
# @param n [Integer] n-gram length
# @param counts [Hash] n-gram => occurrence count; updated in place
# @return [Hash] the same +counts+ object
def count_ngram(token, n, counts)
  token = "_#{token}#{'_' * (n - 1)}" if n > 1 && token.length >= n

  # Slide a window of width n over the token; the last valid start index is
  # token.length - n, so the loop is empty when the token is too short.
  0.upto(token.length - n) do |start|
    gram = token[start, n]
    counts[gram] = counts.fetch(gram, 0) + 1
  end

  counts
end

#generate_ngrams(str, ngram_count) ⇒ `Object`

# File 'lib/language_detector.rb', line 197

# Splits +str+ into tokens and counts the 2-, 3-, 4- and 5-grams of each
# token into +ngram_count+.
#
# @param str [String] text to analyse
# @param ngram_count [Hash] tally that is updated in place via +count_ngram+
# @return [Array] the tokens produced by +tokenize+
def generate_ngrams(str, ngram_count)
  tokenize(str).each do |word|
    (2..5).each { |size| count_ngram(word, size, ngram_count) }
  end
end

#init_with_file(filename) ⇒ `Object`

# File 'lib/language_detector.rb', line 174

# Trains the profile from a file in the +training_data+ directory that sits
# next to this source file.
#
# Each line of the file is fed to +generate_ngrams+. The resulting n-grams are
# then ordered by descending count, and the +LIMIT+ most frequent ones are
# stored in +@ngrams+ with their 1-based rank as the value.
#
# @param filename [String] file name relative to the +training_data+ directory
# @return [Array] the ranked [n-gram, count] pairs that were stored
# @raise [SystemCallError] if the file cannot be opened (e.g. Errno::ENOENT)
def init_with_file(filename)
  ngram_count = Hash.new(0)

  path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
  # File.foreach closes the handle when done; a bare File.open(...).each_line
  # leaves it open until garbage collection.
  File.foreach(path) { |line| generate_ngrams(line, ngram_count) }
  puts "training with " + path

  # Keep exactly LIMIT entries so that no stored rank exceeds LIMIT, which
  # compute_distance uses as the penalty for a missing n-gram.
  ranked = ngram_count.sort { |a, b| b[1] <=> a[1] }
  ranked.first(LIMIT).each_with_index do |(ngram, _count), index|
    @ngrams[ngram] = index + 1
  end
end

#init_with_string(str) ⇒ `Object`

# File 'lib/language_detector.rb', line 187

# Trains the profile from an in-memory string.
#
# The string is fed to +generate_ngrams+. The resulting n-grams are ordered by
# descending count, and the +LIMIT+ most frequent ones are stored in
# +@ngrams+ with their 1-based rank as the value.
#
# @param str [String] training text
# @return [Array] the ranked [n-gram, count] pairs that were stored
def init_with_string(str)
  ngram_count = {}
  generate_ngrams(str, ngram_count)

  # Keep exactly LIMIT entries so that no stored rank exceeds LIMIT, which
  # compute_distance uses as the penalty for a missing n-gram.
  ranked = ngram_count.sort { |a, b| b[1] <=> a[1] }
  ranked.first(LIMIT).each_with_index do |(ngram, _count), index|
    @ngrams[ngram] = index + 1
  end
end

#is_punctuation?(char) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/language_detector.rb', line 205

# Tests +char+ against +PUNCTUATION_REGEX+.
#
# The result is the raw value of the +=~+ operator rather than a strict
# true/false, so it should be used for its truthiness.
#
# @param char [String] text to test
# @return [Integer, nil] for a String, the index of the first match, or nil
#   when nothing matches
def is_punctuation?(char)
  char =~ PUNCTUATION_REGEX
end

#tokenize(str) ⇒ `Object`

# File 'lib/language_detector.rb', line 204

# Splits +str+ into tokens wherever +PUNCTUATION_REGEX+ matches.
#
# Because this is a plain +String#split+, a string that starts with a
# separator yields an empty first token.
#
# @param str [String] text to split
# @return [Array<String>] the pieces between separator runs
def tokenize(str)
  str.split(PUNCTUATION_REGEX)
end

Class: LanguageDetector::Profile

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ Profile

Instance Attribute Details

#name ⇒ Object

#ngrams ⇒ Object

Instance Method Details

#compute_distance(other_profile) ⇒ Object

#count_ngram(token, n, counts) ⇒ Object

#generate_ngrams(str, ngram_count) ⇒ Object

#init_with_file(filename) ⇒ Object

#init_with_string(str) ⇒ Object

#is_punctuation?(char) ⇒ Boolean

#tokenize(str) ⇒ Object

#initialize(*args) ⇒ `Profile`

#name ⇒ `Object`

#ngrams ⇒ `Object`

#compute_distance(other_profile) ⇒ `Object`

#count_ngram(token, n, counts) ⇒ `Object`

#generate_ngrams(str, ngram_count) ⇒ `Object`

#init_with_file(filename) ⇒ `Object`

#init_with_string(str) ⇒ `Object`

#is_punctuation?(char) ⇒ `Boolean`

#tokenize(str) ⇒ `Object`