Class: Scylla::Classifier

Inherits:
Object
  • Object
show all
Defined in:
lib/scylla/classifier.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(limit = 10, ngrams = 400, threshold = 1.04) ⇒ Classifier

Parameters:
limit: the maximum number of matching languages to return.
ngrams: the total number of n-grams stored for each language.
threshold: the score threshold for matches.



8
9
10
11
12
# File 'lib/scylla/classifier.rb', line 8

# Creates a classifier and records its tuning parameters.
#
# limit     - stored in @limit (default 10).
# ngrams    - stored in @ngrams (default 400); get_score adds this value as
#             the penalty for an n-gram missing from a language profile.
# threshold - stored in @threshold (default 1.04); classify multiplies the
#             best score by it to decide which other languages to report.
def initialize(limit = 10, ngrams = 400, threshold = 1.04)
  @limit, @ngrams, @threshold = limit, ngrams, threshold
end

Instance Attribute Details

#input ⇒ Object

Returns the value of attribute input.



3
4
5
# File 'lib/scylla/classifier.rb', line 3

# Returns the text currently held in @input.
# classify_string and classify_file assign @input before calling classify;
# initialize does not set it.
def input
  @input
end

#limit ⇒ Object

Returns the value of attribute limit.



3
4
5
# File 'lib/scylla/classifier.rb', line 3

# Returns @limit, the value given as +limit+ to initialize (10 by default).
def limit
  @limit
end

#ngrams ⇒ Object

Returns the value of attribute ngrams.



3
4
5
# File 'lib/scylla/classifier.rb', line 3

# Returns @ngrams, the value given as +ngrams+ to initialize (400 by default).
# get_score adds this value to the score for every n-gram that is missing
# from the language profile being compared.
def ngrams
  @ngrams
end

#threshold ⇒ Object

Returns the value of attribute threshold.



3
4
5
# File 'lib/scylla/classifier.rb', line 3

# Returns @threshold, the value given as +threshold+ to initialize
# (1.04 by default). classify multiplies the best score by it to obtain the
# cutoff below which further languages are still reported.
def threshold
  @threshold
end

Instance Method Details

#classify ⇒ Object

Classifies @input to a list of languages in order of best match



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/scylla/classifier.rb', line 28

# Classifies @input to a list of languages in order of best match.
#
# Every language returned by Scylla::Loader.languages is scored against the
# profile that Scylla::Generator#create_lm builds from @input (lower score =
# closer match). The best language is always reported; the following ones are
# reported for as long as their score stays below @threshold * best score.
# At most @limit languages are returned when @limit is a positive number.
#
# Returns an Array of language keys, best match first, or nil (after writing
# a hint to stderr) when no language models are available.
def classify
  languages = Scylla::Loader.languages
  if languages.empty?
    warn "No languages (.lm files) found in #{Scylla::Loader.dir}. " \
         'Please run rake scylla:train after placing your training texts in the source_texts directory.'
    return
  end

  unknown = Scylla::Generator.new.create_lm(@input)
  scored = languages.map { |name, profile| [name, get_score(unknown, profile)] }
  # Sorting on [score, position] keeps tied languages in their original order,
  # so the result is deterministic.
  ranked = scored.sort_by.with_index { |(_, score), position| [score, position] }

  best_name, best_score = ranked.first
  cutoff = @threshold * best_score
  runners_up = ranked.drop(1).take_while { |_, score| score < cutoff }.map(&:first)
  answers = [best_name] + runners_up

  # Honour the configured cap on the number of reported languages.
  @limit && @limit > 0 ? answers.first(@limit) : answers
end

#classify_file(path) ⇒ Object

Classifies a file to a list of languages in order of best match



21
22
23
24
25
# File 'lib/scylla/classifier.rb', line 21

# Classifies a file to a list of languages in order of best match.
#
# Reads the file at +path+ line by line, strips each line and appends it to
# @input preceded by a single space, then delegates to classify.
#
# path - location of the text file to classify.
#
# Returns whatever classify returns.
def classify_file(path)
  @input = ''.dup
  # Stream the file and append in place: avoids loading every line into an
  # array and re-copying the whole text for each appended line.
  File.foreach(path) { |line| @input << ' ' << line.strip }
  classify
end

#classify_string(text) ⇒ Object

Classifies a string to a list of languages in order of best match



15
16
17
18
# File 'lib/scylla/classifier.rb', line 15

# Classifies a string to a list of languages in order of best match.
#
# Stores +text+ in @input and delegates to classify.
#
# Returns whatever classify returns.
def classify_string(text)
  @input = text
  classify
end

#get_score(unknown, ngram) ⇒ Object

Gets the score of the text in question compared to a particular language



51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/scylla/classifier.rb', line 51

# Gets the score of the text in question compared to a particular language.
#
# The score is an "out of place" distance: for each of the first @ngrams
# entries of +unknown+, the absolute difference between its position in
# +unknown+ and the value +ngram+ holds for it is added; an entry that
# +ngram+ does not contain costs a flat @ngrams. Lower means closer.
#
# unknown - indexable, sized collection of n-grams for the text in question
#           (presumably ordered by rank; verify against Generator#create_lm).
# ngram   - lookup from n-gram to its position in a language profile.
#
# Returns the accumulated distance as a number (0 for an empty +unknown+).
def get_score(unknown, ngram)
  # Cap on the configured profile size rather than a literal 400, so the
  # number of compared entries and the miss penalty always agree.
  compared = [unknown.size, @ngrams].min
  (0...compared).sum do |position|
    rank = ngram[unknown[position]]
    rank ? (rank - position).abs : @ngrams
  end
end