Class: Clarifier::StopWords

Inherits:
Object
  • Object
show all
Defined in:
lib/clarifier/stop_words.rb

Constant Summary collapse

@@lists =
{}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(desired_stopwords = nil, training_threshold = 0.8) ⇒ StopWords

Returns a new instance of StopWords.



12
13
14
15
16
17
18
19
20
21
# File 'lib/clarifier/stop_words.rb', line 12

# Builds a stop-word filter.
#
# desired_stopwords  - an Array of words to use as-is, or a key looked up in
#                      the class-level @@lists registry. When it is neither an
#                      Array nor a key with a truthy entry, the entry stored
#                      under :en_gb_basic is used.
# training_threshold - stored in @training_threshold; compared against each
#                      word's document ratio when stop words are selected
#                      from training data.
def initialize(desired_stopwords = nil, training_threshold = 0.8)
  @stopwords =
    if desired_stopwords.kind_of?(Array)
      desired_stopwords
    else
      # Named list if one is registered, otherwise the default list.
      @@lists[desired_stopwords] || @@lists[:en_gb_basic]
    end
  @training_threshold = training_threshold
end

Instance Attribute Details

#stopwords ⇒ Object

Returns the value of attribute stopwords.



4
5
6
# File 'lib/clarifier/stop_words.rb', line 4

# Returns the current stop-word list held in @stopwords (the same object,
# not a copy).
def stopwords
  @stopwords
end

Class Method Details

.lists ⇒ Object



8
9
10
# File 'lib/clarifier/stop_words.rb', line 8

# Returns the class-level @@lists registry, which the constructor indexes by
# key to find a predefined stop-word list. The registry itself is returned,
# not a copy.
def self.lists
  @@lists
end

Instance Method Details

#clarify(input) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/clarifier/stop_words.rb', line 23

# Returns a copy of +input+ with every stop word removed and whitespace
# normalised (runs collapsed to a single space, ends trimmed).
#
# A stop word is removed only where it stands alone: delimited on each side
# by whitespace or by the start/end of the text. Matching is
# case-insensitive. +input+ itself is never modified.
#
# input - the String to filter.
#
# Returns a new String.
def clarify(input)
  without_stopwords = @stopwords.reduce(input) do |text, word|
    # Zero-width lookarounds check the delimiters without consuming them.
    # Consuming the trailing space (as a plain `(\s|$)` group does) hides the
    # leading delimiter of an immediately following occurrence, so
    # "the the cat" would only lose its first "the".
    text.gsub(/(?<!\S)#{Regexp.escape(word)}(?!\S)/i, '')
  end

  without_stopwords.gsub(/\s+/, ' ').strip
end

#refine(doc) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
# File 'lib/clarifier/stop_words.rb', line 42

# Folds one more training document into the running statistics and then
# recomputes the stop-word list.
#
# Each distinct whitespace-separated token of +doc+ is counted once for this
# document, so @word_counts holds the number of documents a token appeared in.
# Counters are created lazily on first use.
#
# doc - the String training document.
#
# Returns the result of select_stopwords_from_training.
def refine(doc)
  @word_counts ||= Hash.new(0)
  @training_doc_count = (@training_doc_count || 0) + 1

  doc.split.uniq.each { |term| @word_counts[term] += 1 }

  select_stopwords_from_training
end

#reset ⇒ Object



36
37
38
39
40
# File 'lib/clarifier/stop_words.rb', line 36

# Discards all training state: the document counter goes back to zero, the
# per-word counts become a fresh zero-defaulting Hash, and the stop-word list
# is emptied.
#
# Returns the new, empty stop-word Array.
def reset
  @training_doc_count = 0
  @word_counts = Hash.new(0)
  @stopwords = []
end

#select_stopwords_from_training ⇒ Object



63
64
65
66
67
68
69
70
# File 'lib/clarifier/stop_words.rb', line 63

# Rebuilds @stopwords from the training counts.
#
# A word becomes a stop word when its count divided by the number of training
# documents is at least @training_threshold.
#
# Returns the @word_counts Hash (the receiver of the iteration).
def select_stopwords_from_training
  @stopwords = []
  @word_counts.each_pair do |term, seen_in|
    next unless seen_in.to_f / @training_doc_count >= @training_threshold

    @stopwords.push(term)
  end
end

#train(docs, threshold = @training_threshold) ⇒ Object



54
55
56
57
58
59
60
61
# File 'lib/clarifier/stop_words.rb', line 54

# Retrains from scratch on +docs+.
#
# Previous counts are thrown away, +threshold+ replaces the stored training
# threshold, and every document is passed to #refine in order.
#
# docs      - a collection of documents responding to #each.
# threshold - the ratio stored in @training_threshold (defaults to the
#             current value).
#
# Returns whatever docs.each returns.
def train(docs, threshold = @training_threshold)
  @training_threshold = threshold
  @training_doc_count = 0
  @word_counts = Hash.new(0)

  docs.each { |doc| refine(doc) }
end