Class: Clarifier::StopWords
- Inherits:
-
Object
- Object
- Clarifier::StopWords
- Defined in:
- lib/clarifier/stop_words.rb
Constant Summary collapse
- @@lists =
{}
Instance Attribute Summary collapse
-
#stopwords ⇒ Object
Returns the value of attribute stopwords.
Class Method Summary collapse
Instance Method Summary collapse
- #clarify(input) ⇒ Object
-
#initialize(desired_stopwords = nil, training_threshold = 0.8) ⇒ StopWords
constructor
A new instance of StopWords.
- #refine(doc) ⇒ Object
- #reset ⇒ Object
- #select_stopwords_from_training ⇒ Object
- #train(docs, threshold = @training_threshold) ⇒ Object
Constructor Details
#initialize(desired_stopwords = nil, training_threshold = 0.8) ⇒ StopWords
Returns a new instance of StopWords.
12 13 14 15 16 17 18 19 20 21 |
# File 'lib/clarifier/stop_words.rb', line 12
# Sets up the stop-word list and the training threshold for a new instance.
#
# @param desired_stopwords [Array, Object, nil] an Array is used directly as
#   the stop-word list; any other value is looked up as a key in @@lists, and
#   when that lookup yields nothing the entry under :en_gb_basic is used
# @param training_threshold [Numeric] stored in @training_threshold
def initialize(desired_stopwords = nil, training_threshold = 0.8)
  @stopwords =
    if desired_stopwords.is_a?(Array)
      desired_stopwords
    else
      @@lists[desired_stopwords] || @@lists[:en_gb_basic]
    end
  @training_threshold = training_threshold
end
Instance Attribute Details
#stopwords ⇒ Object
Returns the value of attribute stopwords.
4 5 6 |
# File 'lib/clarifier/stop_words.rb', line 4
# Reader for the stop-word list currently held by this instance.
#
# @return [Object] the value of @stopwords
def stopwords
  @stopwords
end
Class Method Details
.lists ⇒ Object
8 9 10 |
# File 'lib/clarifier/stop_words.rb', line 8
# Class-level reader for the @@lists class variable.
#
# @return [Object] the value of @@lists
def self.lists
  @@lists
end
Instance Method Details
#clarify(input) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/clarifier/stop_words.rb', line 23
# Returns a copy of +input+ with every stop word removed, whitespace runs
# collapsed to a single space, and leading/trailing whitespace trimmed.
# Matching is case-insensitive and only hits whole whitespace-delimited
# tokens. +input+ itself is not modified.
#
# @param input [String] the text to clean
# @return [String] a new string without the stop words
def clarify(input)
  new_string = input.dup
  @stopwords.each do |word|
    # Zero-width lookarounds ("not preceded / not followed by a
    # non-whitespace character") accept the same boundaries as
    # (^|\s)...(\s|$) but do not consume the delimiter. Consuming it made
    # back-to-back occurrences ("the the cat") skip every second match.
    new_string.gsub!(/(?<!\S)#{Regexp.escape(word)}(?!\S)/i, '')
  end
  # Removed words leave their surrounding whitespace behind; normalise it.
  new_string.gsub!(/\s+/, ' ')
  new_string.strip!
  new_string
end
#refine(doc) ⇒ Object
42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/clarifier/stop_words.rb', line 42
# Folds one more document into the training state and recomputes the
# stop-word list. Each distinct whitespace-separated token of +doc+ is
# counted once, however often it occurs within the document.
#
# @param doc [String] a training document
# @return [Object] whatever select_stopwords_from_training returns
def refine(doc)
  @word_counts ||= Hash.new(0)
  @training_doc_count = (@training_doc_count || 0) + 1
  doc.split.uniq.each { |token| @word_counts[token] += 1 }
  select_stopwords_from_training
end
#reset ⇒ Object
36 37 38 39 40 |
# File 'lib/clarifier/stop_words.rb', line 36
# Discards all training state and empties the stop-word list.
#
# @return [Array] the new, empty stop-word list
def reset
  @training_doc_count = 0
  @word_counts = Hash.new(0)
  @stopwords = []
end
#select_stopwords_from_training ⇒ Object
63 64 65 66 67 68 69 70 |
# File 'lib/clarifier/stop_words.rb', line 63
# Rebuilds @stopwords from the training counts: a word is kept when the
# fraction of training documents it was counted in is at least
# @training_threshold.
#
# @return [Hash] the @word_counts hash that was iterated
def select_stopwords_from_training
  @stopwords = []
  @word_counts.each_pair do |word, count|
    ratio = count.to_f / @training_doc_count
    @stopwords.push(word) if ratio >= @training_threshold
  end
end
#train(docs, threshold = @training_threshold) ⇒ Object
54 55 56 57 58 59 60 61 |
# File 'lib/clarifier/stop_words.rb', line 54
# Restarts training from scratch: clears the counters, stores +threshold+,
# then passes every document to refine in order.
#
# @param docs [#each] the training documents
# @param threshold [Numeric] stored in @training_threshold; defaults to the
#   value already held there
# @return [Object] the result of docs.each
def train(docs, threshold = @training_threshold)
  @training_threshold = threshold
  @training_doc_count = 0
  @word_counts = Hash.new(0)
  docs.each { |doc| refine(doc) }
end