Class: FeldtRuby::WordCounter
Direct Known Subclasses
Constant Summary collapse
# Lower-case English words that are looked up by #is_stop_word?.
# Every entry must be free of surrounding whitespace: #preprocess_word strips
# words before they are compared against this list, so an entry such as
# "ours " (with a trailing space) could never match.
StopWords = [
  "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at",
  "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
  "can't", "cannot", "could", "couldn't",
  "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during",
  "each", "few", "for", "from", "further",
  "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
  "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
  "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
  "let's", "me", "more", "most", "mustn't", "my", "myself",
  "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
  "over", "own",
  "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such",
  "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these",
  "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
  "under", "until", "up", "very",
  "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's",
  "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't",
  "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"
]
Instance Method Summary collapse
- #count(word) ⇒ Object
- #count_word(word) ⇒ Object
- #count_words(string) ⇒ Object
-
#initialize ⇒ WordCounter
constructor
A new instance of WordCounter.
- #invidual_words_in_string(str) ⇒ Object
- #is_stop_word?(word) ⇒ Boolean
-
#merge! ⇒ Object
Merge words together that are plural or -ing (or -ming) forms of each other.
-
#preprocess_word(word) ⇒ Object
Ensure it has canonical form.
- #top_words(numberOfWords) ⇒ Object
- #words ⇒ Object
Constructor Details
#initialize ⇒ WordCounter
Returns a new instance of WordCounter.
2 3 4 |
# File 'lib/feldtruby/word_counter.rb', line 2
# Set up an empty counter. The backing hash maps each word to its number of
# occurrences and yields 0 for words that have not been counted yet.
def initialize
  @counts = Hash.new(0)
end
Instance Method Details
#count(word) ⇒ Object
28 29 30 |
# File 'lib/feldtruby/word_counter.rb', line 28
# Look up the current count for +word+. The word is first passed through
# #preprocess_word, so the lookup uses the same key form as #count_word.
def count(word)
  key = preprocess_word(word)
  @counts[key]
end
#count_word(word) ⇒ Object
11 12 13 14 |
# File 'lib/feldtruby/word_counter.rb', line 11
# Register one occurrence of +word+ unless it is a stop word.
# Returns the updated count, or nil when the word was skipped.
def count_word(word)
  normalized = preprocess_word(word)
  return if is_stop_word?(normalized)

  @counts[normalized] += 1
end
#count_words(string) ⇒ Object
20 21 22 |
# File 'lib/feldtruby/word_counter.rb', line 20
# Count every word found in +string+.
# Returns an array holding the result of #count_word for each word, in order.
def count_words(string)
  tokens = invidual_words_in_string(string)
  tokens.map do |token|
    count_word(token)
  end
end
#invidual_words_in_string(str) ⇒ Object
16 17 18 |
# File 'lib/feldtruby/word_counter.rb', line 16
# Split +str+ into its individual lower-cased words.
#
# Any run of characters that are neither word characters (\w) nor hyphens
# acts as a separator, so hyphenated words stay in one piece.
# NOTE: apostrophes are separators too, so "don't" yields "don" and "t".
#
# Returns an array of non-empty strings (empty for a blank string).
def invidual_words_in_string(str)
  # String#split keeps a leading "" when the string starts with a separator
  # (e.g. " hello"); drop it so the empty string is never treated as a word.
  str.downcase.split(/[^\w-]+/).reject { |token| token.empty? }
end
#is_stop_word?(word) ⇒ Boolean
38 39 40 |
# File 'lib/feldtruby/word_counter.rb', line 38
# Tell whether +word+ appears, exactly as given, in the StopWords list.
def is_stop_word?(word)
  StopWords.member?(word)
end
#merge! ⇒ Object
Merge words together that are plural or -ing (or -ming) forms of each other. Destructive, so only use this after all words have been added.
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/feldtruby/word_counter.rb', line 44
# Merge the counts of words that are plural ("-s"), "-ming" or "-ing" forms
# of a counted base word into a single entry.
#
# A base word is any counted word that ends in none of these suffixes. For
# each base word, every counted word of the form base + suffix is removed and
# its count added to the base word's count. The combined count is stored
# under a key that joins the merged words with "|" (for example
# "test|tests|testing") and the plain base-word key is removed. Base words
# without any counted derived form are left untouched.
#
# Destructive: the merged keys replace the original ones, so only call this
# after all words have been added.
#
# Returns the array of base words that were examined.
def merge!
  # Order matters: it fixes the order of the parts in the merged key.
  suffixes = %w[s ming ing]
  base_words = @counts.keys.reject { |w| w.end_with?(*suffixes) }

  base_words.each do |base_word|
    merged_word = base_word
    total = @counts[base_word]

    suffixes.each do |suffix|
      derived = base_word + suffix
      # Check the live hash (constant time) rather than a snapshot, so a
      # derived word that an earlier base word already absorbed is not
      # claimed a second time with a phantom zero count.
      next unless @counts.key?(derived)

      total += @counts.delete(derived)
      merged_word += "|#{derived}"
    end

    next if merged_word == base_word

    @counts[merged_word] = total
    @counts.delete(base_word)
  end
end
#preprocess_word(word) ⇒ Object
Ensure it has canonical form
7 8 9 |
# File 'lib/feldtruby/word_counter.rb', line 7
# Bring +word+ into canonical form: lower-cased and without leading or
# trailing whitespace. Returns a new string.
def preprocess_word(word)
  lowered = word.downcase
  lowered.strip
end
#top_words(numberOfWords) ⇒ Object
32 33 34 |
# File 'lib/feldtruby/word_counter.rb', line 32
# Return the +numberOfWords+ most frequent words, most frequent first.
#
# Each element of the result is a [word, count] pair. When fewer words than
# requested have been counted, all of them are returned (previously this
# case raised NoMethodError because the slice evaluated to nil).
def top_words(numberOfWords)
  by_ascending_count = @counts.sort_by { |_word, count| count }
  # Array#last clamps to the array size, unlike arr[-n, n] which is nil
  # when n exceeds the size.
  by_ascending_count.last(numberOfWords).reverse
end
#words ⇒ Object
24 25 26 |
# File 'lib/feldtruby/word_counter.rb', line 24
# List every key currently stored in the counter, in insertion order.
def words
  @counts.each_key.to_a
end