Class: FeldtRuby::WordCounter

Inherits:
Object
  • Object
show all
Defined in:
lib/feldtruby/word_counter.rb

Direct Known Subclasses

NgramWordCounter

Constant Summary collapse

# Lower-case English words that are never counted. Entries must not carry
# surrounding whitespace, because they are compared against already
# stripped words with Array#include?.
StopWords =
["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]

Instance Method Summary collapse

Constructor Details

#initialize ⇒ WordCounter

Returns a new instance of WordCounter.



2
3
4
# File 'lib/feldtruby/word_counter.rb', line 2

# Sets up the word => occurrences table. The table's default value is 0,
# so looking up a word that was never counted yields 0 instead of nil.
def initialize
  @counts = {}
  @counts.default = 0
end

Instance Method Details

#count(word) ⇒ Object



28
29
30
# File 'lib/feldtruby/word_counter.rb', line 28

# Returns the value stored in the counts table for +word+, after the word
# has been run through preprocess_word.
def count(word)
  key = preprocess_word(word)
  @counts[key]
end

#count_word(word) ⇒ Object



11
12
13
14
# File 'lib/feldtruby/word_counter.rb', line 11

# Records one occurrence of +word+ (after preprocess_word) unless it is a
# stop word. Returns the word's updated count, or nil when the word was
# skipped as a stop word.
def count_word(word)
  canonical = preprocess_word(word)
  return nil if is_stop_word?(canonical)

  @counts[canonical] += 1
end

#count_words(string) ⇒ Object



20
21
22
# File 'lib/feldtruby/word_counter.rb', line 20

# Splits +string+ into words with invidual_words_in_string and feeds each
# one to count_word. Returns the array of count_word results, one entry per
# word, in the order the words appeared.
def count_words(string)
  tokens = invidual_words_in_string(string)
  tokens.map { |token| count_word(token) }
end

#invidual_words_in_string(str) ⇒ Object



16
17
18
# File 'lib/feldtruby/word_counter.rb', line 16

# Splits +str+ into lower-cased words. Any run of characters other than
# word characters (\w) and "-" acts as a separator.
#
# String#split keeps a leading empty field when the string begins with a
# separator (e.g. " hello"), so empty tokens are dropped here; otherwise
# the empty string would be handed on as if it were a word.
#
# NOTE(review): apostrophes are separators too, so a contraction such as
# "don't" is returned as "don" and "t".
#
# @param str [String] the text to split
# @return [Array<String>] the non-empty lower-cased words, in order
def invidual_words_in_string(str)
  str.downcase.split(/[^\w-]+/).reject(&:empty?)
end

#is_stop_word?(word) ⇒ Boolean

Returns:

  • (Boolean)


38
39
40
# File 'lib/feldtruby/word_counter.rb', line 38

# Reports whether +word+ equals one of the entries of StopWords.
# The comparison is exact: no stripping or case folding is done here.
def is_stop_word?(word)
  StopWords.any? { |stop_word| stop_word == word }
end

#merge! ⇒ Object

Merge words together that are plural or -ing (or -ming) forms of each other. Destructive, so only use this after all words have been added.



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/feldtruby/word_counter.rb', line 44

# Merges each base word with its plural ("s"), "-ming" and "-ing" forms
# when those forms are present as separate entries in the counts table.
#
# A base word is any counted word that does not itself end in "s" or "ing"
# (which also covers "ming"). For every base word, each existing variant
# is removed from the table and its count is added to the base word's
# count; the base entry is then replaced by a single entry whose key joins
# the merged forms with "|", e.g. "program|programs|programming".
#
# Destructive: keys are rewritten in place, so call this only after all
# words have been added.
def merge!
  words = @counts.keys
  base_words = words.reject { |w| w.end_with?("s", "ing") }

  # Snapshot of the keys present before any merging. Looking variants up
  # here instead of scanning arrays keeps each check constant-time, and
  # using the snapshot (not the live table) means keys created or deleted
  # during the loop do not influence later matches.
  known = {}
  words.each { |w| known[w] = true }

  base_words.each do |base_word|
    merged_word = base_word
    count = @counts[base_word]

    # The suffix order fixes the order of the parts in the merged key.
    ["s", "ming", "ing"].each do |suffix|
      variant = base_word + suffix
      next unless known.key?(variant)

      count += @counts[variant]
      @counts.delete(variant)
      merged_word += "|#{variant}"
    end

    # Only rewrite the entry when at least one variant was folded in.
    if merged_word != base_word
      @counts[merged_word] = count
      @counts.delete(base_word)
    end
  end
end

#preprocess_word(word) ⇒ Object

Ensure the word is in canonical form (surrounding whitespace stripped, lower-cased).



7
8
9
# File 'lib/feldtruby/word_counter.rb', line 7

# Puts +word+ into the canonical form used as a key in the counts table:
# surrounding whitespace removed, then lower-cased.
def preprocess_word(word)
  trimmed = word.strip
  trimmed.downcase
end

#top_words(numberOfWords) ⇒ Object



32
33
34
# File 'lib/feldtruby/word_counter.rb', line 32

# Returns the most frequent words as [word, count] pairs, highest count
# first.
#
# At most +numberOfWords+ pairs are returned. When fewer words have been
# counted, all of them are returned; slicing with a negative start index
# would yield nil in that case and make the following +reverse+ raise.
#
# @param numberOfWords [Integer] maximum number of pairs to return
# @return [Array<Array>] [word, count] pairs in descending count order
def top_words(numberOfWords)
  ascending = @counts.to_a.sort_by { |_word, occurrences| occurrences }
  ascending.last(numberOfWords).reverse
end

#words ⇒ Object



24
25
26
# File 'lib/feldtruby/word_counter.rb', line 24

# Returns a new array holding every key currently in the counts table,
# in the table's insertion order.
def words
  @counts.each_key.to_a
end