Class: Yawc

Inherits:
Object
  • Object
show all
Defined in:
lib/yawc.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(s, level: 2) ⇒ Yawc

level:

2 strips out ignore words and stop words
3 does the same as level 2, and additionally strips out dictionary words


27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/yawc.rb', line 27

# Builds a word-frequency table for the given text and stores it in @to_h.
#
# @param s [String] the text to analyse
# @param level [Integer] filtering level:
#   2 - keeps whatever #words returns (ignore words and stop words removed);
#   3 - as level 2, and also removes every entry of Words2DotDat.words
# @raise [ArgumentError] if level is neither 2 nor 3
def initialize(s, level: 2)

  @stopwords = Words2DotDat.stopwords

  a = case level
  when 2
    words(s).split
  when 3
    words(s).split - Words2DotDat.words
  else
    # Previously an unknown level left the list nil and failed later with a
    # NoMethodError on nil; fail early with a message naming the problem.
    raise ArgumentError,
          "unsupported level: #{level.inspect} (expected 2 or 3)"
  end

  # Count the occurrences of each distinct word. Every key is a fresh copy
  # of the word tagged as UTF-8.
  counts = a.group_by(&:to_s).each_with_object({}) do |(word, occurrences), h|
    h[String.new(word).force_encoding("utf-8")] = occurrences.length
  end

  # Highest counts first.
  @to_h = counts.sort_by(&:last).reverse.to_h

end

Instance Attribute Details

#to_h ⇒ Object (readonly)

Returns the value of attribute to_h.



21
22
23
# File 'lib/yawc.rb', line 21

# Reader for the @to_h instance variable, which the constructor fills with
# the word => count table.
#
# @return [Hash, nil] the stored table, or nil if @to_h has not been set
def to_h; @to_h; end

Instance Method Details

#words(s) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/yawc.rb', line 46

# Normalises raw text into a whitespace-separated string of candidate words.
# The text is lower-cased, stripped of punctuation, and has the built-in
# ignore words plus @stopwords removed. The constructor splits the result
# on whitespace.
#
# @param s [String] the raw text
# @return [String] the filtered, lower-cased text
def words(s)

  # words source:  http://norvig.com/mayzner.html

  ignorewords = ["the", "of", "and", "to", "in", "a", "is", "that", "for",
                 "it", "as", "was", "with", "be", "by", "on", "not", "he",
                 "i", "this", "are", "or","his", "from", "at", "which",
                 "but", "have", "an", "had", "they", "you", "were",
                 "their", "one", "all", "we", "can", "her", "has", "there",
                 "been", "if", "more", "when", "will", "would", "who",
                 "so", "no"]

  # Escape every entry so that a word containing regexp metacharacters is
  # matched literally instead of altering (or breaking) the pattern.
  # Array() tolerates @stopwords not having been set.
  excluded = (ignorewords + Array(@stopwords)).
               map { |w| Regexp.escape(w.to_s) }.join('|')

  text = s.downcase

  # remove words containing an apostrophe
  text = text.gsub(/\w+'\w+/, '')

  # remove quotation marks
  text = text.gsub(/["']/, '')

  # drop the non-alphabetical characters in front of a word and pad the
  # word with spaces
  text = text.gsub(/\W[^a-z|#]*(\w+)/, ' \1 ')

  # remove any single character standing alone between whitespace
  text = text.gsub(/\s.\s/, ' ')

  # ignore common words
  text = text.gsub(/\b(?:#{excluded})\b/, '')

  # remove any other items which are not words or hashtags; the original
  # pattern quantified the trailing \B anchor, which adds nothing to the match
  text.gsub(/\B[^\w#]\B/, '')

end