Class: Yawc

Inherits:
Object
  • Object
show all
Defined in:
lib/yawc.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(s, level: 2) ⇒ Yawc

level:

2 strips out the built-in ignore words and the stop words
3 does the same as level 2 and additionally strips out dictionary words


695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
# File 'lib/yawc.rb', line 695

# Builds a word-frequency table for the given text.
#
# The text is cleaned by #words, split on whitespace, optionally filtered,
# and counted. The result is stored in @to_h, ordered from the most
# frequent word to the least frequent one.
#
# @param s [String] the text to analyse
# @param level [Integer] filtering level:
#   2 keeps every token returned by #words (ignore words and stop words
#   are already stripped there);
#   3 additionally removes every token found in WordsDotDat.words
# @raise [ArgumentError] if level is neither 2 nor 3
def initialize(s, level: 2)
  # Fail early with a clear message instead of a NoMethodError on nil later on.
  unless [2, 3].include?(level)
    raise ArgumentError, "unsupported level: #{level.inspect} (expected 2 or 3)"
  end

  # One stop word per line of the STOPWORDS constant.
  @stopwords = STOPWORDS.strip.lines.map(&:chomp)

  tokens = words(s).split
  tokens -= WordsDotDat.words if level == 3

  # Count occurrences in a single pass; keys are copied and tagged as UTF-8.
  counts = tokens.group_by(&:to_s).each_with_object({}) do |(word, occurrences), tally|
    tally[String.new(word).force_encoding("utf-8")] = occurrences.length
  end

  # Most frequent words first.
  @to_h = counts.sort_by(&:last).reverse.to_h
end

Instance Attribute Details

#to_h ⇒ Object (readonly)

Returns the value of attribute to_h.



689
690
691
# File 'lib/yawc.rb', line 689

# Returns the word-frequency table stored in @to_h by the constructor
# (word => number of occurrences, most frequent word first).
#
# @return [Hash] the value of @to_h
def to_h
  @to_h
end

Instance Method Details

#words(s) ⇒ Object



715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
# File 'lib/yawc.rb', line 715

# Cleans raw text so that it can be split into candidate words.
#
# The text is lower-cased and passed through a fixed series of
# substitutions (described inline). The result is a single string that may
# contain runs of spaces; the constructor calls +split+ on it.
#
# Reads @stopwords, which is set by the constructor.
#
# @param s [String] the raw text
# @return [String] the cleaned, lower-cased text
def words(s)
  # words source:  http://norvig.com/mayzner.html
  ignorewords = %w[the of and to in a is that for
                   it as was with be by on not he
                   i this are or his from at which
                   but have an had they you were
                   their one all we can her has there
                   been if more when will would who
                   so no]

  # Regexp.union escapes each entry, so a stop word containing a regexp
  # metacharacter (e.g. "e.g") is matched literally.
  common = Regexp.union(ignorewords + @stopwords).source

  text = s.downcase

  # remove words containing an apostrophe
  text = text.gsub(/\w+'\w+/, '')

  # remove any remaining quotation marks
  text = text.gsub(/["']/, '')

  # a non-word character followed by one or more characters that are not
  # a-z, "|" or "#" and then a word: keep only the word, padded with spaces
  text = text.gsub(/\W[^a-z|#]+(\w+)/, ' \1 ')

  # drop any single character (not only digits) standing between two
  # whitespace characters
  text = text.gsub(/\s.\s/, ' ')

  # remove the common words and the stop words
  text = text.gsub(/\b(?:#{common})\b/, '')

  # remove characters that are neither word characters nor "#" and that do
  # not touch a word character on either side
  text.gsub(/\B[^\w#]\B/, '')
end